```{r}
library(knitr, quietly = TRUE) # need to do this in order to access general-purpose functions
library(soilReports, quietly = TRUE)

# package options
opts_knit$set(message = FALSE, warning = FALSE, verbose = FALSE, progress = FALSE)

# chunk options
# R session options
options(width = 100, stringsAsFactors = FALSE)

### SANITY CHECK #1 - make sure user has run reportSetup recently
### check if needed CRAN packages are installed
packz <- c("MASS", "terra", "data.table", "latticeExtra", "cluster", "clhs", "randomForest", "RColorBrewer", "spdep")
newpackz <- packz[!(packz %in% installed.packages()[, "Package"])]
loaded <- sapply(packz, FUN = require, character.only = TRUE, quietly = TRUE)

if (sum(as.numeric(loaded)) != length(packz)) {
  stop("Failed to load one or more required packages! Be sure you have the latest version of the soilReports package from GitHub. Then run `soilReports::reportSetup('southwest/mu-comparison')` to install all required packages. Use reportUpdate() to ensure you have the latest version of the report.", call. = FALSE)
  geterrmessage()
}

### SANITY CHECK #2: make sure required report files are present
required_src_files <- c("custom.R", "config.R", "categorical_definitions.R")
required_src <- required_src_files %in% list.files()

if (any(!required_src)) {
  stop(sprintf("Required report files (%s) are missing in current directory (%s). Re-install your report instance with `soilReports::reportInit()`",
               paste0(required_src_files[!required_src], collapse = ","), getwd()), call. = FALSE)
}

## load report-specific functions
source('custom.R')

## load local configuration
# TODO: allow entry of simple fields interactively if they are not specified?
# TODO: allow for batching from a basic report.Rmd
source('config.R')

# load the structures that define value:label:color for categorical plots
# this defines the aliases and label:value:color sets for displaying categorical variables
categorical.defs <- list()
source("categorical_definitions.R")
# updates the "section_toggle" variable defined at top of report
# which will knit the appropriate plots for the specified categoricals from config.R

### SANITY CHECK #3 - make sure all top-level elements of raster.list are "allowed"
if (!all(names(raster.list) %in% c("continuous", "categorical", "circular"))) {
  stop('Check formatting of config.R raster list.\nAllowed top-level elements in `raster.list` include: "continuous", "categorical", "circular".', call. = FALSE)
}

### SANITY CHECK #4 - make sure the shapefile to be summarized exists and has the specified column
## load map unit polygons from OGR data source
mu <- try(terra::vect(mu.dsn, layer = mu.layer))

if (inherits(mu, 'try-error'))
  stop(paste0('Cannot read map unit polygon/feature file: "', mu.dsn, ' / ', mu.layer, '"'), call. = FALSE)

if (!(mu.col %in% names(mu)))
  stop(paste0('Cannot find map unit column (', mu.col, ') in attribute table of: "', mu.dsn, ' / ', mu.layer, '"'), call. = FALSE)

if (terra::crs(mu) == "") {
  stop("CRS of input polygon file is undefined.", call. = FALSE)
}

if (terra::is.lonlat(mu)) {
  stop("CRS of input polygon file must be projected (meters), not geographic (longitude/latitude) for constant density sampling.", call. = FALSE)
}

### SANITY CHECK #5 - make sure all raster/input file paths exist and are readable
### NB: this might not give correct results for parallel computation
accessible.inputs <- file.access(as.character(unlist(raster.list)), mode = 4) + 1
# file.access returns 0 for success and -1 for failure

if (any(accessible.inputs == 0)) {
  unreadable.files <- names(which(accessible.inputs == 0))
  stop(paste0("The following input files either do not exist or are unreadable: \n", paste0(unreadable.files, collapse = ", ")))
}

### -----
# TODO: this type of if(exists(obj)) needs to wrap all references to report parameters
if (exists('mu.set')) {
  # coerce mu.set to character just in case integers were specified
  mu.set <- as.character(mu.set)

  # check to see if the mu.set specifies musyms that are not in the spatial layer
  mu.set.bak <- mu.set
  mu_nosp <- !(mu.set %in% mu[[mu.col]][[1]])

  # check if any of the predefined musyms are absent from the spatial layer
  if (any(mu_nosp))
    mu.set <- mu.set[-which(mu_nosp)] # if so, remove them

  # if we removed everything (e.g. no musyms match due to wrong config file?), fail gracefully
  if (length(mu.set) == 0) {
    stop(paste0("Cannot find map unit polygons with symbol: ", paste0(mu.set.bak, collapse = ", ")))
  }

  # if mu.set defined in config.R, only keep the features with musym matching set
  mu <- mu[which(mu[[mu.col]][[1]] %in% mu.set), ]
} else {
  # if mu.set is not predefined, ordering is determined by the order of musyms in the source file
  mu.set <- sort(unique(mu[[mu.col]][[1]]))
}

### -----
### SANITY CHECK #6 - make sure output paths are valid (no illegal characters) ###
# make an output directory if it doesn't exist
if (!dir.exists('output'))
  dir.create('./output')

# shapefile/CSV output names;
# NB: writeOGR uses 'output' as the data source name, so just the base file name
# whereas the CSV file names end in .csv
if (!exists('shp.unsampled.fname'))
  shp.unsampled.fname <- paste0('un-sampled-', paste(mu.set, collapse = '_'))

if (!exists('shp.stats.fname'))
  shp.stats.fname <- paste0('polygons-with-stats-', paste(mu.set, collapse = '_'))

if (!exists('shp.qc.fname'))
  shp.qc.fname <- paste0('poly-qc-', paste(mu.set, collapse = '_'))

if (!exists('csv.mucol.stats.fname'))
  csv.mucol.stats.fname <- paste0('mucol-stats-', paste(mu.set, collapse = '_'), '.csv')

if (!exists('csv.stats.fname'))
  csv.stats.fname <- paste0('poly-stats-', paste(mu.set, collapse = '_'), '.csv')

if (!exists('csv.qc.fname'))
  csv.qc.fname <- paste0('poly-qc-', paste(mu.set, collapse = '_'), '.csv')

outputfiles <- c(paste0(c(shp.unsampled.fname, shp.stats.fname, shp.qc.fname), ".shp"), csv.qc.fname, csv.stats.fname)

# SANITY CHECK #7: let the user know that things might not quite look right with tons of mapunit symbols
if (length(mu.set) > 8) {
  warning(sprintf("This report was designed to work with 1 to 8 map units. Found %s in `mu.set`. The report tabular and spatial output is stable for larger numbers of map units, but graphics and colors in the report itself may become cluttered or impossible to interpret.", length(mu.set)), call. = FALSE)
}

# SANITY CHECK #8: default names do not exceed Windows path length limits
if (any(nchar(outputfiles) > 260)) {
  # shapefile containing any unsampled polygons (usually too small or odd shape)
  shp.unsampled.fname <- 'un-sampled-polygons'

  # shapefile containing median values / most likely classes by delineation
  shp.stats.fname <- 'polygons-with-stats'

  # shapefile containing "proportion of samples outside 5-95% quantile range" by delineation
  shp.qc.fname <- 'poly-qc'

  # comma-separated value file containing quantiles by mapunit column level
  csv.mucol.stats.fname <- 'mucol-stats.csv'

  # comma-separated value file containing median values / most likely classes by delineation
  csv.stats.fname <- 'poly-stats.csv'

  # comma-separated value file containing "proportion of samples outside 5-95% quantile range" by delineation
  csv.qc.fname <- 'poly-qc.csv'

  outputfiles <- c(paste0(c(shp.unsampled.fname, shp.stats.fname, shp.qc.fname), ".shp"), csv.qc.fname, csv.stats.fname)

  warning(sprintf("One or more SHP or output file paths exceeds 260 character limit when including all `mu.col` symbols (n = %s) in the name. Reverting to shorter (non-specific) file names. See options in config.R to customize output file names.", length(mu.set)), call. = FALSE)
}

if (any(grepl(basename(outputfiles), pattern = '([/\\|<>:\\*?\"])'))) {
  stop("Map unit set or output file name contains invalid characters for filename. Either override default output filenames in config.R or remove [/\\|<>:\\*?\"] characters from your map unit symbols.", call. = FALSE)
}

### -----
mu$pID <- 1:nrow(mu)

if (cache.samples & file.exists('cached-samples.Rda')) {
  message('Using cached raster samples...')
  .sampling.time <- 'using cached samples'
  load('cached-samples.Rda')
} else {
  .timer.start <- Sys.time()

  musub <- subset(mu, mu[[mu.col]][[1]] %in% mu.set)

  # determine sampling locations
  loc <- do.call('rbind', lapply(mu.set, function(musym) {
    musubsub <- subset(musub, musub[[mu.col]][[1]] %in% musym)
    terra::spatSample(
      musubsub,
      size = pmax(5, sum(terra::expanse(musubsub) * 0.000247 * pts.per.acre)),
      method = "regular"
    )
  }))

  # extract values from raster
  .do_raster_samples <- function(raster.list, musub, loc) {
    step1 <- rapply(raster.list, how = "replace", function(path) {
      r <- rast(path)
      levels(r) <- NULL

      if (!terra::same.crs(r, musub)) {
        musub <- terra::project(musub, crs(r))
      }

      if (!terra::same.crs(r, loc)) {
        loc2 <- terra::project(loc, crs(r))
      } else {
        loc2 <- loc
      }

      # smp <- data.table::rbindlist(
      #   ## TODO: this approach is an issue if rasters have heterogeneous grid system
      #   ## would require an alternate approach for some parts of downstream
      #   ## (i.e. multivariate summary)
      #   # exactextractr::exact_extract(
      #   #   r,
      #   #   musub,
      #   #   include_cols = c("pID", mu.col),
      #   #   force_df = TRUE
      #   # )
      # )

      smp <- cbind(as.data.frame(loc2)[c("pID", mu.col)], terra::extract(r, loc2))
      colnames(smp)[colnames(smp) == names(r)] <- "value"
      smp$sID <- seq(nrow(smp))
      smp$ID <- NULL
      smp
    })

    # create long format samples
    step2 <- data.table::rbindlist(lapply(names(raster.list), function(k) {
      data.table::rbindlist(lapply(names(step1[[k]]), function(n) {
        res <- step1[[k]][[n]]
        res$variable <- n
        res$variable.type <- k
        res
      }))
    }))

    step2
  }

  sampling.res <- list()
  sampling.res$raster.samples <- .do_raster_samples(raster.list, musub, loc)

  .do_area_stats <- function(musub, samples) {
    res <- aggregate(
      terra::expanse(musub) * 0.000247,
      by = list(musub[[mu.col]][[1]]),
      FUN = function(x) c(round(quantile(x, probs = c(0, 0.05, 0.25, 0.5, 0.75, 0.95, 1))), round(sum(x)), length(x))
    )
    colnames(res[[2]]) <- c("Min", "Q5", "Q25", "Median", "Q75", "Q95", "Max", "Total Area", "Polygons")

    res2 <- aggregate(samples$pID, by = list(samples[[mu.col]]), FUN = function(x) {
      c(length(x), sum(!x %in% musub$pID))
    })
    colnames(res2[[2]]) <- c("Samples", "Polygons Not Sampled")

    res3 <- type.convert(as.data.frame(cbind(res[[1]], res[[2]], res2[[2]])), as.is = TRUE)
    res3$`Mean Sample Dens.` <- signif(res3$`Samples` / res3$`Total Area`, 3)
    colnames(res3)[1] <- mu.col
    res3[match(res3[[mu.col]], mu.set), ]
  }

  sampling.res$area.stats <- .do_area_stats(musub, sampling.res$raster.samples)

  sampling.res$unsampled.ids <- unique(sampling.res$raster.samples$pID[!sampling.res$raster.samples$pID %in% musub$pID])

  if (correct.sample.size) {
    message("Estimating effective sample size...")
    MIl <- lapply(
      lapply(raster.list$continuous, terra::rast),
      function(r, mu.extent = NULL, n = NULL, k = NULL, # do.correlogram = FALSE,
               cor.order = 5, crop.raster = TRUE) {
        if (!requireNamespace("terra"))
          stop("please install the `terra` package")

        if (!requireNamespace("spdep"))
          stop("please install the `spdep` package")

        if (is.null(mu.extent)) {
          mu.extent <- terra::project(terra::as.polygons(terra::ext(r), crs = terra::crs(r)), terra::crs(r))
        }

        if (crop.raster && !is.null(mu.extent)) {
          mu.extent <- terra::project(terra::as.polygons(terra::ext(mu.extent), crs = terra::crs(mu.extent)), terra::crs(r))
          r <- terra::crop(r, mu.extent)
        }

        if (is.null(n)) {
          n <- ifelse(terra::ncell(r) < 10000, terra::ncell(r), 10000)
        }

        if (is.null(k)) {
          k <- 5
        }

        s <- suppressWarnings(terra::spatSample(r, size = n, na.rm = TRUE, as.points = TRUE))

        s.n <- spdep::knearneigh(as(s, "Spatial"), k = k)
        s.nb <- spdep::knn2nb(s.n)
        s.listw <- spdep::nb2listw(s.nb)

        # if (do.correlogram) {
        #   s.sp <- spdep::sp.correlogram(s.nb, s[[1]][[1]], order = cor.order, method = "I")
        # } else {
        #   s.sp <- NULL
        # }

        res <- spdep::moran.test(as.numeric(s[[1]][[1]]), s.listw, rank = TRUE, randomisation = FALSE)

        # if (do.correlogram)
        #   return(list(I = res$estimate[1], correlogram = s.sp))

        res$estimate[1]
      }
    )

    MI <- do.call('rbind', lapply(names(MIl), function(x) data.frame(Variable = x, Moran.I = MIl[[x]])))
    rownames(MI) <- NULL
  } else {
    MI <- data.frame(Variable = names(raster.list$continuous), Moran.I = 0)
  }

  .do_raster_summary <- function(raster.list, musub) {
    e.mu <- terra::as.polygons(terra::ext(musub), crs = terra::crs(musub))
    data.table::rbindlist(lapply(raster.list, function(x) {
      data.frame(
        Variable = names(x),
        File = unlist(x),
        Resolution = sapply(x, function(y) terra::res(terra::rast(y))[1]),
        inMemory = FALSE,
        ContainsMU = rapply(x, f = function(r) {
          r <- terra::rast(r)
          e.r <- terra::as.polygons(terra::ext(r), crs = terra::crs(r))
          e.mu.r <- terra::project(e.mu, terra::crs(e.r))
          return(terra::relate(e.r, e.mu.r, "contains")[1])
        })
      )
    }))
  }

  sampling.res$raster.summary <- .do_raster_summary(raster.list, musub)

  sampling.res$mu.validity.check <- data.frame(
    id = musub[[mu.col]],
    Polygon.Validity = terra::is.valid(musub),
    stringsAsFactors = FALSE
  )

  sampling.res$Moran_I <- MI

  .timer.stop <- Sys.time()
  .sampling.time <- format(difftime(.timer.stop, .timer.start, units = 'mins'), digits = 2)
  print(paste0("Completed sampling of raster stack for poly symbols (", paste(mu.set, collapse = ","), ") in ", .sampling.time, "."))

  sampling.res$raster.samples[[mu.col]] <- factor(sampling.res$raster.samples[[mu.col]], levels = mu.set)

  print("DONE!")

  if (cache.samples)
    save(mu, sampling.res, file = 'cached-samples.Rda')
}

## subset raster samples by data type: continuous, categorical, circular
d.continuous <- subset(sampling.res$raster.samples, variable.type == 'continuous')
d.continuous$variable <- factor(d.continuous$variable, levels = names(raster.list$continuous))

d.circ <- subset(sampling.res$raster.samples, variable.type == 'circular')
d.cat <- subset(sampling.res$raster.samples, variable.type == 'categorical')

### SUBSET CATEGORICAL VARIABLES
d.cat[[mu.col]] <- factor(d.cat[[mu.col]], levels = mu.set)
d.cat.sub <- list()

do.continuous <- isTRUE(nrow(d.continuous) > 0)
do.aspect <- isTRUE(nrow(d.circ) > 0)
do.categorical <- isTRUE(nrow(d.cat) > 0)

### FORMATTING
## figure out reasonable figure heights for bwplots and density plots
# baseline height for figure margins, axes, and legend
min.height <- 2

# height required for each panel
panel.height <- 2 * length(levels(d.continuous$variable))

# extra height for each ID
id.height <- 0.25 * length(levels(d.continuous[[mu.col]]))

dynamic.fig.height <- min.height + panel.height + id.height

# nice colors
cols <- makeNiceColors(length(mu.set))

# TODO: Null out raster samples to save memory TOGGLE
## http://adv-r.had.co.nz/memory.html#memory
# sampling.res$raster.samples <- NULL

# TODO / DEBUGGING: report on invalid geometries that could be interfering with sampling
# * https://github.com/ncss-tech/sharpshootR/commit/9ada343fea46d18f7c2e10c12a7dc1cbfb71679f
#
# results in:
# sampling.res$mu.validity.check
```
Map units (`r mu.col`): `r paste(mu.set, collapse = ", ")`

Report version: `r .report.version`

Generated: `r format(Sys.time(), "%Y-%m-%d")`
This report is designed to provide statistical summaries of the environmental properties for one or more map units. Summaries are based on raster data extracted from fixed-density sampling of map unit polygons. Percentiles are used as robust metrics of distribution central tendency and spread. Please see the document titled R-Based Map Unit Summary Report Introduction and Description for background and setup.
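To make the sampling and summary approach concrete, here is a minimal sketch of the same idea outside the report: sample a polygon layer at a fixed density, extract one raster, and summarize with percentiles. The file names and the density value are placeholders, not values from this report's `config.R`.

```r
# Minimal sketch of fixed-density sampling + percentile summary (hypothetical inputs)
library(terra)

mu <- terra::vect("mu-polygons.shp")   # placeholder polygon source
r  <- terra::rast("slope.tif")         # placeholder continuous raster

# roughly 1 point per acre: expanse() returns m^2, 0.000247 converts m^2 to acres
n   <- max(5, round(sum(terra::expanse(mu) * 0.000247) * 1))
pts <- terra::spatSample(mu, size = n, method = "regular")

# extract raster values at the sample points and summarize with percentiles
vals <- terra::extract(r, terra::project(pts, terra::crs(r)))[, 2]
quantile(vals, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), na.rm = TRUE)
```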
fd <- data.frame(`MU Polygons` = mu.dsn, `File or Feature` = mu.layer) kable(fd, row.names = FALSE)
kable(sampling.res$raster.summary, row.names = FALSE, digits = 3)
Target sampling density: `r pts.per.acre` points/ac. (defined in `config.R`). Consider increasing it if there are unsampled polygons or if the number of samples is less than about 200. Note that the mean sampling density (per polygon) will always be slightly lower than the target sampling density, depending on polygon shape.
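For reference, the per-map-unit sample count follows directly from the target density and total polygon area; a small worked example with made-up numbers (the report enforces a minimum of 5 samples per map unit):

```r
# How the sample count relates to the target density (values are hypothetical)
area.m2      <- 2500000                 # total polygon area for one map unit, in m^2
area.ac      <- area.m2 * 0.000247      # convert m^2 to acres (~ 617 ac.)
pts.per.acre <- 2                       # hypothetical target density from config.R

n.samples <- max(5, round(area.ac * pts.per.acre))  # at least 5 samples per map unit
n.samples                                           # ~ 1235 points
```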
kable(sampling.res$area.stats, caption='Map Unit Acreage by Polygon', align = 'r', col.names = c(mu.col, names(sampling.res$area.stats)[-1]))
Whiskers extend from the 5th to 95th percentiles, the body represents the 25th through 75th percentiles, and the dot is the 50th percentile (median). Box width is proportional to the number of samples per map unit (i.e., total map unit area). Notches (if enabled) represent an approximate confidence interval around the median, adjusted for spatial autocorrelation; overlapping notches suggest that median values are not significantly different. This feature can be enabled by setting `correct.sample.size = TRUE` in `config.R`.
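The notch width depends on an effective sample size that accounts for spatial autocorrelation. The sketch below shows a common first-order correction based on Moran's I; it is an assumption about the kind of adjustment `custom.bwplot()` (defined in `custom.R`) applies, not a verified excerpt of that function.

```r
# Hedged sketch: effective sample size under positive spatial autocorrelation
n       <- 500    # raw number of raster samples for one map unit (hypothetical)
moran.i <- 0.6    # Moran's I estimated from the raster (hypothetical)

# a standard first-order correction; fewer "independent" samples when I is large
n.eff <- n * (1 - moran.i) / (1 + moran.i)
n.eff  # 125

# a boxplot notch half-width is typically 1.58 * IQR / sqrt(n);
# substituting n.eff widens the notches when autocorrelation is strong
```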
Suggested usage:
tps <- list(box.rectangle = list(col = 'black'), box.umbrella = list(col = 'black', lty = 1), box.dot = list(cex = 0.75), plot.symbol = list(col = rgb(0.1, 0.1, 0.1, alpha = 0.25, maxColorValue = 1), cex = 0.25)) # NOTE: notches rely on effective sampling size bwplot(as.formula(sprintf("%s ~ value | variable", mu.col)), data = d.continuous, scales = list(y = list(alternating = 1), x = list(relation = 'free', tick.number = 10)), as.table = TRUE, col = 'black', strip = strip.custom(bg = grey(0.85)), xlab = '', par.settings = tps, subscripts = TRUE, layout = c(1, length(levels(d.continuous$variable))), varwidth = TRUE, panel = function(x, subscripts=subscripts, ...) { # extract the current raster name this.raster <- as.character(unique(d.continuous$variable[subscripts])) # get associated Moran's I idx <- which(sampling.res$Moran_I$Variable == this.raster) this.Moran.I <- sampling.res$Moran_I$Moran.I[idx] # make a grid panel.grid(h = 0, v = -1, col = 'grey', lty = 3) panel.abline(h = 1:length(unique(d.continuous[[mu.col]])), col = 'grey', lty = 3) # boxplots with custom sampling size: # coef: Moran's I associated with this raster panel.bwplot(x, stats = custom.bwplot, notch = correct.sample.size, coef = this.Moran.I, ...) })
These plots are a smooth alternative (density estimation) to the classic "binned" (histogram) approach to visualizing distributions. Peaks correspond to values that are most frequent within a data set. Each data set (ID / variable combination) is rescaled to [0, 1] so that the y-axis can be interpreted as the "relative proportion of samples".
Suggested usage:
## TODO: consider variable line width proportional to area ## https://github.com/ncss-tech/soilReports/issues/81 # var.lwd <- scales::rescale(log(sampling.res$area.stats$`Total Area`), to=c(0.5, 2.5)) tps <- list(superpose.line = list(col = cols, lwd = 2, lend = 2)) # dynamic setting of columns in legend n.cols <- ifelse(length(mu.set) <= 4, length(mu.set), 5) # scaling of density curves # just in case this isn't in the config file, set a default value here if (!exists('scaleDensityCurves')) scaleDensityCurves <- TRUE # compute densities and optionally re-scale to {0,1} density.plot.data <- data.frame(data.table::data.table(d.continuous)[, scaled.density(.SD), by = c(mu.col, 'variable')]) density.plot.data$.id <- density.plot.data[[mu.col]] xyplot(y ~ x | variable, groups = .id, data = density.plot.data, xlab = '', ylab = 'Relative Proportion', scales = list(relation = 'free', x = list(tick.number = 10), y = list(at = NULL)), plot.points = FALSE, strip = strip.custom(bg = grey(0.85)), as.table = TRUE, layout = c(1, length(levels(d.continuous$variable))), auto.key = list(lines = TRUE, points = FALSE, columns = n.cols, cex = 0.8), par.settings = tps, type = c('l','g')) rm(density.plot.data)
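The `scaled.density()` helper used above comes from `custom.R`; the sketch below shows the basic idea of peak-normalizing a density estimate so the y-axis spans [0, 1]. Details (bandwidth, the exact rescaling) may differ from the helper itself.

```r
# Hedged sketch: peak-normalized density estimate (synthetic data)
x <- rnorm(500, mean = 15, sd = 4)   # hypothetical raster samples for one map unit
d <- density(x, na.rm = TRUE)
d$y <- d$y / max(d$y)                # rescale so the curve peaks at 1
plot(d, main = "Relative proportion of samples")
```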
Table of select percentiles, by variable. In these tables, headings like "Q5" can be interpreted as the "5th percentile"; 5% of the data are less than this value. The 50th percentile ("Q50") is the median.
```{r}
# summarize raster data for tabular output
mu.stats <- data.table::data.table(d.continuous)[, f.summary(.SD, p = p.quantiles), by = c('variable', mu.col)]

# print medians
dg <- c(0, rep(2, times = length(unique(mu.stats$variable))))
mu.stats.wide <- data.table::dcast(data.table::setDT(mu.stats), as.formula(sprintf("%s ~ variable", mu.col)), value.var = 'Q50')

kable(mu.stats.wide, row.names = FALSE, caption = 'Median Values', align = 'r', digits = dg, col.names = c(mu.col, names(mu.stats.wide)[-1]))
```
muv <- split(mu.stats, mu.stats$variable) l <- lapply(seq_along(muv), function(i) { # remove variable column var.name <- unique(muv[[i]]$variable) if (length(var.name) == 0 || is.null(var.name) || is.na(var.name)) { return(NULL) } muv[[i]]$variable <- NULL dg <- c(0, rep(2, times = length(p.quantiles)), 3) # note: chunk option results='asis' print(kable( muv[[i]], caption = var.name, row.names = FALSE, align = 'r', digits = dg, col.names = c(mu.col, names(muv[[i]])[-1]) )) })
A graphical summary of slope aspect values using density and percentile estimation methods adapted to circular data. Spread and central tendency are depicted with a combination of (circular) kernel density estimate (dashed blue lines) and arrows. The 50th percentile value is shown with a red arrow and the 10th and 90th percentile values are shown with gray arrows. Arrow length is proportional to the strength of directionality. Use the figures and table below to determine "clockwise" / "counter clockwise" values for NASIS component records.
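For orientation, here is a minimal sketch of circular summary statistics using the `circular` package (used by sharpshootR); the report itself relies on `sharpshootR::aspect.plot()`, shown in the next chunk. The aspect values below are made up.

```r
# Hedged sketch: circular mean, quantiles, and directionality for aspect (degrees)
library(circular)

aspect <- c(350, 10, 25, 5, 355, 15, 30, 345)  # hypothetical aspect samples near north
a <- circular(aspect, units = "degrees", template = "geographics")

mean(a)                                # circular mean is ~ 7 degrees, not the arithmetic ~ 142
quantile(a, probs = c(0.1, 0.5, 0.9))  # circular quantiles (cf. Q10 / Q50 / Q90 below)
rho.circular(a)                        # mean resultant length: strength of directionality (0-1)
```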
Suggested usage:
## circular stats, by map unit d.circ.list <- split(d.circ, d.circ[[mu.col]]) # this has to be called 2x, as we are adjusting the device settings on the fly fig.geom <- dynamicPar(length(d.circ.list)) # update default device output size opts_chunk$set(fig.height = fig.geom[1] * 5) # rows opts_chunk$set(fig.width = fig.geom[2] * 5) # cols
# reset multi-figure plotting parameters dynamicPar(length(d.circ.list)) res <- do.call('rbind', lapply(names(d.circ.list), function(n) { i <- d.circ.list[[n]] mu <- unique(i[[mu.col]]) circ.stats <- sharpshootR::aspect.plot( i$value, q = c(0.1, 0.5, 0.9), plot.title = mu, pch = NA, bg = 'RoyalBlue', col = 'black', arrow.col = c('grey', 'red', 'grey'), stack = FALSE, p.bw = 90 ) qs <- as.numeric(circ.stats) us <- attr(circ.stats, "uniformity") us[1] <- signif(us[1], 3) if (us[2] < 1e-8) { us[2] <- "<1e-8" } res <- data.frame(t(c(round(qs), us))) colnames(res) <- c("Q10", "Q50", "Q90") res <- cbind(data.frame(V1=n),res) return(res) })) # tabular summary kable(res, align = 'r', col.names = c(mu.col, names(res)[-1]))
# make categorical summaries for all categorical variables d.cat.list <- split(d.cat, f = d.cat$variable) l <- lapply( d.cat.list, FUN = makeCategoricalOutput, mu.col = mu.col, categorical.defs = categorical.defs )
This plot displays the similarity of the map units across the set of environmental variables used in this report. The contours contain 75% (dotted line), 50% (dashed line), and 25% (solid line) of the points in an optimal 2D projection of multivariate data space. Data from map units with more than 1,000 samples are sub-sampled via cLHS (conditioned Latin hypercube sampling). Map units with very low variation in environmental variables can result in tightly clustered points in the 2D projection. It is not possible to generate a multivariate summary when any sampled variable (e.g. slope) has near-zero variance. See this chapter from the Statistics for Soil Survey NEDS course for a soils-specific introduction to these concepts.
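The cLHS sub-sampling step keeps a representative subset of rows when a map unit contributes many samples. Below is a minimal sketch using `clhs::clhs()` directly on synthetic data; the report wraps this logic in the `cLHS_subset()` helper from `custom.R`, which may differ in details.

```r
# Hedged sketch: conditioned Latin hypercube sub-sampling of raster samples
library(clhs)

set.seed(1)
d <- data.frame(slope = runif(5000, 0, 60),
                elev  = rnorm(5000, 1500, 200),
                twi   = rnorm(5000, 8, 2))   # synthetic raster samples

# keep 50 rows that span the multivariate distribution of the full set
idx    <- clhs(d, size = 50, simple = TRUE, progress = FALSE)
d.keep <- d[idx, ]
```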
Suggested usage:
## TODO: # 1. combine median of continuous, geomorphons proportions, and NLCD proportions for dendrogram # cast to wide format d.mu.wide <- as.data.frame(data.table::dcast(data.table::setDT(d.continuous), as.formula(sprintf("sID + pID + %s ~ variable", mu.col)), value.var = 'value')) # drop rows with NA d.mu.wide <- na.omit(d.mu.wide) # locate "non-id" vars that have non-zero SD # these are safe for cLHs and distance calc # solution for https://github.com/ncss-tech/soilReports/issues/87 d.mu.wide.vars <- findSafeVars(d.mu.wide, id = c(mu.col, 'sID', 'pID')) # must have > 1 variables to perform multivariate summary if (length(d.mu.wide.vars) < 2) { multivariate.summary <- FALSE } else { multivariate.summary <- TRUE ## TODO: what is a reasonable sample size? # only sub-sample if there are "a lot" of samples if (nrow(d.mu.wide) > 1000) { d.sub <- data.frame(data.table::data.table(d.mu.wide)[, cLHS_subset(.SD, n = 50, non.id.vars = d.mu.wide.vars), by = mu.col], check.names = FALSE) } else { d.sub <- d.mu.wide } ## NOTE: data with very low variability will cause warnings # eval numerical distance, removing 'sID' and '.id' columns d.dist <- daisy(d.sub[d.mu.wide.vars], stand = TRUE) ## map distance matrix to 2D space via principal coordinates d.betadisper <- vegan::betadisper( d.dist, group = d.sub[[mu.col]], bias.adjust = TRUE, sqrt.dist = TRUE, type = 'median' ) d.scores <- vegan::scores(d.betadisper) # contour density estimates # add contours for fixed pct of data density using KDE # other ideas: https://stat.ethz.ch/pipermail/r-help/2012-March/305425.html s <- data.frame(x = d.scores$sites[, 1], y = d.scores$sites[, 2], .id = d.sub[[mu.col]]) colnames(s)[3] <- mu.col s <- split(s, s[[mu.col]]) # plot par(mar = c(1, 1, 3, 1)) plot(d.scores$sites, type = 'n', axes = FALSE) abline( h = 0, v = 0, lty = 2, col = 'grey' ) # NOTE: lines are not added if data are too densely spaced for evaluation of requested prob. level # add contours of prob density res <- lapply( s, kdeContours, id = mu.col, prob = c(0.75), cols = cols, m = levels(d.sub[[mu.col]]), lwd = 1, lty = 3 ) res <- lapply( s, kdeContours, id = mu.col, prob = (0.5), cols = cols, m = levels(d.sub[[mu.col]]), lwd = 1, lty = 2 ) res <- lapply( s, kdeContours, id = mu.col, prob = c(0.25), cols = cols, m = levels(d.sub[[mu.col]]), lwd = 2, lty = 1 ) points(d.scores$sites, cex = 0.45, col = cols[as.numeric(d.sub[[mu.col]])], pch = 16) # note special indexing for cases when low-var MU have been removed vegan::ordilabel(d.betadisper, display = 'centroids', col = cols[match(mu.set, levels(d.sub[[mu.col]]))]) title('Ordination of Raster Samples (cLHS Subset) with 25%, 50%, 75% Density Contours') box() }
if (multivariate.summary == FALSE) print('Cannot create ordination plot: at least two raster variables with non-zero variance within map units are required.')
The following figure highlights shared information among raster data sources based on Spearman's rank correlation coefficient. Branch height reflects the degree of shared information between raster data sources.
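The figure is produced with `Hmisc::varclus()` in the next chunk. To make the idea concrete, the sketch below builds the same kind of tree from scratch on synthetic data: variables are clustered on squared Spearman rank correlation, so strongly related rasters join at low heights.

```r
# Hedged sketch: cluster variables by squared Spearman rank correlation (synthetic data)
set.seed(1)
x <- data.frame(elev = rnorm(200, 1500, 200))
x$slope <- x$elev * 0.01 + rnorm(200, 0, 2)   # correlated with elev
x$twi   <- rnorm(200, 8, 2)                   # mostly independent

rho <- cor(x, method = "spearman")
h   <- hclust(as.dist(1 - rho^2))             # similarity based on rho^2, as in varclus()
plot(h, ylab = "1 - Spearman rho^2", main = "Shared information among variables")
```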
Suggested usage:
par(mar = c(2,5,2,2)) ## note that we don't load the Hmisc package as it causes many NAMESPACE conflicts ## This requires 3 or more variables if (length(d.mu.wide.vars) > 3) { try(plot(Hmisc::varclus(as.matrix(d.sub[, d.mu.wide.vars]))), silent = TRUE) } else print('This plot requires three or more raster variables, apart from aspect, curvature class, and geomorphons.')
The following figure ranks raster data sources in terms of how accurately each can be used to discriminate between map unit concepts.
Suggested usage:
```{r}
# this will only work with >= 2 map units and >= 2 variables
# reset factor levels so that empty classes are not passed on to randomForest()
# (will cause error if empty/low-variance MUs are present)
d.sub[[mu.col]] <- factor(d.sub[[mu.col]])

if (length(levels(d.sub[[mu.col]])) >= 2) {
  # use supervised classification to empirically determine the relative importance of each raster layer
  # TODO: include geomorphons and curvature classes
  # TODO: consider using party::cforest() for conditional variable importance -- varimp
  m <- randomForest(x = d.sub[, d.mu.wide.vars], y = d.sub[[mu.col]], importance = TRUE)

  # variable importance
  # TODO: how to interpret raw output from importance:
  # http://stats.stackexchange.com/questions/164569/interpreting-output-of-importance-of-a-random-forest-object-in-r/164585#164585
  varImpPlot(m, scale = TRUE, type = 1, main = 'Mean Decrease in Accuracy')

  # kable(importance(m, scale=FALSE, type=2), digits = 3)
} else {
  # print message about not enough map units
  print('This plot requires two or more map units.')
}
```
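The TODO above mentions `party::cforest()` as a way to get conditional variable importance, which is less biased when predictors are correlated. A hedged sketch of what that could look like, reusing `d.sub` and `d.mu.wide.vars` from the ordination chunk; the `party` package is not among this report's required packages and would need to be installed separately.

```r
# Hedged sketch: conditional variable importance via party::cforest()
# (not part of the report; requires the party package)
library(party)

dat <- data.frame(mu = factor(d.sub[[mu.col]]), d.sub[, d.mu.wide.vars])
cf  <- cforest(mu ~ ., data = dat,
               controls = cforest_unbiased(ntree = 500, mtry = 3))

# conditional = TRUE adjusts for correlated predictors (slower to compute)
vi <- varimp(cf, conditional = TRUE)
dotchart(sort(vi), main = "Conditional Variable Importance")
```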
# apply across raster values, to all polygons #calculates prop outside range for _all_ polys polygons.to.check <- data.frame(data.table::data.table(d.continuous)[, flagPolygons(.SD), by = c(mu.col, 'variable')]) poly.check.wide <- as.data.frame(data.table::dcast(data.table::setDT(polygons.to.check), pID ~ variable, value.var = 'prop.outside.range')) #replace NAs with zero (no samples outside 5-95% percentile range) poly.check.wide[is.na(poly.check.wide)] = 0 mu.check <- merge(mu, poly.check.wide, by = 'pID', all.x = TRUE) names(mu.check)[-1] <- abbreviateNames(mu.check) # fix names for printing names(polygons.to.check)[1] <- mu.col
#save a SHP file with prop.outside.range for each polygon and raster data source combination if (nrow(polygons.to.check) > 0) { try(writeVector(mu.check, paste0(file.path('output', shp.qc.fname), ".shp"), overwrite = TRUE)) try(write.csv(mu.check, paste0('output/', csv.qc.fname))) }
# save SHP with any un-sampled polygons if (length(sampling.res$unsampled.ids) > 0) { try(terra::writeVector(mu[sampling.res$unsampled.ids, ], paste0(file.path('output', shp.unsampled.fname), ".shp"), overwrite = TRUE)) } # compute summaries by polygon*variable poly.stats <- data.frame(data.table::data.table(d.continuous)[, f.summary(.SD, p = p.quantiles), by = c('pID', 'variable')]) # convert to wide format, keeping median value poly.stats.wide <- as.data.frame(data.table::dcast(data.table::setDT(poly.stats), pID ~ variable, value.var = 'Q50')) # compute summaries by variable mucol.stats <- data.frame(data.table::data.table(d.continuous)[, f.summary(.SD, p = p.quantiles), by = c(mu.col, 'variable')]) mucol.stats.wide <- data.frame(.id = unique(d.continuous[[mu.col]])) names(mucol.stats.wide)[1] <- mu.col res <- lapply(split(mucol.stats, mucol.stats$variable), function(vardata) { varname <- abbreviate(gsub('[^[:alpha:]_]', '', unique(vardata$variable)), minlength = 10) names(vardata) <- paste0(varname,"_", names(vardata)) names(vardata)[1] <- mu.col mucol.stats.wide <<- merge(mucol.stats.wide, vardata) }) # # convert to wide format, keeping log_abs_madm # poly.stats.wide.2 <- dcast(poly.stats, pID ~ variable, value.var = 'log_abs_madm') # add a suffix to variable names so that we can combine # names(poly.stats.wide.1)[-1] <- paste0(names(poly.stats.wide.1)[-1], '_med') # names(poly.stats.wide.2)[-1] <- paste0(names(poly.stats.wide.2)[-1], '_var') ## TODO: pending further review # join median + MADM stats for each polygon # poly.stats.wide <- join(poly.stats.wide.1, poly.stats.wide.2, by='pID') # poly.stats.wide <- poly.stats.wide.1 # save try(write.csv(poly.stats.wide, file = paste0('output/', csv.stats.fname), row.names = FALSE)) try(write.csv(mucol.stats.wide, file = paste0('output/', csv.mucol.stats.fname), row.names = FALSE)) ## join stats to map unit polygon attribute table mu <- merge(mu, poly.stats.wide, by = 'pID', all.x = TRUE) names(mu)[-1] <- abbreviateNames(mu) # remove internally-used MU ID mu$.id <- NULL # save to file try(terra::writeVector(mu, paste0(file.path('output', shp.stats.fname), ".shp"), overwrite = TRUE)) ## TODO: how do you trap warnings within a .Rmd knitting session? # save warnings to log file # cat(warnings(), file = 'output/warning-log.txt')
A variety of output files are created by this report (in the `output` folder). The output file names can be adjusted in `config.R`. The quantiles for each continuous variable by map unit symbol are written to a CSV file that looks like this (first 6 columns):
kable(head(mucol.stats.wide[, 1:pmin(6, length(names(mucol.stats.wide)))]), row.names = FALSE)
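The CSV summaries can also be pulled back into R for further analysis; the file name below follows the default naming pattern, but the map unit symbols in it are hypothetical.

```r
# Hedged example: read the map unit summary CSV back into R
# 'mucol-stats-1095_1096.csv' is a made-up name; yours depends on mu.set / config.R
mucol.stats <- read.csv(file.path("output", "mucol-stats-1095_1096.csv"), check.names = FALSE)
str(mucol.stats)
```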
A shapefile is generated ("polygons-with-stats-XXX", where "XXX" is the set of map unit symbols listed in `config.R`) that contains summaries computed by polygon. Polygons are uniquely identified by the `pID` column. Median raster values are given, with data source names abbreviated to conform to the limitations of DBF files:
kable(head(poly.stats.wide), row.names = FALSE)
In the case of un-sampled polygons (very small delineations or too low sampling density), an additional shapefile will be saved in the output folder with a prefix of "un-sampled-". This file contains those polygons that were not allocated any sampling points and thus not included in the report summaries.
A shapefile is generated each time the report is run ("poly-qc-XXX", where "XXX" is the set of map unit symbols listed in `config.R`) that contains the proportion of samples outside the 5-95% percentile range. In the attribute table there is one column per raster data source and one row per map unit delineation.
The 5th-95th percentile range for each map unit is derived from the samples across all polygons with the corresponding map unit symbol. The proportion of samples outside this range within individual polygons is given for each (continuous) raster data source. Approximately 10% of samples across the extent of a particular map unit are expected to fall outside the 5th-95th percentile range.
Delineations with more than 10 - 15% of samples outside the range for a data source may need to be investigated. It is expected that some delineations will occur at the margins of the map unit extent. Expert judgement is required to determine whether action should be taken to resolve any potentially problematic delineations.
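The per-polygon QC metric can be reproduced conceptually as follows; `flagPolygons()` in `custom.R` implements the version actually used by the report and may differ in details. `d.continuous` and `mu.col` are assumed from the sampling chunk above.

```r
# Hedged sketch: proportion of samples per polygon outside the map unit-wide
# 5th-95th percentile range, for a single variable
library(data.table)

one.var <- data.table::as.data.table(d.continuous)[variable == variable[1]]

# map unit-wide percentile limits, computed per map unit symbol
one.var[, `:=`(lo = quantile(value, 0.05, na.rm = TRUE),
               hi = quantile(value, 0.95, na.rm = TRUE)), by = mu.col]

# proportion of samples outside those limits, computed per polygon
qc <- one.var[, .(prop.outside.range = mean(value < lo | value > hi, na.rm = TRUE)), by = pID]
head(qc[order(-prop.outside.range)])
```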
Here is a sample output table for a few polygons. Sorting these columns by magnitude in your GIS software offers a quick way to find potentially problematic areas.
kable(head(mu.check[complete.cases(as.data.frame(mu.check)), ]), row.names = FALSE)
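As an alternative to sorting in GIS software, the same ranking can be done in R. The column name below is a placeholder: the QC columns carry abbreviated raster variable names, so substitute one of the names actually present in `mu.check`.

```r
# Hedged example: rank delineations by proportion-outside-range in R
# "slope_pct" is a placeholder column name; check names(mu.check) for the real ones
qc.df <- as.data.frame(mu.check)
head(qc.df[order(-qc.df[["slope_pct"]]), c("pID", "slope_pct")], 10)
```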
This document is based on `sharpshootR` version `r utils::packageDescription("sharpshootR", field = "Version")`.
Report configuration and source code are hosted on GitHub.
Sampling time: `r .sampling.time`