knitr::opts_chunk$set(warning=FALSE, message=FALSE, eval=FALSE, results='hide')

This run was carried out on shared workstation SSD00887. Covariates were chosen by expert judgement, with a slightly different mix from run 01. R has been updated to 3.4.0-patched with all packages current to 28 May 2017. DSMART has been modified to use the sf package wherever possible and to sample from preallocated lists of the cell numbers that intersect each polygon. Class allocation has been modified to adhere more closely to the randomisation produced by the Dirichlet distribution. Covariates are converted to a rasterBrick, as this offers substantial speed gains over a rasterStack for raster operations, particularly extract.

This run includes the T-factor Dirichlet modifier described in Odgers (2014) - this was missing from all earlier R implementations of the DSMART method. Sample n has also been boosted to give it a chance to have an effect.
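
A minimal sketch of the T-factor's effect, using illustrative numbers for a polygon mapped as 60/30/10: the Dirichlet alpha parameters are the mapped percentages multiplied by t_factor, so larger t_factor values make each realisation's class proportions sit closer to the mapped proportions.

set.seed(42)
percs <- c(60, 30, 10)
round(gtools::rdirichlet(3, percs), 3)          # no T-factor: proportions vary widely between draws
round(gtools::rdirichlet(3, percs * 1000), 3)   # T-factor = 1000: draws sit close to 0.6 / 0.3 / 0.1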

library(sp)
library(sf)
library(raster)
library(rgdal)
library(rgeos)
library(maptools)
library(doParallel)
library(gtools) 
library(snow)
library(C50)
library(tidyverse)
library(rosm)
library(ggspatial)

options(stringsAsFactors = FALSE)
rasterOptions(tmpdir = 'C:/DATA/DSMART/temp') # use starship for very large inputs
# setup
covariate_dir <- file.path(getwd(), 'covariates')
polygon_dir   <- file.path(getwd(), 'polygons')
desired_runs  <- 100L
sites_per_km2 <- 25L
tfct <- 1000

# it may be prudent to set tempdir() to a larger space than default 
# e.g. (C:/user/username/appdata/local/temp) on windows
#write("TMPDIR = 'C:/DATA/DSMART/temp'", file=file.path(Sys.getenv('R_USER'), '.Renviron'))

# read in and brick covariates (offers an important speed gain over stack later)
DSMART_covariates <- as.list(list.files(covariate_dir, pattern = '_3577\\.tif$', full.names = TRUE))
covariate_stack   <- stack(DSMART_covariates)
if (file.exists(file.path(covariate_dir, 'covariate_brick.tif'))) {
  covariate_brick <- brick(file.path(covariate_dir, 'covariate_brick.tif'))
  } else {
    covariate_brick <- brick(covariate_stack,
                             filename  = file.path(covariate_dir, 'covariate_brick.tif'),
                             datatype  = 'FLT4S', 
                             overwrite = TRUE)
  }
names(covariate_brick) <- names(covariate_stack)
print(paste0('Number of covariates loaded = ', nlayers(covariate_brick)))
print(names(covariate_brick))
rm(covariate_stack)

# Read in the polygon/composition data (edit layer and mutate params as needed)
polygons <- st_read(file.path(polygon_dir, 'WT_DSMART_polygons_simplesp.gpkg'), quiet = TRUE) 

#st_write(polygons, file.path(getwd(), 'MW2_polys_with_NUID.gpkg'), quiet = TRUE)

# quick plot (nb ggplot won't be able to handle files the size of most DSMART inputs)
plot(covariate_brick[[1]], main = 'DSMART input polygons over covariate #1')
plot(st_geometry(polygons), add = TRUE)

Prepare Polygon Data

This function performs a couple of QC checks, determines the number of samples required for each polygon, and extracts a list of sampleable cell numbers per polygon.
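
As a worked example of the per-polygon sample count (illustrative values: three classes mapped at 60/30/10% in a 2.2 km2 polygon, with area_rate and floor_rate both 25), the largest of the three candidate counts wins:

percs     <- c(60, 30, 10)
n_classes <- 3
samp_min  <- ceiling(max(1, max(percs) / min(percs)) * n_classes)  # 18: enough samples to resolve the rarest class
samp_area <- ceiling(2.2 * 25)                                     # 55: area-based sampling rate
max(samp_min, samp_area, 25)                                       # 55 samples for this polygon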

dsmart_prep <- function(indata = NULL, covariates = NULL, id_field = NULL, area_rate = NULL, floor_rate = NULL) {
  # coerce to sf
  indata <- if(is(indata, 'sf') == FALSE) {
    st_as_sf(indata)
  } else {
    indata
  }

  # remove any rows where classes are missing percentages and vice versa
  missing_data <- split(indata, 1:nrow(indata)) %>%
    map(function(pol) {
      ifelse(sum(!is.na(pol) & grepl('CLASS', names(pol)) == T) != 
               sum(!is.na(pol) & grepl('PERC', names(pol)) == T), FALSE, TRUE)
    })

  indata2 <- if(any(missing_data == FALSE)) {
    bad_rows <- which(missing_data == FALSE)
    IDStr_m  <- toString(unlist(indata[bad_rows, id_field, drop = TRUE]))
    message(paste0('Removing the following polygons as essential data is missing: ', IDStr_m))
    dplyr::filter(indata, !(row_number() %in% bad_rows))
  } else {
    IDStr_m <- NA
    indata2 <- indata
  }

  # warn for polygons whose class percentages sum to less than 100%
  short_props <- split(indata2, 1:nrow(indata2)) %>%
    map(function(pol) {
      pol_percs <- as.vector(na.omit(unlist(pol[, c(grep('PERC', names(pol))), drop = TRUE])))
      sum(as.numeric(pol_percs)) < 100
    })
  if (any(short_props == TRUE)) {
    short_rows <- which(short_props == TRUE)
    IDStr_s    <- toString(unlist(indata2[short_rows, id_field, drop = TRUE]))
    message(paste0('The following polygons have proportions that add to < 100%: ', IDStr_s))
  } else {
    IDStr_s <- NA
  }

  # calculate n_samples and remove polygons where no cell centers intersect
  indata_prepped <- indata2 %>%
    mutate(area_sqm = as.numeric(st_area(.))) %>%
    # number of soil classes in polygon
    mutate(n_classes = split(., 1:nrow(.)) %>% 
             map_int(function(x) {
               sum(!is.na(x) & grepl('CLASS', names(x)) == T)
             }) %>% 
             as.vector() # ditches names
           ) %>%
    # sample number required
    mutate(n_samples = split(., 1:nrow(.)) %>%
             map_int(function(y) {
               poly_percs    <- as.vector(na.omit(unlist(y[, c(grep('PERC',  names(y))), drop = TRUE])))
               poly_nclass   <- as.integer(y[, 'n_classes', drop = TRUE])
               samp_min      <- ceiling(max(1, (max(poly_percs) / min(poly_percs))) * poly_nclass)
               samp_area     <- ceiling((as.numeric(y[, 'area_sqm', drop = TRUE]) / 1000000) * area_rate)
               # a floor (floor_rate) helps ensure that Dirichlet allocations are reasonable in most model runs
               samp_n        <- as.integer(max(samp_min, samp_area, floor_rate)) 
             } ) %>% as.vector() 
           ) %>%
    # cell numbers that intersect the polygon (these are subset randomly on each model run)
    mutate(intersecting_cells = split(., 1:nrow(.)) %>%
             map(function(g) {
                ## BOTTLENECK
                cells <- as.integer(unlist(raster::cellFromPolygon(na.omit(covariates), 
                                                                   as(g, 'Spatial'))))
                if (length(cells) == 0) { NA } else { cells }
             }) %>% stats::setNames(., NULL) # can't cheat with as.vector here
           ) %>%
    filter(!(is.na(intersecting_cells)))

    IDstr_r <- if (nrow(indata2) > nrow(indata_prepped)) {
      removed   <- setdiff(indata2[ , id_field, drop = TRUE], indata_prepped[ , id_field, drop = TRUE])
      n_removed <- length(removed)
      IDstr     <- toString(unlist(removed))
      message(paste0(IDstr, ' excluded - no cell intersections.'))
      IDstr
    } else { 
      NA
    }

    prep_report <- list('missing_data' = IDStr_m,
                        'under_100'    = IDStr_s,
                        'no_cells'     = IDstr_r)


    assign('prep_report', prep_report, envir = .GlobalEnv)
    return(indata_prepped)
}

t <- proc.time()
polygons_prepped <- if (file.exists(file.path(polygon_dir, 'polygons_prepped.rds'))) {
  readRDS(file.path(polygon_dir, 'polygons_prepped.rds'))
} else {
    dsmart_prep(indata = polygons, covariates = covariate_brick, id_field = 'NUID', area_rate = sites_per_km2, floor_rate = sites_per_km2)
}
t_prep <- proc.time() - t

# backup: NB can't write to shp or gpkg because of the list-column
saveRDS(polygons_prepped, file.path(polygon_dir, 'polygons_prepped.rds'))
#polygons_prepped <- readRDS(file.path(polygon_dir, 'polygons_prepped.rds'))

Generate n model realisations

Note that this function doesn't return anything to the workspace, as the volume of data would max out the RAM on most machines. Outputs are all written to disk.
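
Before the full function, here is a stripped-down sketch of how a single realisation allocates classes to a polygon's sample cells (illustrative values: a polygon mapped as 60/30/10 across classes A/B/C with 20 sample cells; the real logic below works from the PERC and CLASS columns):

set.seed(1)
percs   <- c(60, 30, 10)
classes <- c('A', 'B', 'C')
n_samp  <- 20
dprops  <- as.vector(gtools::rdirichlet(1, percs * tfct))   # realised proportions for this run
alloc   <- rep(classes, times = ceiling(dprops * n_samp))   # ceiling can overshoot n_samp slightly
alloc   <- sample(alloc, size = n_samp, replace = FALSE)    # shuffle and trim back to n_samp
table(alloc)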

dsmart_gen_reals <- function (covariates    = NULL, indata = NULL, obsdat = NULL,
                              id_field      = NULL, reals  = NULL, t_factor = 10000,
                              cpus = 1, write_files   = c('rds_only', 'all'), 
                              write_samples = FALSE, resume_from = NULL) {
  # resolve write_files to a single choice ('rds_only' when not specified)
  write_files <- match.arg(write_files)

  # output directories
  dir.create('dsmartOuts',         showWarnings = F)
  dir.create('dsmartOuts/rasters', showWarnings = F)
  dir.create('dsmartOuts/models',  showWarnings = F)
  strr   <- file.path(getwd(), 'dsmartOuts', 'rasters')
  strm   <- file.path(getwd(), 'dsmartOuts', 'models')

  p_crs    <- st_crs(indata)$proj4string

  # coerce to sf
  indata <- if(is(indata, 'sf') == FALSE) {
    st_as_sf(indata)
  } else {
    indata
  }

  # set consistent factoring across model runs
  class_levels <- as.data.frame(indata) %>%
    select(matches('CLASS_')) %>%
    gather() %>%
    distinct(value) %>%
    na.omit() %>%
    unlist(., use.names = FALSE) %>%
    sort() %>%
    as.factor()

  pb <- txtProgressBar(min = 0, max = reals, style = 3)

  ## realisations
  start <- if (is.null(resume_from)) { 1 } else { resume_from }
  lapply(seq.int(start, reals), function(j) {

    beginCluster(cpus)

    sample_points <- split(indata, 1:nrow(indata)) %>%
    map(function(z) {
      poly_cells    <- unlist(z[ , 'intersecting_cells', drop = TRUE], use.names = FALSE)
      poly_nsample  <- unlist(z[ , 'n_samples',          drop = TRUE], use.names = FALSE)
      poly_cellsamp <- if (length(poly_cells) <= poly_nsample) {
        poly_cells
        } else {
          sample(poly_cells, size = poly_nsample, replace = FALSE)
        }
      poly_percs    <- as.vector(na.omit(unlist(z[, c(grep('PERC',  names(z))), drop = TRUE])))
      poly_dirprops <- as.vector(gtools::rdirichlet(1, poly_percs * t_factor))
      poly_classes  <- as.vector(na.omit(unlist(z[, c(grep('CLASS', names(z))), drop = TRUE])))
      poly_alloc <- mapply(function(class, dpn) {
            rep(class, times = dpn)
          },  
          class = poly_classes, 
          dpn   = ceiling(poly_dirprops * length(poly_cellsamp))
        ) %>%
        unlist() %>%
        # shuffle randomly and make sure n is what it should be (sometimes you get n + 1 above)
        sample(., size = length(poly_cellsamp), replace = FALSE) %>% 
        as.vector()

      # tried using a matrix here - speed boost insignificant and code harder to read
      poly_spoints  <- data.frame('NUID'  = as.integer(z[, id_field, drop = TRUE]),
                                  'PTID'  = 1:length(poly_cellsamp),
                                  'CLASS' = poly_alloc,
                                  'CELL'  = poly_cellsamp)
    })

    # get all the sampling data for all the polygons for this realisation into one spdf
    all_samplepoints <- do.call('rbind', sample_points)

    # sample covariates
    cov_sample <- raster::extract(covariates, y = all_samplepoints$CELL)  ## BOTTLENECK
    all_samplepoints <- cbind(all_samplepoints, cov_sample)

    # add known locations
    all_samplepoints <- if (!is.null(obsdat)) {
      od <- obsdat %>%
        mutate(NUID = as.integer(paste0(as.integer(as.factor(PROJECT_CODE)), POLY_NO)),
               PTID = as.numeric(paste0(99, SAMP_NO)),
               CELL = cellFromXY(covariates, as.matrix(.[, c('SAMP_X', 'SAMP_Y')]))) %>%
        select(NUID, PTID, CLASS, CELL)
      od_sample <- raster::extract(covariates, y = od$CELL)
      od_sample <- cbind(od, od_sample)

      # make sure these cells haven't been randomly sampled already
      # if so, drop the randomly generated row
      all_samplepoints <- all_samplepoints %>%
        filter(!(CELL %in% c(od$CELL))) %>%
        rbind(od_sample)
      } else {
        all_samplepoints
      }

    # force all outputs to be on the same scale eg. output map value 1 always equals factor level 1
    all_samplepoints$CLASS <- factor(all_samplepoints$CLASS, levels = levels(class_levels))

    # set up model input
    model_input <- all_samplepoints[complete.cases(all_samplepoints), ] %>%
      dplyr::select( -NUID, -PTID, -CELL)

    # generate decision tree 
    res <- C5.0(x = model_input[, !(names(model_input) %in% 'CLASS')], y = model_input$CLASS)

    # make prediction map
    r1 <- clusterR(covariates, raster::predict, args = list(res), datatype = 'INT2S')

    # factorise r1
    lookup     <- data.frame("ID" = as.integer(as.factor(res$levels)), "CLASS" = as.factor(res$levels)) 
    levels(r1) <- lookup

    if (write_files == 'rds_only') {

      saveRDS(res, file.path(strm, paste0('C5_model_', j, '.rds')))
      suppressWarnings(saveRDS(readAll(r1), file.path(strr, paste0('map_', j, '.rds'))))

      } else {

        saveRDS(res, file.path(strm, paste0('C5_model_', j, '.rds')))
        suppressWarnings(saveRDS(readAll(r1), file.path(strr, paste0('map_', j, '.rds'))))

        # write decision tree from this realisation to text
        out <- capture.output(summary(res))
        f2  <- file.path(strm, paste0('C5_model_', j, '.txt'))
        cat(out, file = f2, sep = "\n", append = TRUE)

        # write probability map from this realisation to GeoTIFF (lookup values are embedded)
        nme <- file.path(strr, paste0('map_', j, '.tif'))
        writeRaster(r1, filename = nme, format = "GTiff", overwrite = T, datatype = "INT2S", 
                    NAflag = -9999)

        # may as well write lookup as embedded lookups don't work in QGIS
        if (j == 1) {
          # Excel is the default csv viewer for most people, but it doesn't like csv files
          # where column 1 is called 'ID'
          names(lookup) <- c('VALUE', 'CLASS')
          write.table(lookup, file.path(strr, 'class_lookup.csv'), 
                      col.names = TRUE,  row.names = FALSE,
                      quote     = FALSE, sep       = ",")
        }
    }

    if (write_samples == TRUE) {
      dir.create('dsmartOuts/samples', showWarnings = F)
      strd <- file.path(getwd(), 'dsmartOuts', 'samples')
      # spatialise (sf point object)
      allsamp_sf <- all_samplepoints %>%
          mutate(geometry = st_sfc(lapply(CELL, function(x) {
                              st_point(as.vector(xyFromCell(covariates, x)))
                              }))
                 ) %>%
          st_as_sf(., crs = st_crs(indata)$proj4string)

      if (write_files == 'rds_only') {
        saveRDS(allsamp_sf, file.path(strd, paste0('samples_',  j, '.rds')))

        } else {

          saveRDS(allsamp_sf, file.path(strd, paste0('samples_',  j, '.rds')))

          # NB Don't change this to GPKG format yet - disk write time is crazy slow

          # make a lookup table for all_samplepoints covariate column names, because they're about
          # to get severely abbreviated for writing to shp
          cov_LUT_nm   <- file.path(strd, 'covariate_LUT.csv')
          cov_names    <- names(all_samplepoints[5:ncol(all_samplepoints)])
          cov_shpnames <- paste0('COV_', 1:length(cov_names))
          if (!file.exists(cov_LUT_nm)) {
                cov_LUT <- data.frame("COV_NAMES" = cov_names, "SHPCOL_NAMES" = cov_shpnames)
                write.table(cov_LUT, file = cov_LUT_nm, sep = ', ', 
                            quote = FALSE, col.names = TRUE, row.names = FALSE) }

          names(allsamp_sf)[5:(ncol(allsamp_sf) - 1)] <- cov_shpnames
          sf_name    <- paste0('samples_', j, '.shp')
          write_sf(allsamp_sf, 
                   file.path(strd, sf_name),
                   driver = 'ESRI Shapefile',
                   delete_layer = TRUE)
        }
    }

    endCluster()
    setTxtProgressBar(pb, j)
    } )

  close(pb)
  message(paste0(Sys.time(), ': DSMART outputs can be located at ', 
                 file.path(getwd(), 'dsmartOuts')))

}

t    <- proc.time()
dsmart_gen_reals(covariates = covariate_brick, indata = polygons_prepped, id_field = 'NUID', 
                 reals = desired_runs, t_factor = tfct, cpus = 6, 
                 write_files = 'all', write_samples = TRUE)
t_gr <- proc.time() - t

print(paste0('This run took ', round(t_gr[3] / 60 / 60 , 2), ' hours to complete.'))

Process Model Realisations

Now that the run is complete, output files must be read in and collated.

save.image(file.path(getwd(), 'run_results.RData'))

# may need to restart R here and reload the above image plus the packages - frees up space in tmp

DSMARTouts <- as.list(list.files(paste0(getwd(), '/dsmartOuts/rasters'),
                                 pattern = '\\.tif$', full.names = T))
reals <- stack(DSMARTouts) # brick won't speed up the next bit, don't bother
print(paste0('Number of prediction maps loaded = ', nlayers(reals)))

# create lookup table from RAT
LUT <- levels(reals[[1]])[[1]]
LUT$category <- as.character(LUT$category)
LUT <- LUT[-c(1) , ]
names(LUT) <- c("ID", "CLASS")

This next function takes the desired_runs model realisations and stacks them, then tallies how many times each soil was predicted on a given pixel. It then reorders those tallies so that the most-probable soil is in layer 1, the second most probable in layer 2, etc. It also outputs a stack of the probabilities associated with each of those most-probable surfaces (i.e. a strong model has a lot of values close to 1 in layer 1). Optionally, it can output the intermediate files that lead to these stacks. Outputs are written to file as multiband GeoTIFFs, which can be large.
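
A per-cell walk-through of that logic, assuming four candidate classes and 20 model runs (values are illustrative):

preds  <- c(rep(2L, 5), rep(3L, 8), rep(4L, 7))  # class IDs predicted for one pixel across 20 runs
counts <- tabulate(preds, nbins = 4)             # 0 5 8 7
props  <- round(counts / 20, 3)                  # 0.00 0.25 0.40 0.35
sort(props,  decreasing = TRUE)                  # 0.40 0.35 0.25 0.00 -> class_probabilities_ranked
order(props, decreasing = TRUE)                  # 3 4 2 1             -> class_predictions_ranked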

dsmart_tally_reals <- function(realstack = NULL, lookup = NULL, cpus = 1, keep_tallies = TRUE) {
  if (!dir.exists("dsmartOuts/summaries")) {
      dir.create("dsmartOuts/summaries", showWarnings = F)
  }
  strs    <- file.path(getwd(), 'dsmartOuts', 'summaries')
  classes <- nrow(lookup)
  realn   <- nlayers(realstack)

  ### cell by cell calc functions ###

  # class_count produces an integer vector counting the number of times a given soil was 
  # predicted on this pixel
  # e.g. 0 5 8 0 == soil 1 was not predicted, soil 2 was predicted 5 times, soil 3 was predicted 
  # 8 times, soil 4 was not predicted.
  class_count <- function(x) { 
    if (is.na(sum(x))) {
      rep(NA, classes)
    } else {
      tabulate(x, nbins = classes)
    }
  }

  # class_prop takes the vector produced by class_count and normalises it against the 
  # total number of model runs
  # e.g. for 20 runs, 0 5 8 0 becomes 0 0.25 0.4 0
  class_prop  <- function(x) {round((x / realn), 3)}

  # stack_cell_sort sorts the elements of class_order or probs_order from largest to smallest.
  # e.g. 0 0.25 0.4 0 becomes [1] 0.4 0.25 0 0
  stack_cell_sort <- function(x) {
    if (is.na(sum(x))) {
      rep(NA, classes)
    } else { 
      sort(x, decreasing = TRUE, na.last = TRUE)
    }
  }


  # stack_cell_order returns a vector of the position of the elements of probs_order
  # from largest to smallest.
  # e.g. 0 0.25 0.4 0 becomes [1] 3 2 1 4
  # these values correspond to the lookup ID column, so they can be linked to soil class name
  stack_cell_order <- function(x) {
    if (is.na(sum(x))) {
      rep(NA, classes)
    } else { 
      order(x, decreasing = TRUE, na.last = TRUE)
    }
  }

  # note that it's easiest to think about the results of stack_cell_order as the names
  # of stack_cell_sort  e.g. [1] 3 2 1 4 or [1]  3   2   1 4
  #                              8 5 0 0        0.4 0.25 0 0

  ###
  beginCluster(cpus)
  assign('classes', classes, envir = .GlobalEnv)
  assign('realn',   realn,   envir = .GlobalEnv)

  pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3)

  # 1. get a stack of predicted class counts (NA cells accounted for)
  counts <- clusterR(x         = realstack,
                     fun       = calc, 
                     args      = list(fun = class_count), 
                     export    = 'classes', 
                     datatype  = 'INT2S')

  if (keep_tallies == TRUE) { 
    writeRaster(counts,
                filename  = file.path(strs, 'class_counts.tif'),
                datatype  = 'INT2S',
                format    = 'GTiff',
                NAflag    = -9999,
                overwrite = TRUE)
    assign('counts', counts, envir = .GlobalEnv) 
  }

  setTxtProgressBar(pb, 40)

  # 2. express that stack as a probability
  probs <- clusterR(x         = counts,
                    fun       = calc,
                    args      = list(fun = class_prop),
                    export    = 'realn',
                    datatype  = 'FLT4S')

  if (keep_tallies == TRUE) { 
    writeRaster(probs,
                filename  = file.path(strs, 'class_props.tif'),
                datatype  = 'FLT4S',
                format    = 'GTiff',
                NAflag    = -9999,
                overwrite = TRUE)
    assign('probs', probs, envir = .GlobalEnv)
  }

  setTxtProgressBar(pb, 60)

  # 3. sort probs by most to least probable
  sorted  <- clusterR(x         = probs,
                      fun       = calc, 
                      args      = list(fun = stack_cell_sort),
                      filename  = file.path(strs, 'class_probabilities_ranked.tif'),
                      datatype  = 'FLT4S',
                      format    = 'GTiff',
                      NAflag    = -9999,
                      overwrite = TRUE)
  assign('sorted', sorted, envir = .GlobalEnv)

  setTxtProgressBar(pb, 80)

  # 4. order probs by most to least probable
  ordered <- clusterR(x        = na.omit(probs),
                      fun      = calc, 
                      args     = list(fun = stack_cell_order),
                      datatype = 'INT2S')

 # give the raster names for these values
  for (i in 1:nlayers(ordered)) { 
    levels(ordered[[i]]) <- lookup
  }
  writeRaster(ordered,
              filename  = file.path(strs, 'class_predictions_ranked.tif'),
              datatype  = 'INT2S',
              format    = 'GTiff',
              NAflag    = -9999,
              overwrite = TRUE)
  assign('ordered', ordered, envir = .GlobalEnv)

  if (keep_tallies == FALSE) { 
    rm(counts)
    rm(probs) 
    }

  setTxtProgressBar(pb, 100)
  close(pb)
  endCluster()

}

t       <- proc.time()
dsmart_tally_reals(realstack = reals, lookup = LUT, cpus = 6, keep_tallies = TRUE)
t_tally <- proc.time() - t

Most-likely Soils Maps

The outputs produced above can be split into individual files. The following function creates n most-probable soil class maps, and (optionally) a probability surface for each.

dsmart_nmp <- function(class_predictions = NULL, class_probabilities = NULL, lookup = NULL, nmaps = 1) {
  if (!dir.exists('dsmartOuts/summaries/most_probable_maps/')) { 
    dir.create('dsmartOuts/summaries/most_probable_maps/', showWarnings = F)
  }
  mp_dir <- file.path(getwd(), 'dsmartOuts', 'summaries', 'most_probable_maps') 

  # only unstack the number of maps requested
  if (!is.null(class_predictions)) {
      suppressWarnings(map_list <- unstack(class_predictions[[1:nmaps]]))
  } else {
      stop('class_predictions raster stack has not been specified')
  }

  map_list <- lapply(map_list, ratify)

  outs_list <- mapply(FUN = function(x, i) {
    writeRaster(x, 
                filename  = file.path(mp_dir, paste0('mostlikely_', i ,'.tif')),
                format    = 'GTiff',
                NAflag    = -9999,
                datatype  = 'INT2S',
                overwrite = TRUE)
    },
    x = map_list, i = seq_len(nmaps)
    )

  assign('most_prob_maps', outs_list, envir = .GlobalEnv)
  message(paste0(nmaps, ' most-likely soils maps produced.'))

  ### optional probability surfaces (only produced when class_probabilities is supplied)
  if (!is.null(class_probabilities)) {
    suppressWarnings(probmap_list <- unstack(class_probabilities[[1:nmaps]]))

    probsouts_list <- mapply(FUN = function(x, i) {
      writeRaster(x,
                  filename  = file.path(mp_dir, paste0('mostlikely_prob_', i, '.tif')),
                  format    = 'GTiff',
                  datatype  = 'FLT4S',
                  NAflag    = -9999,
                  overwrite = TRUE)
      },
      x = probmap_list,
      i = seq_len(nmaps)
      )

    assign('most_prob_prob_maps', probsouts_list, envir = .GlobalEnv)
    message(paste0(nmaps, ' probability maps produced.'))
  }

}

t     <- proc.time()
dsmart_nmp(class_predictions = ordered, class_probabilities = sorted, lookup = LUT, nmaps = 3)
t_nmp <- proc.time() - t

Class Probability Surfaces

Probability surfaces for each soil class can also be produced. This function requires the 'probs' output from dsmart_tally_reals (using option keep_tallies = TRUE).

dsmart_classmaps <- function(probstack = NULL, lookup = NULL, cpus = 1) {
  if (!dir.exists('dsmartOuts/summaries/class_maps/')) {
    dir.create('dsmartOuts/summaries/class_maps/', showWarnings = F)
  }
  class_dir <- file.path(getwd(), 'dsmartOuts', 'summaries', 'class_maps')

  beginCluster(cpus)
  suppressWarnings(probs_list <- unstack(probstack))

  # name each layer for its soil class so the output filenames are meaningful
  for (i in seq_along(probs_list)) {
    names(probs_list[[i]]) <- as.character(lookup[i, 2])
  }

  prob_tifs <- lapply(probs_list, function(x) {
    writeRaster(x, 
                filename  = file.path(class_dir, paste0(names(x), "_probability.tif")),
                format    = "GTiff",
                NAflag    = -9999,
                datatype  = 'FLT4S',
                overwrite = TRUE)
  })
  endCluster()
}
t    <- proc.time() 
dsmart_classmaps(probstack = probs, lookup = LUT, cpus = 6)
t_cm <- proc.time() - t

Confusion Index

To help evaluate DSMART's performance, some additional rasters can be generated. The first is called the 'confusion index', which is a measure of the probability gap between the first and second most probable soils. Ideally, the gap is large, implying that the most-probable soil is far more probable than any other.
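
A quick numeric check of the confusion index, CI = 1 - (p1 - p2), where p1 and p2 are a pixel's first and second most probable class proportions (values are illustrative):

1 - (0.80 - 0.10)   # 0.30: a clear winner, low confusion
1 - (0.40 - 0.35)   # 0.95: a near tie, high confusion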

dsmart_ci <- function(class_probabilities = NULL, cpus = 1) {

  conf_index  <- function(x) {1 - (x[[1]] - x[[2]])}

  if (!dir.exists("dsmartOuts/evaluation")) {
      dir.create("dsmartOuts/evaluation", showWarnings = F)
  }
  strs    <- file.path(getwd(),'dsmartOuts', 'evaluation')

  beginCluster(cpus)
  confus_ind  <- clusterR(na.omit(class_probabilities),
                          fun       = conf_index,
                          filename  = file.path(strs, 'confusion_index.tif'),
                          datatype  = 'FLT4S',
                          NAflag    = -9999,
                          overwrite = TRUE)
  assign('confus_ind', confus_ind,   envir = .GlobalEnv)
  endCluster()

}

t    <- proc.time()
dsmart_ci(class_probabilities = sorted, cpus = 6) 
t_ci <- proc.time() - t

Number of soils predicted

In practice, for many pixels the most probable soil is often a close race between two or even more possible classes. This can happen when a number of soil classes are very similar, or where DSMART is having trouble making effective predictions.

A second measure of DSMART's certainty is the number of different soils predicted on a given pixel - ideally, this number is low. DSMART outputs also have a certain amount of 'noise' - soils predicted only a few times on a given pixel during the n model runs, which can safely be disregarded. This means we can set a cutoff level below which counts are ignored.
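
For example (illustrative counts for one pixel over 100 runs, with noise_cutoff = 0.1, so only classes predicted more than ceiling(100 * 0.1) = 10 times are counted):

cell_counts <- c(0, 3, 68, 25, 4)
length(cell_counts[cell_counts > 0])                    # 4 classes predicted at least once
length(cell_counts[cell_counts > ceiling(100 * 0.1)])   # 2 classes predicted above the noise cutoff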

dsmart_npred <- function(class_counts = NULL, cpus = 1, noise_cutoff = 0.1, nreals = NULL) {
  if (is.null(nreals)) {
    stop('total number of DSMART model runs must be supplied.')
  }

  if (!dir.exists('dsmartOuts/evaluation')) {
      dir.create('dsmartOuts/evaluation', showWarnings = F)
  }
  strs <- file.path(getwd(), 'dsmartOuts', 'evaluation')

  # per-cell functions
  # number of distinct classes predicted at least once on this cell
  n_classes_predicted <- function(x) {
    ifelse(is.na(sum(x)), NA, length(x[x > 0]))
  }

  # as above, with 'noise' removed - only classes predicted on > x% of model runs
  n_classes_predicted_x <- function(x) {
    ifelse(is.na(sum(x)), NA, length(x[x > (ceiling(nreals * noise_cutoff))]))
  }

  beginCluster(cpus)
  assign('noise_cutoff', noise_cutoff, envir = .GlobalEnv)
  assign('nreals',       nreals,       envir = .GlobalEnv)

  npred <- clusterR(class_counts,
                    fun       = calc,
                    args      = list(fun = n_classes_predicted),
                    filename  = file.path(strs, 'n_classes_predicted.tif'),
                    NAflag    = -9999,
                    datatype  = 'INT2S',
                    overwrite = TRUE)

  assign('n_classes_predicted', npred, envir = .GlobalEnv)

  npredx <- clusterR(class_counts,
                     fun       = calc, args = list(fun = n_classes_predicted_x),
                     export    = c('nreals', 'noise_cutoff'),
                     filename  = file.path(strs, paste0('n_classes_predicted_over_', 
                                                       (nreals * noise_cutoff), '.tif')),
                     datatype  = 'INT2S',
                     NAflag    = -9999,
                     overwrite = TRUE)

  assign('n_classes_predicted_x', npredx, envir = .GlobalEnv)
  endCluster()

}

t     <- proc.time()
dsmart_npred(class_counts = counts, cpus = 6, noise_cutoff = 0.1, nreals = desired_runs)
t_ncl <- proc.time() - t

The DSMART run is now complete.


save.image(file.path(getwd(), 'run_results.RData'))

