knitr::opts_chunk$set(warning=FALSE, message=FALSE, eval=FALSE, results='hide')
This run was carried out on shared workstation SSD00887. Covariates used are chosen by expert judgement, with a slightly different mix to run 01. R has been updated to 3.4.0patched with all packages current to 28 May 2017. DSMART has been modified to use the sf package wherever possible, to sample using preallocated lists of cell numbers that intersect each polygon. Class allocation has been modified to more closely adhere to the randomisation produced by using the dirichlet distribution Covariates are converted to a rasterBrick as this offers substantial speed gains on raster operations over rasterStack, particularly extract.
This run includes the T-factor dirichlet modifier described in Odgers (2014) - this was missing from all earlier R implementations of the DSMART method. Sample n has also been boosted to give it a chance to have an effect.
library(sp) library(sf) library(raster) library(rgdal) library(rgeos) library(maptools) library(doParallel) library(gtools) library(snow) library(C50) library(tidyverse) library(rosm) library(ggspatial) options(stringsAsFactors = FALSE) rasterOptions(tmpdir = 'C:/DATA/DSMART/temp') # use starship for very large inputs
# setup covariate_dir <- file.path(getwd(), 'covariates') polygon_dir <- file.path(getwd(), 'polygons') desired_runs <- 100L sites_per_km2 <- 25L tfct <- 1000 # it may be prudent to set tempdir() to a larger space than default # e.g. (C:/user/username/appdata/local/temp) on windows #write("TMPDIR = 'C:/DATA/DSMART/temp'", file=file.path(Sys.getenv('R_USER'), '.Renviron')) # read in and brick covariates (offers an important speed gain over stack later) DSMART_covariates <- as.list(list.files(covariate_dir, pattern = '\\_3577.tif$', full.names = T)) covariate_stack <- stack(DSMART_covariates) if (file.exists(file.path(covariate_dir, 'covariate_brick.tif'))) { covariate_brick <- brick(file.path(covariate_dir, 'covariate_brick.tif')) } else { covariate_brick <- brick(covariate_stack, filename = file.path(covariate_dir, 'covariate_brick.tif'), datatype = 'FLT4S', overwrite = TRUE) } names(covariate_brick) <- names(covariate_stack) print(paste0('Number of covariates loaded = ', nlayers(covariate_brick))) print(names(covariate_brick)) rm(covariate_stack) # Read in the polygon/composition data (edit layer and mutate params as needed) polygons <- st_read(file.path(polygon_dir, 'WT_DSMART_polygons_simplesp.gpkg'), quiet = TRUE) #st_write(polygons, file.path(getwd(), 'MW2_polys_with_NUID.gpkg'), quiet = TRUE) # quick plot (nb ggplot won't be able to handle files the size of most DSMART inputs) plot(covariate_brick[[1]], main = 'DSMART input polygons over covariate #1') plot(st_geometry(polygons), add = TRUE)
This function does a couple of QC checks and determines sample number per polygon, as well as extracting a list of sampleable cells per polygon.
dsmart_prep <- function(indata = NULL, covariates = NULL, id_field = NULL, area_rate = NULL, floor_rate = NULL) { # coerce to sf indata <- if(is(indata, 'sf') == FALSE) { st_as_sf(indata) } else { indata } # remove any rows where classes are missing percentages and vice versa missing_data <- split(indata, 1:nrow(indata)) %>% map(function(pol) { ifelse(sum(!is.na(pol) & grepl('CLASS', names(pol)) == T) != sum(!is.na(pol) & grepl('PERC', names(pol)) == T), FALSE, TRUE) }) indata2 <- if(any(missing_data == FALSE)) { bad_rows <- which(missing_data == FALSE) IDStr_m <- toString(unlist(indata[bad_rows, id_field, drop = TRUE])) message(paste0('Removing the following polygons as essential data is missing: ', IDStr_m)) dplyr::filter(indata, !(row_number() %in% bad_rows)) } else { IDStr_m <- NA indata2 <- indata } # warn for polygons adding up to <100% short_props <- split(indata2, 1:nrow(indata2)) %>% map(function(pol) { ifelse(sum(grepl('PERC', names(pol))) < 100, FALSE, TRUE) }) if (any(short_props == TRUE)) { short_rows <- which(short_props == TRUE) IDStr_s <- toString(unlist(indata2[short_rows, id_field, drop = TRUE])) message(paste0('The following polygons have proportions that add to < 100%: ', IDStr_s)) } else { IDStr_s <- NA indata2 <- indata2 } # calculate n_samples and remove polygons where no cell centers intersect indata_prepped <- indata2 %>% mutate(area_sqm = as.numeric(st_area(.))) %>% # number of soil classes in polygon mutate(n_classes = split(., 1:nrow(.)) %>% map_int(function(x) { sum(!is.na(x) & grepl('CLASS', names(x)) == T) }) %>% as.vector() # ditches names ) %>% # sample number required mutate(n_samples = split(., 1:nrow(.)) %>% map_int(function(y) { poly_percs <- as.vector(na.omit(unlist(y[, c(grep('PERC', names(y))), drop = TRUE]))) poly_nclass <- as.integer(y[, 'n_classes', drop = TRUE]) samp_min <- ceiling(max(1, (max(poly_percs) / min(poly_percs))) * poly_nclass) samp_area <- ceiling((as.numeric(y[, 'area_sqm', drop = TRUE]) / 1000000) * area_rate) # floor of 10 helps ensure that dirichlet allocations are reasonable in most model runs samp_n <- as.integer(max(samp_min, samp_area, floor_rate)) } ) %>% as.vector() ) %>% # cell numbers that intersect the polygon (these are subset randomly on each model run) mutate(intersecting_cells = split(., 1:nrow(.)) %>% map(function(g) { ## BOTTLENECK cells <- as.integer(unlist(raster::cellFromPolygon(na.omit(covariates), as(g, 'Spatial')))) if (length(cells) == 0) { NA } else { cells } }) %>% stats::setNames(., NULL) # can't cheat with as.vector here ) %>% filter(!(is.na(intersecting_cells))) IDstr_r <- if (nrow(indata2) > nrow(indata_prepped)) { removed <- setdiff(indata2[ , id_field, drop = TRUE], indata_prepped[ , id_field, drop = TRUE]) n_removed <- nrow(removed) IDstr <- toString(unlist(removed)) message(paste0(IDstr, ' excluded - no cell intersections.')) IDstr } else { NA } prep_report <- list('missing_data' = IDStr_m, 'under_100' = IDStr_s, 'no_cells' = IDstr_r) assign('prep_report', prep_report, envir = .GlobalEnv) return(indata_prepped) } t <- proc.time() polygons_prepped <- if (file.exists(file.path(polygon_dir, 'polygons_prepped.rds'))) { readRDS(file.path(polygon_dir, 'polygons_prepped.rds')) } else { dsmart_prep(indata = polygons, covariates = covariate_brick, id_field = 'NUID', area_rate = sites_per_km2, floor_rate = sites_per_km2) } t_prep <- proc.time() - t # backup: NB can't write to shp or gpkg bc of list-column saveRDS(polygons_prepped, file.path(polygon_dir, 'polygons_prepped.rds')) #polygons_prepped <- readRDS(file.path(polygon_dir, 'polygons_prepped.rds'))
Note that this function doesn't return anything to the workspace as the volume of data would max out the RAM on most machines. Outputs are all written to #disk.
dsmart_gen_reals <- function (covariates = NULL, indata = NULL, obsdat = NULL, id_field = NULL, reals = NULL, t_factor = 10000, cpus = 1, write_files = c('rds_only', 'all'), write_samples = FALSE, resume_from = NULL) { # output directories dir.create('dsmartOuts', showWarnings = F) dir.create('dsmartOuts/rasters', showWarnings = F) dir.create('dsmartOuts/models', showWarnings = F) strr <- file.path(getwd(), 'dsmartOuts', 'rasters') strm <- file.path(getwd(), 'dsmartOuts', 'models' ) p_crs <- st_crs(indata)$proj4string # coerce to sf indata <- if(is(indata, 'sf') == FALSE) { st_as_sf(indata) } else { indata } # set consistent factoring across model runs class_levels <- as.data.frame(indata) %>% select(matches('CLASS_')) %>% gather() %>% distinct(value) %>% na.omit() %>% unlist(., use.names = FALSE) %>% sort() %>% as.factor() pb <- txtProgressBar(min = 0, max = reals, style = 3) ## realisations start <- if (is.null(resume_from)) { 1 } else { resume_from } lapply(seq.int(start:reals), function(j) { beginCluster(cpus) sample_points <- split(indata, 1:nrow(indata)) %>% map(function(z) { poly_cells <- unlist(z[ , 'intersecting_cells', drop = TRUE], use.names = FALSE) poly_nsample <- unlist(z[ , 'n_samples', drop = TRUE], use.names = FALSE) poly_cellsamp <- if (length(poly_cells) <= poly_nsample) { poly_cells } else { sample(poly_cells, size = poly_nsample, replace = FALSE) } poly_percs <- as.vector(na.omit(unlist(z[, c(grep('PERC', names(z))), drop = TRUE]))) poly_dirprops <- as.vector(gtools::rdirichlet(1, poly_percs * t_factor)) poly_classes <- as.vector(na.omit(unlist(z[, c(grep('CLASS', names(z))), drop = TRUE]))) poly_alloc <- mapply(function(class, dpn) { rep(class, times = dpn) }, class = poly_classes, dpn = ceiling(poly_dirprops * length(poly_cellsamp)) ) %>% unlist() %>% # shuffle randomly and make sure n is what it should be (sometimes you get n + 1 above) sample(., size = length(poly_cellsamp), replace = FALSE) %>% as.vector() # tried using a matrix here - speed boost insignificant and code harder to read poly_spoints <- data.frame('NUID' = as.integer(z[, id_field, drop = TRUE]), 'PTID' = 1:length(poly_cellsamp), 'CLASS' = poly_alloc, 'CELL' = poly_cellsamp) }) # get all the sampling data for all the polygons for this realisation into one spdf all_samplepoints <- do.call('rbind', sample_points) # sample covariates cov_sample <- raster::extract(covariates, y = all_samplepoints$CELL) ## BOTTLENECK all_samplepoints <- cbind(all_samplepoints, cov_sample) # add known locations all_samplepoints <- if (!is.null(obsdat)) { od <- obsdat %>% mutate(NUID = as.integer(paste0(as.integer(as.factor(PROJECT_CODE)), POLY_NO)), PTID = as.numeric(paste0(99, SAMP_NO)), CELL = cellFromXY(covariates, as.matrix(.[, c('SAMP_X', 'SAMP_Y')]))) %>% select(NUID, PTID, CLASS, CELL) od_sample <- raster::extract(covariates, y = od$CELL) od_sample <- cbind(od, od_sample) # make sure these cells haven't been randomly sampled already # if so, drop the randomly generated row all_samplepoints <- all_samplepoints %>% filter(!(CELL %in% c(od$CELL))) %>% rbind(all_samplepoints, od_sample) } else { all_samplepoints } # force all outputs to be on the same scale eg. output map value 1 always equals factor level 1 all_samplepoints$CLASS <- as.factor(all_samplepoints$CLASS) levels(all_samplepoints$CLASS) <- class_levels # set up model input model_input <- all_samplepoints[complete.cases(all_samplepoints), ] %>% dplyr::select( -NUID, -PTID, -CELL) # generate decision tree res <- C5.0(x = model_input[, !(names(model_input) %in% 'CLASS')], y = model_input$CLASS) # make prediction map r1 <- clusterR(covariates, raster::predict, args = list(res), datatype = 'INT2S') # factorise r1 lookup <- data.frame("ID" = as.integer(as.factor(res$levels)), "CLASS" = as.factor(res$levels)) levels(r1) <- lookup if (write_files == 'rds_only') { saveRDS(res, file.path(strm, paste0('C5_model_', j, '.rds'))) suppressWarnings(saveRDS(readAll(r1), file.path(strr, paste0('map_', j, '.rds')))) } else { saveRDS(res, file.path(strm, paste0('C5_model_', j, '.rds'))) suppressWarnings(saveRDS(readAll(r1), file.path(strr, paste0('map_', j, '.rds')))) # write decision tree from this realisation to text out <- capture.output(summary(res)) f2 <- file.path(strm, paste0('C5_model_', j, '.txt')) cat(out, file = f2, sep = "\n", append = TRUE) # write probability map from this realisation to GeoTIFF (lookup values are embedded) nme <- file.path(strr, paste0('map_', j, '.tif')) writeRaster(r1, filename = nme, format = "GTiff", overwrite = T, datatype = "INT2S", NAflag = -9999) # may as well write lookup as embedded lookups don't work in QGIS if (j == 1) { # Excel is the default csv viewer for most ppl but it doesn't like csv files # where column 1 is called 'ID' names(lookup) <- c('VALUE', 'CLASS') write.table(lookup, file.path(strr, 'class_lookup.csv'), col.names = TRUE, row.names = FALSE, quote = FALSE, sep = ",") } } if (write_samples == TRUE) { dir.create('dsmartOuts/samples', showWarnings = F) strd <- file.path(getwd(), 'dsmartOuts', 'samples') # spatialise (sf point object) allsamp_sf <- all_samplepoints %>% mutate(geometry = st_sfc(lapply(CELL, function(x) { st_point(as.vector(xyFromCell(covariates, x))) })) ) %>% st_as_sf(., crs = st_crs(indata)$proj4string) if (write_files == 'rds_only') { saveRDS(allsamp_sf, file.path(strd, paste0('samples_', j, '.rds'))) } else { saveRDS(allsamp_sf, file.path(strd, paste0('samples_', j, '.rds'))) # NB Don't change this to GPKG format yet - disk write time is crazy slow # make a lookup table for all_samplepoints covariate column names, because they're about # to get severely abbreviated for writing to shp cov_LUT_nm <- file.path(strd, 'covariate_LUT.csv') cov_names <- names(all_samplepoints[5:ncol(all_samplepoints)]) cov_shpnames <- paste0('COV_', 1:length(cov_names)) if (!file.exists(cov_LUT_nm)) { cov_LUT <- data.frame("COV_NAMES" = cov_names, "SHPCOL_NAMES" = cov_shpnames) write.table(cov_LUT, file = cov_LUT_nm, sep = ', ', quote = FALSE, col.names = TRUE, row.names = FALSE) } names(allsamp_sf)[5:(ncol(allsamp_sf) - 1)] <- cov_shpnames sf_name <- paste0('samples_', j, '.shp') write_sf(allsamp_sf, file.path(strd, sf_name), driver = 'ESRI Shapefile', delete_layer = TRUE) } } endCluster() setTxtProgressBar(pb, j) } ) close(pb) message(paste0(Sys.time(), ': DSMART outputs can be located at ', file.path(getwd(), 'dsmartOuts'))) } t <- proc.time() dsmart_gen_reals(covariates = covariate_brick, indata = polygons_prepped, id_field = 'NUID', reals = desired_runs, t_factor = tfct, cpus = 6, write_files = 'all', write_samples = TRUE) t_gr <- proc.time() - t print(paste0('This run took ', round(t_gr[3] / 60 / 60 , 2), ' hours to complete.'))
Now that the run is complete, output files must be read in and collated.
save.image(file.path(getwd(), 'run_results.RData')) # may need to restart R here and reload the above image plus the packages - frees up space in tmp DSMARTouts <- as.list(list.files(paste0(getwd(), '/dsmartOuts/rasters'), pattern = '\\.tif$', full.names = T)) reals <- stack(DSMARTouts) # brick won't speed up the next bit, don't bother print(paste0('Number of prediction maps loaded = ', nlayers(reals))) # create lookup table from RAT LUT <- levels(reals[[1]])[[1]] LUT$category <- as.character(LUT$category) LUT <- LUT[-c(1) , ] names(LUT) <- c("ID", "CLASS")
This next function takes the r desired_runs
model realisations and stacks them, then tallies how many times each #soil was predicted on a given pixel. It then reorders those tallies so that the most-probable soil is in layer 1, #second most probable in layer 2, etc. It also outputs a stack of the probabilities associated with each of those #most-probable surfaces (i.e. a strong model has a lot of values close to 1 in layer 1). Optionally, it can output the #intermediate files that lead to these stacks. Outputs are written to file as multiband GeoTiffs, which can be large.
dsmart_tally_reals <- function(realstack = NULL, lookup = NULL, cpus = 1, keep_tallies = TRUE) { if (!dir.exists("dsmartOuts/summaries")) { dir.create("dsmartOuts/summaries", showWarnings = F) } strs <- file.path(getwd(), 'dsmartOuts', 'summaries') classes <- nrow(lookup) realn <- nlayers(realstack) ### cell by cell calc functions ### # class_count produces an integer vector counting the number of times a given soil was # predicted on this pixel # e.g. 0 5 8 0 == soil 1 was not predicted, soil 2 was predicted 5 times, soil 3 was predicted # 8 times, soil 4 was not predicted. class_count <- function(x) { if (is.na(sum(x))) { rep(NA, classes) } else { tabulate(x, nbins = classes) } } # class_prop takes the vector produced by class_count and normalises it against the # total number of model runs # e.g. for 20 runs, 0 5 8 0 becomes 0 0.25 0.4 0 class_prop <- function(x) {round((x / realn), 3)} # stack_cell_sort sorts the elements of class_order or probs_order from largest to smallest. # e.g. 0 0.25 0.4 0 becomes [1] 0.4 0.25 0 0 stack_cell_sort <- function(x) { if (is.na(sum(x))) { rep(NA, classes) } else { sort(x, decreasing = TRUE, na.last = TRUE) } } # stack_cell_order returns a vector of the position of the elements of probs_order # from largest to smallest. # e.g. 0 0.25 0.4 0 becomes [1] 3 2 1 4 # these values correspond to the lookup ID column, so they can be linked to soil class name stack_cell_order <- function(x) { if (is.na(sum(x))) { rep(NA, classes) } else { order(x, decreasing = TRUE, na.last = TRUE) } } # note that its easiest to think about the results of stack_cell_order as the names # of stack_cell_sort e.g. [1] 3 2 1 4 or [1] 3 2 1 4 # 8 5 0 0 0.4 0.25 0 0 ### beginCluster(cpus) assign('classes', classes, envir = .GlobalEnv) assign('realn', realn, envir = .GlobalEnv) pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3) # 1. get a stack of predicted class counts (NA cells accounted for) counts <- clusterR(x = realstack, fun = calc, args = list(fun = class_count), export = 'classes', datatype = 'INT2S') if (keep_tallies == TRUE) { writeRaster(counts, filename = file.path(strs, 'class_counts.tif'), datatype = 'INT2S', format = 'GTiff', NAflag = -9999, overwrite = TRUE) assign('counts', counts, envir = .GlobalEnv) } setTxtProgressBar(pb, 40) # 2. express that stack as a probability probs <- clusterR(x = counts, fun = calc, args = list(fun = class_prop), export = 'realn', datatype = 'FLT4S') if (keep_tallies == TRUE) { writeRaster(probs, filename = file.path(strs, 'class_props.tif'), datatype = 'FLT4S', format = 'GTiff', NAflag = -9999, overwrite = TRUE) assign('probs', probs, envir = .GlobalEnv) } setTxtProgressBar(pb, 60) # 3. sort probs by most to least probable sorted <- clusterR(x = probs, fun = calc, args = list(fun = stack_cell_sort), filename = file.path(strs, 'class_probabilities_ranked.tif'), datatype = 'FLT4S', format = 'GTiff', NAflag = -9999, overwrite = TRUE) assign('sorted', sorted, envir = .GlobalEnv) setTxtProgressBar(pb, 80) # 4. order probs by most to least probable ordered <- clusterR(x = na.omit(probs), fun = calc, args = list(fun = stack_cell_order), datatype = 'INT2S') # give the raster names for these values for (i in 1:nlayers(ordered)) { levels(ordered[[i]]) <- lookup } writeRaster(ordered, filename = file.path(strs, 'class_predictions_ranked.tif'), datatype = 'INT2S', format = 'GTiff', NAflag = -9999, overwrite = TRUE) assign('ordered', ordered, envir = .GlobalEnv) if (keep_tallies == FALSE) { rm(counts) rm(probs) } setTxtProgressBar(pb, 100) close(pb) endCluster() } t <- proc.time() dsmart_tally_reals(realstack = reals, lookup = LUT, cpus = 6, keep_tallies = TRUE) t_tally <- proc.time() - t
The outputs produced above can be split into individual files. The following function creates n
most-probable soil #class maps,and (optionally) a probability surface for each.
dsmart_nmp <- function(class_predictions = NULL, class_probabilities = NULL, lookup = NULL, nmaps = 1) { if (!dir.exists('dsmartOuts/summaries/most_probable_maps/')) { dir.create('dsmartOuts/summaries/most_probable_maps/', showWarnings = F) } mp_dir <- file.path(getwd(), 'dsmartOuts', 'summaries', 'most_probable_maps') # only unstack the number of maps requested if (!is.null(class_predictions)) { suppressWarnings(map_list <- unstack(class_predictions[[1:nmaps]])) } else { stop('class_predictions raster stack has not been specified') } lapply(map_list, function(i) ratify(i)) outs_list <- mapply(FUN = function(x, i) { writeRaster(x, filename = file.path(mp_dir, paste0('mostlikely_', i ,'.tif')), format = 'GTiff', NAflag = -9999, datatype = 'INT2S', overwrite = TRUE) }, x = map_list, i = seq_along(1:nmaps) ) assign('most_prob_maps', outs_list, envir = .GlobalEnv) message(paste0(nmaps, ' most-likely soils maps produced.')) ### optional probability surfaces if(!is.null(class_probabilities)) { suppressWarnings(probmap_list <- unstack(class_probabilities[[1:nmaps]])) } probsouts_list <- mapply(FUN = function(x, i) { writeRaster(x, filename = file.path(mp_dir, paste0('mostlikely_prob_', i, '.tif')), format = 'GTiff', datatype = 'FLT4S', NAflag = -9999, overwrite = TRUE) }, x = probmap_list, i = seq_along(1:nmaps) ) assign('most_prob_prob_maps', probsouts_list, envir = .GlobalEnv) message(paste0(nmaps, ' probability maps produced.')) } t <- proc.time() dsmart_nmp(class_predictions = ordered, class_probabilities = sorted, lookup = LUT, nmaps = 3) t_nmp <- proc.time() - t
Probability surfaces for each soil class can also be produced. This function requires a 'probs' output from #tally_reals (using option keep_tallies = TRUE
).
dsmart_classmaps <- function(probstack = NULL, lookup = NULL, cpus = 1) { if (!dir.exists('dsmartOuts/summaries/class_maps/')) { dir.create('dsmartOuts/summaries/class_maps/', showWarnings = F) } class_dir <- file.path(getwd(), 'dsmartOuts', 'summaries', 'class_maps') beginCluster(cpus) suppressWarnings(probs_list <- unstack(probs)) lapply(seq_along(probs_list), function(i) { names(probs_list[[i]]) <- as.character(lookup[i, 2]) }) #for (i in 1:length(probs_list)) { # names(probs_list[[i]]) <- as.character(lookup[i, 2]) #} prob_tifs <- lapply(probs_list, function(x) { writeRaster(x, filename = file.path(class_dir, paste0(names(x), "_probability.tif")), format = "GTiff", NAflag = -9999, datatype = 'FLT4S', overwrite = TRUE) }) endCluster() } t <- proc.time() dsmart_classmaps(probstack = probs, lookup = LUT, cpus = 6) t_cm <- proc.time() - t
To help evaluate DSMART's performance, some additional rasters can be generated. The first is called the 'confusion #index', which is a measure of the probability gap between the first and second most probable soils. Ideally, the gap #is large, implying that the most-probable soil is far more probable than any other.
dsmart_ci <- function(class_probabilities = NULL, cpus = 1) { conf_index <- function(x) {1 - (x[[1]] - x[[2]])} if (!dir.exists("dsmartOuts/evaluation")) { dir.create("dsmartOuts/evaluation", showWarnings = F) } strs <- file.path(getwd(),'dsmartOuts', 'evaluation') beginCluster(cpus) confus_ind <- clusterR(na.omit(class_probabilities), fun = conf_index, filename = file.path(strs, 'confusion_index.tif'), datatype = 'FLT4S', NAflag = -9999, overwrite = TRUE) assign('confus_ind', confus_ind, envir = .GlobalEnv) endCluster() } t <- proc.time() dsmart_ci(class_probabilities = sorted, cpus = 6) t_ci <- proc.time() - t
In practice, for many pixels the most probable soil is often a close race between two or even more possible classes. #This can happen when a number of soil classes are very similar, or where DSMART is having trouble making effective #predictions.
A second measure of DSMART's certainty is the number of different soils predicted on a given pixel - ideally, this #number is low. DSMART outputs also have a certain amount of 'noise' - soils predicted only a few times on a given #pixel during the n
model runs, which can safely be disregarded. This means we can set a cutoff level below which #counts
is ignored.
dsmart_npred <- function(class_counts = NULL, cpus = 1, noise_cutoff = 0.1, nreals = NULL) { if (is.null(nreals)) { stop('total number of DSMART model runs must be supplied.') } if (!dir.exists('dsmartOuts/evaluation')) { dir.create('dsmartOuts/evaluation', showWarnings = F) } strs <- file.path(getwd(), 'dsmartOuts', 'evaluation') # per-cell functions #n_classes_predicted <- function(x) { # if (is.na(sum(x))) { # NA # } else { # length(x[x > 0]) # } #} # n_classes_predicted <- function(x) { ifelse(is.na(sum(x)), NA, length(x[x > 0])) } # as above, with 'noise' removed - only classes predicted on >x% of model runs #n_classes_predicted_x <- function(x) { # if (is.na(sum(x))) { # NA # } else { # length(x[x > (ceiling(nreals * noise_cutoff))]) # } #} n_classes_predicted_x <- function(x) { ifelse(is.na(sum(x)), NA, length(x[x > (ceiling(nreals * noise_cutoff))])) } beginCluster(cpus) assign('noise_cutoff', noise_cutoff, envir = .GlobalEnv) assign('nreals', nreals, envir = .GlobalEnv) npred <- clusterR(class_counts, fun = calc, args = list(fun = n_classes_predicted), filename = file.path(strs, 'n_classes_predicted.tif'), NAflag = -9999, datatype = 'INT2S', overwrite = TRUE) assign('n_classes_predicted', npred, envir = .GlobalEnv) npredx <- clusterR(class_counts, fun = calc, args = list(fun = n_classes_predicted_x), export = c('nreals', 'noise_cutoff'), filename = file.path(strs, paste0('n_classes_predicted_over_', (nreals * noise_cutoff), '.tif')), datatype = 'INT2S', NAflag = -9999, overwrite = TRUE) assign('n_classes_predicted_x', npredx, envir = .GlobalEnv) endCluster() } t <- proc.time() dsmart_npred(class_counts = counts, cpus = 6, noise_cutoff = 0.1, nreals = 100) t_ncl <- proc.time() - t
The DSMART run is now complete.
save.image(file.path(getwd(), 'run_results.RData'))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.