#########################################################################################################################
################################### FUNCTIONS FOR RUNNING SDM ANALYSIS ---- ############################################
#########################################################################################################################
#' @title Run SDM analyses for 'species with data' format.
#'
#' @description This function takes a data frame of all taxa records,
#' and runs a specialised maxent analysis for each taxon.
#' It uses the dismo package and rmaxent (https://github.com/johnbaums/rmaxent)
#' @param taxa_list Character string - the taxa to run maxent models for
#' @param taxa_level Character string - the taxonomic level to run maxent models for
#' @param sdm_df SpatialPointsDataFrame. Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param sdm_predictors Character string - Vector of enviro conditions that you want to include
#' @param maxent_dir Character string - The file path used for saving the maxent output
#' @param bs_dir Character string - The file path used for saving the backwards selection maxent output
#' @param backwards_sel Logical - Run backwards selection using the maxent models (T/F)?
#' @param template_raster RasterLayer - Empty raster with analysis extent (global), res (1km) and projection (Mollweide, EPSG 54009)
#' @param cor_thr Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr Numeric - The min allowable percent variable contribution
#' @param k_thr Numeric - The min number of variables to be kept in the model
#' @param min_n Numeric - The min number of records for running maxent
#' @param max_bg_size Numeric - The max number of background points to keep
#' @param background_buffer_width Numeric - The max distance (km) from occ points within which background points are selected
#' @param feat_save Logical - Save features of the occ and bg data (T/F)?
#' @param features Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates Numeric - The number of replicates to use
#' @param responsecurves Logical - Save response curves of the maxent models (T/F)?
#' @param poly_path Character string - file path to feature polygon layer
#' @param epsg Numeric - EPSG code of coord ref system to be translated into WKT format
#' @export run_sdm_analysis_no_crop
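#' @examples
#' \dontrun{
#' ## A minimal usage sketch, not run: the paths, predictor names and the objects
#' ## 'sdm_points' (SpatialPointsDataFrame) and 'template_1km' (RasterLayer)
#' ## are hypothetical placeholders.
#' run_sdm_analysis_no_crop(taxa_list       = c('Acacia dealbata'),
#'                          taxa_level      = 'searchTaxon',
#'                          sdm_df          = sdm_points,
#'                          sdm_predictors  = c('Annual_mean_temp', 'Annual_precip'),
#'                          maxent_dir      = 'output/maxent/full_models',
#'                          bs_dir          = 'output/maxent/back_sel_models',
#'                          backwards_sel   = TRUE,
#'                          template_raster = template_1km,
#'                          cor_thr         = 0.8,
#'                          pct_thr         = 5,
#'                          k_thr           = 4,
#'                          min_n           = 20,
#'                          max_bg_size     = 70000,
#'                          background_buffer_width = 200000,
#'                          features        = 'lpq',
#'                          feat_save       = TRUE,
#'                          replicates      = 5,
#'                          responsecurves  = TRUE,
#'                          poly_path       = 'data/world_poly.gpkg',
#'                          epsg            = 4326)
#' }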
run_sdm_analysis_no_crop = function(taxa_list,
taxa_level,
sdm_df,
sdm_predictors,
maxent_dir,
bs_dir,
backwards_sel,
template_raster,
cor_thr,
pct_thr,
k_thr,
min_n,
max_bg_size,
background_buffer_width,
features,
feat_save,
replicates,
responsecurves,
poly_path,
epsg) {
## Convert to SF object for selection - inefficient
message('Preparing spatial data for SDMs')
poly <- st_read(poly_path) %>%
st_transform(., st_crs(epsg))
## Loop over all the taxa
## taxa <- sort(taxa_list)[2]
lapply(taxa_list, function(taxa){
## Skip the taxa if the directory already exists, before the loop
# outdir <- maxent_dir
## Could also check the folder size, so folders with no contents aren't skipped eg
## && sum(file.info(dir_name)$size) < 1000 (EG 1MB)
dir_name = file.path(maxent_dir, gsub(' ', '_', taxa))
if(!dir.exists(dir_name)) {
message('Maxent not yet run for ', taxa, ' - running')
## Create the directory for the taxa in progress,
## so other parallel runs don't run the same taxa
dir.create(dir_name)
file.create(file.path(dir_name, "in_progress.txt"))
## Print the taxa being processed to screen
if(taxa %in% sdm_df[[taxa_level]]) {
message('Doing ', taxa)
## Subset the records to only the taxa being processed - in the searched taxa or returned
occurrence <- sdm_df %>% .[.[[taxa_level]] %in% taxa, ]
message('Using ', nrow(occurrence), ' occ records from ', unique(occurrence$SOURCE))
## Now get the background points. These can come from any taxa, other than the modelled taxa.
## However, they should be limited to the same SOURCE as the occ data
background <- sdm_df %>% .[!.[[taxa_level]] %in% taxa, ]
message('Using ', nrow(background), ' background records from ', unique(background$SOURCE))
gc()
## Finally fit the models using FIT_MAXENT_TARG_BG. Also use tryCatch to skip any exceptions
tryCatch(
fit_maxent_targ_bg_back_sel_no_crop(occ = occurrence,
bg = background,
sdm_predictors = sdm_predictors,
taxa = taxa,
outdir = maxent_dir,
bsdir = bs_dir,
backwards_sel = backwards_sel,
cor_thr = cor_thr,
pct_thr = pct_thr,
k_thr = k_thr,
template_raster = template_raster,
min_n = min_n,
max_bg_size = max_bg_size,
background_buffer_width = background_buffer_width,
features = features,
feat_save = feat_save,
replicates = replicates,
responsecurves = responsecurves,
poly = poly),
## Save error message
error = function(cond) {
## How to put the message into the file?
file.create(file.path(dir_name, "sdm_failed.txt"))
message(taxa, ' failed')
cat(cond$message, file = file.path(dir_name, "sdm_failed.txt"))
warning(taxa, ': ', cond$message)
})
} else {
message(taxa, ' skipped - no data.') ## This condition ignores taxa which have no data...
file.create(file.path(dir_name, "completed.txt"))
}
## now add a file to the dir to denote that it has completed
file.create(file.path(dir_name, "completed.txt"))
} else {
message(taxa, ' sdm already exists, skipped') ## Skip taxa whose output directory already exists
}
})
}
#' @title Maxent Backwards selection for 'species with data' format, without biogeographic cropping.
#' @description
#' This function takes a data frame of all taxa records,
#' and runs a specialised maxent analysis for each taxa.
#' It uses the rmaxent package https://github.com/johnbaums/rmaxent
#' It assumes that the input df is that returned by the prepare_sdm_table function
#'
#' @param occ SpatialPointsDataFrame - Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param bg SpatialPointsDataFrame - Spdf of candidate background points
#' @param sdm_predictors Character string - Vector of enviro conditions that you want to include
#' @param outdir Character string - The file path used for saving the maxent output
#' @param bsdir Character string - The file path used for saving the backwards selection maxent output
#' @param backwards_sel Logical - Run backwards selection using the maxent models (T/F)?
#' @param template_raster RasterLayer - Empty raster with analysis extent (global), res (1km) and projection (Mollweide, EPSG 54009)
#' @param cor_thr Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr Numeric - The min allowable percent variable contribution
#' @param k_thr Numeric - The min number of variables to be kept in the model
#' @param min_n Numeric - The min number of records for running maxent
#' @param max_bg_size Numeric - The max number of background points to keep
#' @param feat_save Logical - Save the occ and bg data as geopackage features (T/F)?
#' @param features Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates Numeric - The number of replicates to use
#' @param responsecurves Logical - Save response curves of the maxent models (T/F)?
#' @param poly sf object - polygon layer used to plot the occurrence and background points
#' @param taxa Character string - name of the taxon being modelled
#' @param background_buffer_width Numeric - The max distance (km) from occ points within which background points are selected
#' @export fit_maxent_targ_bg_back_sel_no_crop
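#' @examples
#' \dontrun{
#' ## Normally called one taxon at a time by run_sdm_analysis_no_crop().
#' ## Hedged sketch: 'occ_points', 'bg_points', 'template_1km' and 'world_poly'
#' ## are hypothetical objects of the types described above.
#' fit_maxent_targ_bg_back_sel_no_crop(occ  = occ_points,
#'                                     bg   = bg_points,
#'                                     sdm_predictors = c('Annual_mean_temp', 'Annual_precip'),
#'                                     taxa = 'Acacia dealbata',
#'                                     outdir = 'output/maxent/full_models',
#'                                     bsdir  = 'output/maxent/back_sel_models',
#'                                     backwards_sel = TRUE,
#'                                     cor_thr = 0.8, pct_thr = 5, k_thr = 4,
#'                                     template_raster = template_1km,
#'                                     min_n = 20, max_bg_size = 70000,
#'                                     background_buffer_width = 200000,
#'                                     features = 'lpq', feat_save = TRUE,
#'                                     replicates = 5, responsecurves = TRUE,
#'                                     poly = world_poly)
#' }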
fit_maxent_targ_bg_back_sel_no_crop <- function(occ,
bg,
sdm_predictors,
taxa,
outdir,
bsdir,
cor_thr,
pct_thr,
k_thr,
backwards_sel,
template_raster,
min_n,
max_bg_size,
background_buffer_width,
shapefiles,
features,
feat_save,
replicates,
responsecurves,
poly) {
## First, stop if the backwards-selection output directory for this taxon does not exist:
outdir_sp <- file.path(outdir, gsub(' ', '_', taxa))
bsdir_sp <- file.path(bsdir, gsub(' ', '_', taxa))
if(!file.exists(bsdir_sp)) stop('bsdir_sp does not exist :(', call. = FALSE)
## If the file doesn't exist, split out the features
if(!file.exists(outdir_sp)) dir.create(outdir_sp)
features <- unlist(strsplit(features, ''))
## Make sure user features are allowed: don't run the model if the
## features have been incorrectly specified in the main argument
## l: linear
## p: product
## q: quadratic
## h: hinge ## disabled for this analysis
## t: threshold ## disabled for this analysis
if(length(setdiff(features, c('l', 'p', 'q', 'h', 't'))) > 0)
stop("features must be a vector of one or more of 'l', 'p', 'q', 'h', and 't'.")
## Create a buffer of background_buffer_width around the occurrence points
message('get distinct occurrence records')
buffer <- aggregate(gBuffer(occ, width = background_buffer_width, byid = TRUE))
## Get unique cell numbers for taxa occurrences
template_raster_spat <- terra::rast(template_raster)
occ_sf <- occ %>% st_as_sf()
raster_extent <- raster::extent(template_raster)
cell_size <- xres(template_raster)
# cells_coords <- raster::xyFromCell(template_raster, occ_coord)
# cells_distinct <- cells_coords %>% as.data.frame %>% distinct()
# distinct_cells <- dplyr::distinct(as.data.frame(cells)) %>% na.omit()
# xcoord_snapped_to_cell = xmin + floor ((Xcoord - xmin) / cellsize) * cellsize + cellsize/2
# ycoord_snapped_to_cell = ymin + floor ((Ycoord - ymin) / cellsize) * cellsize + cellsize/2
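## Worked illustration of the snapping formula above (hypothetical numbers):
## with xmin = 0 and cellsize = 1000 m, an X coordinate of 2345 snaps to
## 0 + floor((2345 - 0) / 1000) * 1000 + 1000 / 2 = 2500, i.e. the cell centre.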
occX <- raster_extent[1] + floor((occ_sf$X - raster_extent[1]) / cell_size ) * cell_size + cell_size /2
occY <- raster_extent[2] + floor((occ_sf$Y - raster_extent[2]) / cell_size ) * cell_size + cell_size /2
occ_sf$X <- occX
occ_sf$Y <- occY
occ_cells <- occ_sf %>% dplyr::distinct(., X, Y, .keep_all = TRUE)
message(nrow(occ_cells), ' occurrence records (unique cells).')
## Skip taxa that have fewer than the minimum number of records (min_n, e.g. 20)
if(nrow(occ_cells) > min_n) {
message('get distinct background records')
bg_buffer <- bg[buffer, ] %>% st_as_sf()
if(nrow(bg_buffer) > 100000) {
sample_row <- nrow(bg_buffer)/100000
bg_buffer <- bg_buffer[seq(1, nrow(bg_buffer), sample_row), ]
}
bgX <- raster_extent[1] + floor((bg_buffer$X - raster_extent[1]) / cell_size ) * cell_size + cell_size /2
bgY <- raster_extent[2] + floor((bg_buffer$Y - raster_extent[2]) / cell_size ) * cell_size + cell_size /2
bg_buffer$X <- bgX
bg_buffer$Y <- bgY
bg_cells <- bg_buffer %>% dplyr::distinct(X, Y, .keep_all = TRUE)
## The background records have already been clipped to the buffer polygon above
message(taxa, ' creating background cells')
## Don't use which to get unique cells, that has already been done
message(taxa, ' Do not intersect background cells with Koppen zones')
message('country poly is a ', class(poly))
## Now save an image of the background points
## This is useful to quality control the models
save_name = gsub(' ', '_', taxa)
## Then save the occurrence points
png(sprintf('%s/%s/%s_%s.png', outdir, save_name, save_name, "buffer_occ"),
16, 10, units = 'in', res = 300)
raster::plot(st_geometry(poly),
main = paste0('Occurrence SDM records for ', taxa))
raster::plot(buffer, add = TRUE, col = "red")
raster::plot(st_geometry(occ_cells), add = TRUE, col = "blue")
dev.off()
gc()
## Reduce background sample, if it's larger than max_bg_size
if (nrow(bg_cells) > max_bg_size) {
message(nrow(bg_cells), ' target taxa background records for ', taxa,
', reduced to random ', max_bg_size, ' using random points from :: ', unique(bg_cells$SOURCE))
bg.samp <- bg_cells[sample(nrow(bg_cells), max_bg_size), ]
} else {
## If the bg points are fewer than max_bg_size, just keep all the points
message(nrow(bg_cells), ' target taxa background records for ', taxa,
', using all points from :: ', unique(bg_cells$SOURCE))
bg.samp <- bg_cells
}
## Now save the buffer, the occ and bg points as shapefiles
if(feat_save) {
suppressWarnings({
message(taxa, ' writing occ and bg geopackages')
st_write(buffer %>% st_as_sf(),
dsn = paste0(outdir_sp, '/', save_name, '_maxent_points.gpkg'),
layer = paste0(save_name, '_buffer'),
quiet = TRUE,
append = FALSE)
st_write(bg.samp %>% st_as_sf(),
dsn = paste0(outdir_sp, '/', save_name, '_maxent_points.gpkg'),
layer = paste0(save_name, '_samples'),
quiet = TRUE,
append = FALSE)
st_write(occ_cells,
dsn = paste0(outdir_sp, '/', save_name, '_maxent_points.gpkg'),
layer = paste0(save_name, '_occurrence'),
quiet = TRUE,
append = FALSE)
})
}
## Also save the background and occurrence points as .rds files.
saveRDS(bg.samp, file.path(outdir_sp, paste0(save_name, '_bg.rds')))
saveRDS(occ_cells, file.path(outdir_sp, paste0(save_name, '_occ.rds')))
## SWD = 'species with data'. Now sample the environmental
## variables used in the model at all the occ and bg points
swd_occ <- occ_cells[, sdm_predictors]
swd_occ_sp <- as_Spatial(swd_occ)
saveRDS(swd_occ, file.path(outdir_sp, paste0(save_name,'_occ_swd.rds')))
swd_bg <- bg.samp[, sdm_predictors]
saveRDS(swd_bg, file.path(outdir_sp, paste0(save_name, '_bg_swd.rds')))
gc()
## Now combine the occurrence and background data
swd <- rbind(swd_occ, swd_bg) ## row-bind the presence and background predictor tables
saveRDS(swd, file.path(outdir_sp, 'swd.rds'))
pa <- rep(1:0, c(nrow(swd_occ), nrow(swd_bg)))
## Now, set the features to be used by maxent ::
## Linear, product and quadratic
off <- setdiff(c('l', 'p', 'q', 't', 'h'), features)
## This sets threshold and hinge features to "off"
if(length(off) > 0) {
off <- c(l = 'linear=false', p = 'product=false', q = 'quadratic=false',
t = 'threshold=false', h = 'hinge=false')[off]
}
off <- unname(off)
gc()
if (backwards_sel) {
## Coerce the "taxa with data" (SWD) files to regular data.frames
## This is needed to use the simplify function
swd_occ_df <- as.data.frame(swd_occ_sp)
swd_occ_df$lon <- NULL
swd_occ_df$lat <- NULL
swd_bg_df <- as.data.frame(swd_bg)
swd_bg_df$lon <- NULL
swd_bg_df$lat <- NULL
## Need to create a taxa column here.
swd_occ_df$searchTaxon <- taxa
swd_bg_df$searchTaxon <- taxa
## Make sure the colnames match
common_cols <- intersect(names(swd_occ_df), names(swd_bg_df))
swd_occ_df <- dplyr::select(swd_occ_df, all_of(common_cols))
swd_bg_df <- dplyr::select(swd_bg_df, all_of(common_cols))
## Run simplify rmaxent::simplify
# Given a candidate set of predictor variables, this function identifies
# a subset that meets specified multicollinearity criteria. Subsequently,
# backward stepwise variable selection (VIF) is used to iteratively drop
# the variable that contributes least to the model, until the contribution
# of each variable meets a specified minimum, or until a predetermined
# minimum number of predictors remains. It returns a model object for the
# full model, rather than a list of models as does the previous function
## Using a modified version of rmaxent::simplify, so that the name of the
## maxent model object "maxent_fitted.rds" is the same in both models.
## This is needed to run the mapping step over either the full or BS folder
m <- local_simplify(
swd_occ_df,
swd_bg_df,
path = bsdir,
taxa_column = "searchTaxon",
replicates = replicates, ## 5 as above
response_curves = TRUE,
logistic_format = TRUE,
cor_thr = cor_thr,
pct_thr = pct_thr,
k_thr = k_thr,
features = features, ## LPQ as above
quiet = FALSE)
## Read the model in, because it's tricky to index
bs.model <- readRDS(sprintf('%s/%s/full/maxent_fitted.rds', bsdir, save_name))
identical(length(bs.model@presence$Annual_mean_temp), nrow(occ))
## Save the chart correlation file too for the training data set
png(sprintf('%s/%s/full/%s_%s.png', bsdir,
save_name, save_name, "bs_predictor_correlation"),
3236, 2000, units = 'px', res = 300)
## set margins
par(mar = c(3, 3, 5, 3),
oma = c(1.5, 1.5, 1.5, 1.5))
## Add detail to the response plot
chart.Correlation(bs.model@presence,
histogram = TRUE, pch = 19,
title = paste0('Reduced variable correlations for ', save_name))
dev.off()
gc()
} else {
message("Don't run backwards selection")
}
} else {
message('Fewer occurrence records than the minimum required (min_n) for taxa ',
taxa, '. Model not fitted for this taxa.')
}
}
#' @title Run SDM analyses for 'species with data' format, with bio-geographic cropping.
#'
#' @description This function takes a data frame of all taxa records,
#' and runs a specialised maxent analysis for each taxon.
#' It uses the dismo package and rmaxent (https://github.com/johnbaums/rmaxent)
#' @param taxa_list Character string - the taxa to run maxent models for
#' @param taxa_level Character string - the taxonomic level to run maxent models for
#' @param sdm_df SpatialPointsDataFrame. Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param sdm_predictors Character string - Vector of enviro conditions that you want to include
#' @param maxent_dir Character string - The file path used for saving the maxent output
#' @param bs_dir Character string - The file path used for saving the backwards selection maxent output
#' @param backwards_sel Logical - Run backwards selection using the maxent models (T/F)?
#' @param template_raster RasterLayer - Empty raster with analysis extent (global), res (1km) and projection (Mollweide, EPSG 54009)
#' @param cor_thr Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr Numeric - The min allowable percent variable contribution
#' @param k_thr Numeric - The min number of variables to be kept in the model
#' @param min_n Numeric - The min number of records for running maxent
#' @param max_bg_size Numeric - The max number of background points to keep
#' @param background_buffer_width Numeric - The max distance (km) from occ points within which background points are selected
#' @param shapefiles Logical - Save shapefiles of the occ and bg data (T/F)?
#' @param features Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates Numeric - The number of replicates to use
#' @param responsecurves Logical - Save response curves of the maxent models (T/F)?
#' @param crop_Koppen Logical - Should we use a koppen zone raster to select background points?
#' @param country_shp .Rds object - SpatialPolygonsDataFrame of Australia for mapping maxent points
#' @param Koppen_raster RasterLayer of global koppen zones, in Mollweide54009 projection
#' @param Koppen_zones Dataframe of global koppen zones, with columns : GRIDCODE, Koppen
#' @export
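#' @examples
#' \dontrun{
#' ## A minimal usage sketch, not run: 'sdm_points', 'template_1km', 'koppen_1km',
#' ## 'koppen_lookup' and 'aus_shp' are hypothetical placeholder objects.
#' run_sdm_analysis_crop(taxa_list       = c('Acacia dealbata'),
#'                       taxa_level      = 'searchTaxon',
#'                       sdm_df          = sdm_points,
#'                       sdm_predictors  = c('Annual_mean_temp', 'Annual_precip'),
#'                       maxent_dir      = 'output/maxent/full_models',
#'                       bs_dir          = 'output/maxent/back_sel_models',
#'                       backwards_sel   = TRUE,
#'                       template_raster = template_1km,
#'                       cor_thr = 0.8, pct_thr = 5, k_thr = 4,
#'                       min_n = 20, max_bg_size = 70000,
#'                       background_buffer_width = 200000,
#'                       shapefiles      = TRUE,
#'                       features        = 'lpq',
#'                       replicates      = 5,
#'                       responsecurves  = TRUE,
#'                       crop_Koppen     = TRUE,
#'                       Koppen_raster   = koppen_1km,
#'                       Koppen_zones    = koppen_lookup,
#'                       country_shp     = aus_shp)
#' }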
run_sdm_analysis_crop = function(taxa_list,
taxa_level,
sdm_df,
sdm_predictors,
maxent_dir,
bs_dir,
backwards_sel,
template_raster,
cor_thr,
pct_thr,
k_thr,
min_n,
max_bg_size,
background_buffer_width,
shapefiles,
features,
replicates,
responsecurves,
crop_Koppen,
Koppen_raster,
Koppen_zones,
country_shp) {
## Convert to SF object for selection - inefficient
message('Preparing spatial data for SDMs')
## Loop over all the taxa
## taxa <- taxa_list[1]
lapply(taxa_list, function(taxa){
## Skip the taxa if the directory already exists, before the loop
# outdir <- maxent_dir
## Could also check the folder size, so folders with no contents aren't skipped eg
## && sum(file.info(dir_name)$size) < 1000 (EG 1MB)
dir_name = file.path(maxent_dir, gsub(' ', '_', taxa))
if(!dir.exists(dir_name)) {
message('Maxent not yet run for ', taxa, ' - running')
## Create the directory for the taxa in progress,
## so other parallel runs don't run the same taxa
dir.create(dir_name)
file.create(file.path(dir_name, "in_progress.txt"))
## Print the taxa being processed to screen
if(taxa %in% sdm_df[[taxa_level]]) {
message('Doing ', taxa)
## Subset the records to only the taxa being processed - in the searched taxa or returned
occurrence <- sdm_df %>% .[.[[taxa_level]] %in% taxa, ]
message('Using ', nrow(occurrence), ' occ records from ', unique(occurrence$SOURCE))
## Now get the background points. These can come from any taxa, other than the modelled taxa.
## However, they should be limited to the same SOURCE as the occ data
background <- sdm_df %>% .[!.[[taxa_level]] %in% taxa, ]
message('Using ', nrow(background), ' background records from ', unique(background$SOURCE))
if(!crop_Koppen) {
Koppen_raster = FALSE
Koppen_zones = FALSE
}
## Finally fit the models using FIT_MAXENT_TARG_BG. Also use tryCatch to skip any exceptions
tryCatch(
fit_maxent_targ_bg_back_sel_crop(occ = occurrence,
bg = background,
sdm_predictors = sdm_predictors,
taxa = taxa,
outdir = maxent_dir,
bsdir = bs_dir,
backwards_sel = backwards_sel,
cor_thr = cor_thr,
pct_thr = pct_thr,
k_thr = k_thr,
crop_Koppen = crop_Koppen,
template_raster = template_raster,
min_n = min_n,
max_bg_size = max_bg_size,
Koppen_raster = Koppen_raster,
Koppen_zones = Koppen_zones,
background_buffer_width = background_buffer_width,
shapefiles = shapefiles,
features = features,
replicates = replicates,
responsecurves = responsecurves,
country_shp = country_shp),
## Save error message
error = function(cond) {
## How to put the message into the file?
file.create(file.path(dir_name, "sdm_failed.txt"))
message(taxa, ' failed')
cat(cond$message, file = file.path(dir_name, "sdm_failed.txt"))
warning(taxa, ': ', cond$message)
})
} else {
message(taxa, ' skipped - no data.') ## This condition ignores taxa which have no data...
file.create(file.path(dir_name, "completed.txt"))
}
## now add a file to the dir to denote that it has completed
file.create(file.path(dir_name, "completed.txt"))
} else {
message(taxa, ' sdm already exists, skipped') ## Skip taxa whose output directory already exists
}
})
}
#' @title Maxent Backwards selection for 'species with data' format, with cropping.
#' @description
#' This function takes a data frame of all taxa records,
#' and runs a specialised maxent analysis for each taxa.
#' It uses the rmaxent package https://github.com/johnbaums/rmaxent
#' It assumes that the input df is that returned by the prepare_sdm_table function
#'
#' @param occ SpatialPointsDataFrame - Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param bg SpatialPointsDataFrame - Spdf of candidate background points
#' @param sdm_predictors Character string - Vector of enviro conditions that you want to include
#' @param outdir Character string - The file path used for saving the maxent output
#' @param bsdir Character string - The file path used for saving the backwards selection maxent output
#' @param backwards_sel Logical - Run backwards selection using the maxent models (T/F)?
#' @param template_raster RasterLayer - Empty raster with analysis extent (global), res (1km) and projection (Mollweide, EPSG 54009)
#' @param cor_thr Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr Numeric - The min allowable percent variable contribution
#' @param k_thr Numeric - The min number of variables to be kept in the model
#' @param min_n Numeric - The min number of records for running maxent
#' @param max_bg_size Numeric - The max number of background points to keep
#' @param crop_Koppen Logical - Should we use a koppen zone raster to select background points?
#' @param shapefiles Logical - Save shapefiles of the occ and bg data (T/F)?
#' @param features Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates Numeric - The number of replicates to use
#' @param responsecurves Logical - Save response curves of the maxent models (T/F)?
#' @param country_shp .Rds object - SpatialPolygonsDataFrame of Australia for mapping maxent points
#' @param Koppen_raster RasterLayer of global koppen zones, in Mollweide54009 projection
#' @param Koppen_zones Dataframe of global koppen zones, with columns : GRIDCODE, Koppen
#' @export fit_maxent_targ_bg_back_sel_crop
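#' @examples
#' \dontrun{
#' ## Normally called one taxon at a time by run_sdm_analysis_crop().
#' ## Hedged sketch: 'occ_points', 'bg_points', 'template_1km', 'koppen_1km',
#' ## 'koppen_lookup' and 'aus_shp' are hypothetical objects.
#' fit_maxent_targ_bg_back_sel_crop(occ = occ_points, bg = bg_points,
#'                                  sdm_predictors = c('Annual_mean_temp', 'Annual_precip'),
#'                                  taxa = 'Acacia dealbata',
#'                                  outdir = 'output/maxent/full_models',
#'                                  bsdir  = 'output/maxent/back_sel_models',
#'                                  cor_thr = 0.8, pct_thr = 5, k_thr = 4,
#'                                  backwards_sel = TRUE,
#'                                  template_raster = template_1km,
#'                                  min_n = 20, max_bg_size = 70000,
#'                                  background_buffer_width = 200000,
#'                                  crop_Koppen = TRUE,
#'                                  Koppen_raster = koppen_1km,
#'                                  Koppen_zones  = koppen_lookup,
#'                                  shapefiles = TRUE, features = 'lpq',
#'                                  replicates = 5, responsecurves = TRUE,
#'                                  country_shp = aus_shp)
#' }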
fit_maxent_targ_bg_back_sel_crop <- function(occ,
bg,
sdm_predictors,
taxa,
outdir,
bsdir,
cor_thr,
pct_thr,
k_thr,
backwards_sel,
template_raster,
min_n,
max_bg_size,
background_buffer_width,
crop_Koppen,
Koppen_raster,
Koppen_zones,
shapefiles,
features,
replicates,
responsecurves,
country_shp) {
## If we aren't cropping the raster to the Koppen extent, don't use that argument
## First, stop if the output directory does not exist:
if(!file.exists(outdir)) stop('outdir does not exist :(', call. = FALSE)
outdir_sp <- file.path(outdir, gsub(' ', '_', taxa))
bsdir_sp <- file.path(bsdir, gsub(' ', '_', taxa))
if(crop_Koppen) {
if(!missing(Koppen_raster)) {
if(!is(Koppen_raster, 'RasterLayer'))
stop('Koppen must be a RasterLayer, and should be in the same coordinate system as template_raster')
}
}
## If the file doesn't exist, split out the features
if(!file.exists(outdir_sp)) dir.create(outdir_sp)
features <- unlist(strsplit(features, ''))
## Make sure user features are allowed: don't run the model if the
## features have been incorrectly specified in the main argument
## l: linear
## p: product
## q: quadratic
## h: hinge ## disabled for this analysis
## t: threshold ## disabled for this analysis
if(length(setdiff(features, c('l', 'p', 'q', 'h', 't'))) > 0)
stop("features must be a vector of one or more of 'l', 'p', 'q', 'h', and 't'.")
## Create a buffer of background_buffer_width around the occurrence points
buffer <- aggregate(gBuffer(occ, width = background_buffer_width, byid = TRUE))
## Get unique cell numbers for taxa occurrences
template_raster_spat <- terra::rast(template_raster)
occ_mat <- cbind(lon = occ$lon, lat = occ$lat)
cells <- terra::cellFromXY(template_raster_spat, occ_mat)
## Clean out duplicate cells and NAs (including points outside extent of predictor data)
## Note this will get rid of a lot of duplicate records not filtered out by GBIF columns, etc.
not_dupes <- which(!duplicated(cells) & !is.na(cells))
occ <- occ[not_dupes, ]
cells <- cells[not_dupes]
message(nrow(occ), ' occurrence records (unique cells).')
## Skip taxa that have fewer than the minimum number of records (min_n, e.g. 20)
if(nrow(occ) > min_n) {
## Subset the background records to the buffered polygon
message(taxa, ' creating background cells')
system.time(o <- over(bg, buffer))
bg <- bg[which(!is.na(o)), ]
bg_mat <- cbind(lon = bg$lon, lat = bg$lat)
bg_cells <- terra::cellFromXY(template_raster_spat, bg_mat)
## Clean out duplicates and NAs (including points outside extent of predictor data)
bg_not_dupes <- which(!duplicated(bg_cells) & !is.na(bg_cells))
bg_unique <- bg[bg_not_dupes, ]
bg_cells_unique <- bg_cells[bg_not_dupes]
bg_mat_unique <- cbind(lon = bg_unique$lon, lat = bg_unique$lat)
## Find which of these cells fall within the Koppen-Geiger zones that the taxa occupies
## Crop the Koppen raster to the extent of the occurrences, and snap it.
if(crop_Koppen) {
message(taxa, ' intersecting background cells with Koppen zones')
Koppen_crop <- raster::crop(Koppen_raster, occ, snap = 'out')
## Only extract and match those cells that overlap between the ::
## 1). cropped koppen zone,
## 2). occurrences and
## 3). background points
zones <- raster::extract(Koppen_crop, occ)
cells_in_zones_crop <- raster::Which(Koppen_crop %in% zones, cells = TRUE)
Koppen_crop_rast <- terra::rast(Koppen_crop)
Koppen_rast <- terra::rast(Koppen_raster)
cells_in_zones <- terra::cellFromXY(Koppen_rast, terra::xyFromCell(Koppen_crop_rast, cells_in_zones_crop))
bg_cells_zones <- intersect(bg_cells_unique, cells_in_zones) ## this is 0 for 5km
i <- terra::cellFromXY(template_raster_spat, bg_mat_unique)
bg_crop <- bg_unique[which(i %in% bg_cells_zones), ]
## For some taxa, we have the problem that the proportion of ALA/INV data is
## very different in the occurrence vs the bg records.
## This should be caused by the 200km / koppen restriction, etc.
## Now save an image of the background points
## This is useful to quality control the models
save_name = gsub(' ', '_', taxa)
aus.mol = country_shp %>%
spTransform(projection(buffer))
aus.kop = raster::crop(Koppen_crop, aus.mol)
occ.mol <- occ %>%
spTransform(projection(buffer))
## Then save the occurrence points
png(sprintf('%s/%s/%s_%s.png', outdir, save_name, save_name, "buffer_occ"),
16, 10, units = 'in', res = 300)
plot(Koppen_crop, legend = FALSE,
main = paste0('Occurrence SDM records for ', taxa))
plot(aus.mol, add = TRUE)
plot(buffer, add = TRUE, col = "red")
plot(occ.mol, add = TRUE, col = "blue")
dev.off()
} else {
message(taxa, ' Do not intersect background cells with Koppen zones')
i <- terra::cellFromXY(template_raster_spat, bg_mat_unique)
bg_crop <- bg_unique[which(i %in% bg_cells_unique), ]
## Now save an image of the background points
## This is useful to quality control the models
save_name = gsub(' ', '_', taxa)
aus.mol = country_shp %>%
spTransform(projection(buffer))
occ.mol <- occ %>%
spTransform(projection(buffer))
## Then save the occurrence points
png(sprintf('%s/%s/%s_%s.png', outdir, save_name, save_name, "buffer_occ"),
16, 10, units = 'in', res = 300)
plot(aus.mol, legend = FALSE,
main = paste0('Occurrence SDM records for ', taxa))
plot(buffer, add = TRUE, col = "red")
plot(occ.mol, add = TRUE, col = "blue")
dev.off()
}
## Reduce background sample, if it's larger than max_bg_size
if (nrow(bg_crop) > max_bg_size) {
message(nrow(bg_crop), ' target taxa background records for ', taxa,
', reduced to random ', max_bg_size, ' using random points from :: ', unique(bg_crop$SOURCE))
bg.samp <- bg_crop[sample(nrow(bg_crop), max_bg_size), ]
} else {
## If the bg points are fewer than max_bg_size, just keep all the points
message(nrow(bg_crop), ' target taxa background records for ', taxa,
' using all points from :: ', unique(bg$SOURCE))
bg.samp <- bg_crop
}
## Now save the buffer, the occ and bg points as shapefiles
if(shapefiles) {
suppressWarnings({
message(taxa, ' writing occ and bg shapefiles')
writeOGR(SpatialPolygonsDataFrame(buffer, data.frame(ID = seq_len(length(buffer)))),
outdir_sp, paste0(save_name, '_bg_buffer'), 'ESRI Shapefile', overwrite_layer = TRUE)
writeOGR(bg.samp, outdir_sp, paste0(save_name, '_bg'), 'ESRI Shapefile', overwrite_layer = TRUE)
writeOGR(occ, outdir_sp, paste0(save_name, '_occ'), 'ESRI Shapefile', overwrite_layer = TRUE)
})
}
## Also save the background and occurrence points as .rds files
saveRDS(bg.samp, file.path(outdir_sp, paste0(save_name, '_bg.rds')))
saveRDS(occ, file.path(outdir_sp, paste0(save_name, '_occ.rds')))
## SWD = 'species with data'. Now sample the environmental
## variables used in the model at all the occ and bg points
swd_occ <- occ[, sdm_predictors]
saveRDS(swd_occ, file.path(outdir_sp, paste0(save_name,'_occ_swd.rds')))
swd_bg <- bg.samp[, sdm_predictors]
saveRDS(swd_bg, file.path(outdir_sp, paste0(save_name, '_bg_swd.rds')))
## Save the SWD tables as shapefiles
if(shapefiles) {
writeOGR(swd_occ, outdir_sp, paste0(save_name, '_occ_swd'), 'ESRI Shapefile', overwrite_layer = TRUE)
writeOGR(swd_bg, outdir_sp, paste0(save_name, '_bg_swd'), 'ESRI Shapefile', overwrite_layer = TRUE)
}
## Now combine the occurrence and background data
swd <- as.data.frame(rbind(swd_occ@data, swd_bg@data))
saveRDS(swd, file.path(outdir_sp, 'swd.rds'))
pa <- rep(1:0, c(nrow(swd_occ), nrow(swd_bg)))
## Now, set the features to be used by maxent ::
## Linear, product and quadratic
off <- setdiff(c('l', 'p', 'q', 't', 'h'), features)
## This sets threshold and hinge features to "off"
if(length(off) > 0) {
off <- c(l = 'linear=false', p = 'product=false', q = 'quadratic=false',
t = 'threshold=false', h = 'hinge=false')[off]
}
off <- unname(off)
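## For example, with features = 'lpq' the vector 'off' becomes
## c('threshold=false', 'hinge=false'), so only linear, product and quadratic
## features remain enabled in the dismo::maxent() calls below.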
## Run the cross-validation replicates (skipped when replicates <= 1)
me_xval <- NULL
if(replicates > 1) {
## Only use rep_args & full_args if not using replicates
rep_args <- NULL
## Run MAXENT for cross validation data splits of swd : so 5 replicaes, 0-4
## first argument is the predictors, the second is the occurrence data
message(taxa, ' running xval maxent')
me_xval <- dismo::maxent(swd, pa, path = file.path(outdir_sp, 'xval'),
args = c(paste0('replicates=', replicates),
'responsecurves=true',
'outputformat=logistic',
off, paste(names(rep_args), rep_args, sep = '=')))
}
## Run the full maxent model - using all the data in swd
## This uses DISMO to output standard files, but the names can't be altered
full_args <- NULL
message(taxa, ' running full maxent')
me_full <- maxent(swd, pa, path = file.path(outdir_sp, 'full'),
args = c(off, paste(names(full_args), full_args, sep = '='),
'responsecurves=true',
'outputformat=logistic'))
## Save the full model. Replicate this line in the backwards selection algorithm.
## When cropping, also record the Koppen gridcodes occupied by the taxon.
if(crop_Koppen) {
saveRDS(list(me_xval = me_xval, me_full = me_full, swd = swd, pa = pa,
koppen_gridcode = as.character(Koppen_zones$Koppen[match(unique(zones), Koppen_zones$GRIDCODE)])),
file.path(outdir_sp, 'full', 'maxent_fitted.rds'))
} else {
saveRDS(list(me_xval = me_xval, me_full = me_full, swd = swd, pa = pa),
file.path(outdir_sp, 'full', 'maxent_fitted.rds'))
}
if (backwards_sel) {
## Coerce the "taxa with data" (SWD) files to regular data.frames
## This is needed to use the simplify function
swd_occ <- as.data.frame(swd_occ)
swd_occ$lon <- NULL
swd_occ$lat <- NULL
swd_bg <- as.data.frame(swd_bg)
swd_bg$lon <- NULL
swd_bg$lat <- NULL
## Need to create a taxa column here
swd_occ$searchTaxon <- taxa
swd_bg$searchTaxon <- taxa
## Run simplify rmaxent::simplify
# Given a candidate set of predictor variables, this function identifies
# a subset that meets specified multicollinearity criteria. Subsequently,
# backward stepwise variable selection (VIF) is used to iteratively drop
# the variable that contributes least to the model, until the contribution
# of each variable meets a specified minimum, or until a predetermined
# minimum number of predictors remains. It returns a model object for the
# full model, rather than a list of models as does the previous function
## Using a modified version of rmaxent::simplify, so that the name of the
## maxent model object "maxent_fitted.rds" is the same in both models.
## This is needed to run the mapping step over either the full or BS folder
m <- local_simplify(
swd_occ,
swd_bg,
path = bsdir,
taxa_column = "searchTaxon",
replicates = replicates, ## 5 as above
response_curves = TRUE,
logistic_format = TRUE,
cor_thr = cor_thr,
pct_thr = pct_thr,
k_thr = k_thr,
features = features, ## LPQ as above
quiet = FALSE)
## Save the bg, occ and swd files into the backwards selection folder too
saveRDS(bg.samp, file.path(bsdir_sp, paste0(save_name, '_bg.rds')))
saveRDS(occ, file.path(bsdir_sp, paste0(save_name, '_occ.rds')))
saveRDS(swd, file.path(bsdir_sp, paste0('swd.rds')))
## Read the model in, because it's tricky to index
bs.model <- readRDS(sprintf('%s/%s/full/maxent_fitted.rds', bsdir, save_name))
identical(length(bs.model@presence$Annual_mean_temp), nrow(occ))
## Save the chart correlation file too for the training data set
png(sprintf('%s/%s/full/%s_%s.png', bsdir,
save_name, save_name, "bs_predictor_correlation"),
3236, 2000, units = 'px', res = 300)
## set margins
par(mar = c(3, 3, 5, 3),
oma = c(1.5, 1.5, 1.5, 1.5))
## Add detail to the response plot
chart.Correlation(bs.model@presence,
histogram = TRUE, pch = 19,
title = paste0('Reduced variable correlations for ', save_name))
dev.off()
gc()
} else {
message("Don't run backwards selection")
}
} else {
message('Fewer occurrence records than the minimum required (min_n) for taxa ',
taxa, '. Model not fitted for this taxa.')
}
}
#' @title Maxent full models for 'species with data' format, without biogeographic cropping.
#' @description
#' This function takes a data frame of all taxa records,
#' and runs a specialised maxent analysis for each taxa.
#' It uses the rmaxent package https://github.com/johnbaums/rmaxent
#' It assumes that the input df is that returned by the prepare_sdm_table function
#'
#' @param occ SpatialPointsDataFrame - Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param bg SpatialPointsDataFrame - Spdf of candidate background points
#' @param sdm_predictors Character string - Vector of enviro conditions that you want to include
#' @param outdir Character string - The file path used for saving the maxent output
#' @param bsdir Character string - The file path used for saving the backwards selection maxent output
#' @param backwards_sel Logical - Run backwards selection using the maxent models (T/F)?
#' @param template_raster RasterLayer - Empty raster with analysis extent (global), res (1km) and projection (Mollweide, EPSG 54009)
#' @param cor_thr Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr Numeric - The min allowable percent variable contribution
#' @param k_thr Numeric - The min number of variables to be kept in the model
#' @param min_n Numeric - The min number of records for running maxent
#' @param max_bg_size Numeric - The max number of background points to keep
#' @param shapefiles Logical - Save shapefiles of the occ and bg data (T/F)?
#' @param features Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates Numeric - The number of replicates to use
#' @param responsecurves Logical - Save response curves of the maxent models (T/F)?
#' @param poly sf object - polygon layer used to plot the occurrence and background points
#' @param taxa Character string - name of the taxon being modelled
#' @param background_buffer_width Numeric - The max distance (km) from occ points within which background points are selected
#' @export fit_maxent_targ_bg_full_no_crop
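#' @examples
#' \dontrun{
#' ## Hedged usage sketch: 'occ_points', 'bg_points', 'template_1km' and
#' ## 'world_poly' are hypothetical objects of the types described above.
#' fit_maxent_targ_bg_full_no_crop(occ = occ_points, bg = bg_points,
#'                                 sdm_predictors = c('Annual_mean_temp', 'Annual_precip'),
#'                                 taxa = 'Acacia dealbata',
#'                                 outdir = 'output/maxent/full_models',
#'                                 bsdir  = 'output/maxent/back_sel_models',
#'                                 cor_thr = 0.8, pct_thr = 5, k_thr = 4,
#'                                 template_raster = template_1km,
#'                                 min_n = 20, max_bg_size = 70000,
#'                                 background_buffer_width = 200000,
#'                                 shapefiles = TRUE, features = 'lpq',
#'                                 replicates = 5, responsecurves = TRUE,
#'                                 poly = world_poly)
#' }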
fit_maxent_targ_bg_full_no_crop <- function(occ,
bg,
sdm_predictors,
taxa,
outdir,
bsdir,
cor_thr,
pct_thr,
k_thr,
template_raster,
min_n,
max_bg_size,
background_buffer_width,
shapefiles,
features,
replicates,
responsecurves,
poly) {
## First, stop if the output directory does not exist:
if(!file.exists(outdir)) stop('outdir does not exist :(', call. = FALSE)
outdir_sp <- file.path(outdir, gsub(' ', '_', taxa))
bsdir_sp <- file.path(bsdir, gsub(' ', '_', taxa))
## If the file doesn't exist, split out the features
if(!file.exists(outdir_sp)) dir.create(outdir_sp)
features <- unlist(strsplit(features, ''))
## Make sure user features are allowed: don't run the model if the
## features have been incorrectly specified in the main argument
## l: linear
## p: product
## q: quadratic
## h: hinge ## disabled for this analysis
## t: threshold ## disabled for this analysis
if(length(setdiff(features, c('l', 'p', 'q', 'h', 't'))) > 0)
stop("features must be a vector of one or more of 'l', 'p', 'q', 'h', and 't'.")
## Create a buffer of background_buffer_width around the occurrence points
buffer <- aggregate(gBuffer(occ, width = background_buffer_width, byid = TRUE))
## Get unique cell numbers for taxa occurrences
template_raster_spat <- terra::rast(template_raster)
occ_mat <- cbind(lon = occ$lon, lat = occ$lat)
cells <- terra::cellFromXY(template_raster_spat, occ_mat)
## Clean out duplicate cells and NAs (including points outside extent of predictor data)
## Note this will get rid of a lot of duplicate records not filtered out by GBIF columns, etc.
not_dupes <- which(!duplicated(cells) & !is.na(cells))
occ <- occ[not_dupes, ]
cells <- cells[not_dupes]
message(nrow(occ), ' occurrence records (unique cells).')
## Skip taxa that have fewer than the minimum number of records (min_n, e.g. 20)
if(nrow(occ) > min_n) {
## Subset the background records to the buffered polygon
message(taxa, ' creating background cells')
system.time(o <- over(bg, buffer))
bg <- bg[which(!is.na(o)), ]
bg_mat <- cbind(lon = bg$lon, lat = bg$lat)
bg_cells <- terra::cellFromXY(template_raster_spat, bg_mat)
## Clean out duplicates and NAs (including points outside extent of predictor data)
bg_not_dupes <- which(!duplicated(bg_cells) & !is.na(bg_cells))
bg_unique <- bg[bg_not_dupes, ]
bg_cells_unique <- bg_cells[bg_not_dupes]
bg_mat_unique <- cbind(lon = bg_unique$lon, lat = bg_unique$lat)
## No Koppen-zone intersection in this variant - keep all unique background cells
message(taxa, ' Do not intersect background cells with Koppen zones')
message('country poly is a ', class(poly))
i <- terra::cellFromXY(template_raster_spat, bg_mat_unique)
bg_crop <- bg_unique[which(i %in% bg_cells_unique), ]
## Now save an image of the background points
## This is useful to quality control the models
save_name = gsub(' ', '_', taxa)
## Then save the occurrence points
png(sprintf('%s/%s/%s_%s.png', outdir, save_name, save_name, "buffer_occ"),
16, 10, units = 'in', res = 300)
raster::plot(st_geometry(poly),
legend = FALSE,
main = paste0('Occurrence SDM records for ', taxa))
raster::plot(buffer, add = TRUE, col = "red")
raster::plot(occ, add = TRUE, col = "blue")
dev.off()
gc()
## Reduce background sample, if it's larger than max_bg_size
if (nrow(bg_crop) > max_bg_size) {
message(nrow(bg_crop), ' target taxa background records for ', taxa,
', reduced to random ', max_bg_size, ' using random points from :: ', unique(bg_crop$SOURCE))
bg.samp <- bg_crop[sample(nrow(bg_crop), max_bg_size), ]
} else {
## If the bg points are fewer than max_bg_size, just keep all the points
message(nrow(bg_crop), ' target taxa background records for ', taxa,
' using all points from :: ', unique(bg$SOURCE))
bg.samp <- bg_crop
}
## Now save the buffer, the occ and bg points as shapefiles
if(shapefiles) {
suppressWarnings({
message(taxa, ' writing occ and bg shapefiles')
writeOGR(SpatialPolygonsDataFrame(buffer, data.frame(ID = seq_len(length(buffer)))),
outdir_sp, paste0(save_name, '_bg_buffer'), 'ESRI Shapefile', overwrite_layer = TRUE)
writeOGR(bg.samp, outdir_sp, paste0(save_name, '_bg'), 'ESRI Shapefile', overwrite_layer = TRUE)
writeOGR(occ, outdir_sp, paste0(save_name, '_occ'), 'ESRI Shapefile', overwrite_layer = TRUE)
})
}
## Also save the background and occurrence points as .rds files
saveRDS(bg.samp, file.path(outdir_sp, paste0(save_name, '_bg.rds')))
saveRDS(occ, file.path(outdir_sp, paste0(save_name, '_occ.rds')))
## SWD = 'species with data'. Now sample the environmental
## variables used in the model at all the occ and bg points
swd_occ <- occ[, sdm_predictors]
saveRDS(swd_occ, file.path(outdir_sp, paste0(save_name,'_occ_swd.rds')))
swd_bg <- bg.samp[, sdm_predictors]
saveRDS(swd_bg, file.path(outdir_sp, paste0(save_name, '_bg_swd.rds')))
## Save the SWD tables as shapefiles
if(shapefiles) {
writeOGR(swd_occ, outdir_sp, paste0(save_name, '_occ_swd'), 'ESRI Shapefile', overwrite_layer = TRUE)
writeOGR(swd_bg, outdir_sp, paste0(save_name, '_bg_swd'), 'ESRI Shapefile', overwrite_layer = TRUE)
}
gc()
## Now combine the occurrence and background data
swd <- as.data.frame(rbind(swd_occ@data, swd_bg@data))
saveRDS(swd, file.path(outdir_sp, 'swd.rds'))
pa <- rep(1:0, c(nrow(swd_occ), nrow(swd_bg)))
## Now, set the features to be used by maxent ::
## Linear, product and quadratic
off <- setdiff(c('l', 'p', 'q', 't', 'h'), features)
## This sets threshold and hinge features to "off"
if(length(off) > 0) {
off <- c(l = 'linear=false', p = 'product=false', q = 'quadratic=false',
t = 'threshold=false', h = 'hinge=false')[off]
}
off <- unname(off)
## Run the cross-validation replicates (skipped when replicates <= 1)
me_xval <- NULL
if(replicates > 1) {
## Only use rep_args & full_args if not using replicates
rep_args <- NULL
## Run MAXENT for cross validation data splits of swd : so 5 replicaes, 0-4
## first argument is the predictors, the second is the occurrence data
message(taxa, ' running xval maxent')
me_xval <- dismo::maxent(swd, pa, path = file.path(outdir_sp, 'xval'),
args = c(paste0('replicates=', replicates),
'responsecurves=true',
'outputformat=logistic',
off, paste(names(rep_args), rep_args, sep = '=')))
}
gc()
## Run the full maxent model - using all the data in swd
## This uses DISMO to output standard files, but the names can't be altered
full_args <- NULL
message(taxa, ' running full maxent')
me_full <- maxent(swd, pa, path = file.path(outdir_sp, 'full'),
args = c(off, paste(names(full_args), full_args, sep = '='),
'responsecurves=true',
'outputformat=logistic'))
## Save the full model. Replicate this line in the backwards selection algorithm
saveRDS(list(me_xval = me_xval, me_full = me_full, swd = swd, pa = pa),
file.path(outdir_sp, 'full', 'maxent_fitted.rds'))
} else {
message('Fewer occurrence records than the minimum required (min_n) for taxa ',
taxa, '. Model not fitted for this taxa.')
}
}
#' @title Simplify maxent model
#' @description This is a local version of rmaxent::simplify (https://github.com/johnbaums/rmaxent)
#' Given a candidate set of predictor variables, this function identifies
#' a subset that meets specified multicollinearity criteria. Subsequently,
#' backward stepwise variable selection is used to iteratively drop the variable
#' that contributes least to the model, until the contribution of each variable
#' meets a specified minimum, or until a predetermined minimum number of predictors remains.
#'
#' It assumes that the input df is that returned by the fit_maxent_targ_bg_back_sel function
#' @param occ SpatialPointsDataFrame - Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param bg SpatialPointsDataFrame - Spdf of candidate background points
#' @param path Character string - The output directory where the maxent models are saved
#' @param taxa_column Character string - The name of the column in occ and bg that identifies the taxon
#' @param cor_thr Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr Numeric - The min allowable percent variable contribution
#' @param k_thr Numeric - The min number of variables to be kept in the model
#' @param features Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates Numeric - The number of replicates to use
#' @param type The variable contribution metric to use when dropping variables
#' @param logistic_format Logical value indicating whether maxentResults.csv should report logistic value thresholds
#' @param response_curves Logical - Save response curves of the maxent models (T/F)?
#' @export local_simplify
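#' @examples
#' \dontrun{
#' ## Hedged sketch: 'occ_swd' and 'bg_swd' are hypothetical data frames of
#' ## predictor values at the presence and background points, each with a
#' ## 'searchTaxon' column identifying the taxon.
#' bs_model <- local_simplify(occ             = occ_swd,
#'                            bg              = bg_swd,
#'                            path            = 'output/maxent/back_sel_models',
#'                            taxa_column     = 'searchTaxon',
#'                            replicates      = 5,
#'                            response_curves = TRUE,
#'                            logistic_format = TRUE,
#'                            cor_thr = 0.8, pct_thr = 5, k_thr = 4,
#'                            features = 'lpq', quiet = FALSE)
#' }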
local_simplify = function (occ, bg, path, taxa_column = "taxa", response_curves = TRUE,
logistic_format = TRUE, type = "PI", cor_thr, pct_thr, k_thr,
features = "lpq", replicates = 1, quiet = TRUE)
{
if (!taxa_column %in% colnames(occ))
stop(taxa_column, " is not a column of `occ`", call. = FALSE)
if (!taxa_column %in% colnames(bg))
stop(taxa_column, " is not a column of `bg`", call. = FALSE)
if (missing(path)) {
save <- FALSE
path <- tempdir()
}
else save <- TRUE
features <- unlist(strsplit(gsub("\\s", "", features), ""))
if (length(setdiff(features, c("l", "p", "q", "h", "t"))) > 0)
stop("features must be a vector of one or more of 'l', 'p', 'q', 'h', and 't'.")
off <- setdiff(c("l", "p", "q", "t", "h"), features)
if (length(off) > 0) {
off <- c(l = "linear=FALSE", p = "product=FALSE", q = "quadratic=FALSE",
t = "threshold=FALSE", h = "hinge=FALSE")[off]
}
off <- unname(off)
occ_by_taxa <- split(occ, occ[[taxa_column]])
bg_by_taxa <- split(bg, bg[[taxa_column]])
if (!identical(sort(names(occ_by_taxa)), sort(names(bg_by_taxa)))) {
stop("The same set of taxa names must exist in occ and bg")
}
type <- switch(type, PI = "permutation.importance", PC = "contribution",
stop("type must be either \"PI\" or \"PC\".", call. = FALSE))
args <- off
if (replicates > 1)
args <- c(args, paste0("replicates=", replicates))
if (isTRUE(response_curves))
args <- c(args, "responsecurves=TRUE")
if (isTRUE(logistic_format))
args <- c(args, "outputformat=logistic")
f <- function(taxa) {
if (!quiet)
message("\n\nDoing ", taxa)
name_ <- gsub(" ", "_", taxa)
swd <- rbind(occ_by_taxa[[taxa]], bg_by_taxa[[taxa]])
swd <- swd[, -match(taxa_column, names(swd))]
if (ncol(swd) < k_thr)
stop("Initial number of variables < k_thr", call. = FALSE)
pa <- rep(1:0, c(nrow(occ_by_taxa[[taxa]]), nrow(bg_by_taxa[[taxa]])))
vc <- usdm::vifcor(swd, maxobservations = nrow(swd),
th = cor_thr)
vif <- methods::slot(vc, "results")
k <- nrow(vif)
exclude <- methods::slot(vc, "excluded")
if (!isTRUE(quiet) & length(exclude) > 0) {
message("Dropped due to collinearity: ", paste0(exclude,
collapse = ", "))
}
if (k < k_thr)
stop(sprintf("Number of uncorrelated variables (%s) < k_thr (%s). %s",
k, k_thr, "Reduce k_thr, increase cor_thr, or find alternative predictors."),
call. = FALSE)
swd_uncor <- swd[, as.character(vif$Variables)]
d <- file.path(path, name_, if (replicates > 1)
"xval"
else "full")
m <- dismo::maxent(swd_uncor, pa, args = args, path = d)
if (isTRUE(save))
saveRDS(m, file.path(d, "maxent_fitted.rds"))
pct <- m@results[grep(type, rownames(m@results)), ,
drop = FALSE]
pct <- pct[, ncol(pct)]
pct <- sort(pct)
names(pct) <- sub(paste0("\\.", type), "", names(pct))
if (min(pct) >= pct_thr || length(pct) == k_thr) {
if (replicates > 1) {
d <- file.path(path, name_, "full")
m <- dismo::maxent(swd_uncor, pa, args = grep("replicates",
args, value = TRUE,
invert = TRUE), path = d)
}
if (isTRUE(save)) {
saveRDS(m, file.path(d, "maxent_fitted.rds"))
}
return(m)
}
while (min(pct) < pct_thr && length(pct) > k_thr) {
candidates <- vif[vif$Variables %in% names(pct)[pct ==
pct[1]], ]
drop <- as.character(candidates$Variables[which.max(candidates$VIF)])
message("Dropping ", drop)
swd_uncor <- swd_uncor[, -match(drop, colnames(swd_uncor))]
if (!quiet)
message(sprintf("%s variables: %s", ncol(swd_uncor),
paste0(colnames(swd_uncor), collapse = ", ")))
m <- dismo::maxent(swd_uncor, pa, args = args, path = d)
pct <- m@results[grep(type, rownames(m@results)),
, drop = FALSE]
pct <- pct[, ncol(pct)]
pct <- sort(pct)
names(pct) <- sub(paste0("\\.", type), "", names(pct))
}
if (replicates > 1) {
d <- file.path(path, name_, "full")
m <- dismo::maxent(swd_uncor, pa, args = grep("replicates",
args, value = TRUE,
invert = TRUE), path = d)
}
gc()
if (isTRUE(save)) {
saveRDS(m, file.path(d, "maxent_fitted.rds"))
}
return(m)
}
lapply(names(occ_by_taxa), f)
}
#' @title Variable importance
#' @description This function calculates variables importance.
#' @param mod object - maxent model object
#' @export var_importance
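#' @examples
#' \dontrun{
#' ## The 'full' folder stores a list, so extract the fitted model first;
#' ## the path below is a hypothetical example.
#' fit <- readRDS('output/maxent/full_models/Acacia_dealbata/full/maxent_fitted.rds')
#' var_importance(fit$me_full)
#' }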
var_importance <- function(mod) {
res <- mod@results
pc <- res[grepl('contribution', rownames(res)),]
pi <- res[grepl('permutation', rownames(res)),]
varnames <- sapply(strsplit(names(pc), '.contribution'), function(x) x[1])
df <- data.frame(variable=varnames, percent.contribution=pc, permutation.importance=pi, row.names=NULL)
return(df)
}
#' @title Compile SDM results from 'species with data' format.
#' @description This function extracts the SDM results from the folders.
#' @param taxa_list Character string - the taxa to run maxent models for
#' @param results_dir Character string - The file path used for saving the maxent output
#' @param save_data Logical or character - do you want to save the data frame?
#' @param data_path Character string - The file path used for saving the data frame
#' @param sdm_path Character string - The file path where the maxent models are stored.
#' @param save_run Character string - run name to append to the data frame, useful for multiple runs.
#' @export compile_sdm_results
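#' @examples
#' \dontrun{
#' ## Hedged usage sketch with hypothetical paths and run name.
#' sdm_results <- compile_sdm_results(taxa_list   = c('Acacia dealbata', 'Acacia longifolia'),
#'                                    results_dir = 'output/maxent/back_sel_models',
#'                                    save_data   = TRUE,
#'                                    data_path   = 'output/results/',
#'                                    sdm_path    = 'output/maxent/back_sel_models',
#'                                    save_run    = 'TEST_RUN')
#' }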
compile_sdm_results = function(taxa_list,
results_dir,
save_data,
data_path,
sdm_path,
save_run) {
## The code that adds niche info is now in './R/9_COLLATE_MAXENT_RESULTS.R'
message('Creating summary stats for ', length(taxa_list),
' taxa in the set ', "'", save_run, "'")
##
processed = list.dirs(sdm_path, full.names = FALSE, recursive = FALSE)
## First, make a list of all the taxa with models, then restrict them
## to just the models on the taxa_list list
map_spp_list = gsub(" ", "_", taxa_list)
map_spp_list = processed[processed %in% map_spp_list ]
map_spp_patt = paste0(map_spp_list, collapse = "|")
message ("map_spp_list head:")
message (paste(head(map_spp_list), collapse = ","))
## Avoid listing every completed maxent file on disk here - that takes a long time
message('Compile SDM results for taxa in ', results_dir)
maxent.tables = lapply (map_spp_list, FUN = function (x) {paste(results_dir , x, "full/maxent_fitted.rds", sep="/")})
## How many taxa have been modelled?
message(paste("maxent.tables has this many entries:", length(maxent.tables)))
message(paste(head (maxent.tables), collapse = ","))
sdm.exists = lapply(maxent.tables, FUN = function (x) {file.exists (x)}) %>% unlist()
## Only keep the intersection between the modelled taxa and the taxa with existing output files
message(paste(head(sdm.exists), collapse=","))
maxent.tables = maxent.tables[sdm.exists]
## Report what data the table has
message (paste ("maxent.tables has this many entries:", length(maxent.tables)))
maxent.tables = stringr::str_subset(maxent.tables, map_spp_patt)
message (paste ("maxent.tables has this many entries:", length(maxent.tables)))
message (paste (head(maxent.tables), collapse = ","))
## Now create a table of the results
## x = maxent.tables[1]
MAXENT.RESULTS <- maxent.tables %>%
## Pipe the list into lapply
lapply(function(x) {
## We don't need to skip taxa that haven't been modelled
x = gsub(paste0(results_dir, "/"), "", x)
message(x)
## load the backwards selected model
if (grepl("back", results_dir)) {
m = readRDS(paste0(results_dir, '/', x))
} else {
## Get the background records from any source
m = readRDS(paste0(results_dir, '/', x))$me_full
}
## Get the number of Variables
number.var = length(m@lambdas) - 4 ## (the last 4 slots of the lambdas file are not variables)
mxt.records = nrow(m@presence)
## Get variable importance
m.vars = var_importance(m)
var.pcont = m.vars[rev(order(m.vars[["percent.contribution"]])),][["variable"]][1:4]
pcont = m.vars[rev(order(m.vars[["percent.contribution"]])),][["percent.contribution"]][1:4]
Var_pcont = paste(var.pcont, pcont, sep = ' = ')
var.pimp = m.vars[rev(order(m.vars[["permutation.importance"]])),][["variable"]][1:1]
pimp = m.vars[rev(order(m.vars[["permutation.importance"]])),][["permutation.importance"]][1:1]
## A list of the variables which together explain at least 75% of the variation could also be derived here
## Get maxent results columns to be used for model checking
## Including the omission rate here
Training_AUC = m@results["Training.AUC",]
Number_background_points = m@results["X.Background.points",]
Logistic_threshold = m@results["X10.percentile.training.presence.Logistic.threshold",]
Omission_rate = m@results["X10.percentile.training.presence.training.omission",]
## Now rename the maxent columns that we will use in the results table
d = data.frame(searchTaxon = x,
Maxent_records = mxt.records,
Number_var = number.var,
Var_pcont_1 = Var_pcont[1],
Var_pcont_2 = Var_pcont[2],
Var_pcont_3 = Var_pcont[3],
Var_pcont_4 = Var_pcont[4],
Var_pimp = var.pimp[[1]],
Perm_imp = pimp[[1]],
Training_AUC,
Number_background_points,
Logistic_threshold,
Omission_rate)
## Remove path gunk, and taxa
d$Species = NULL
d$searchTaxon = gsub("/full/maxent_fitted.rds", "", d$searchTaxon)
row.names(d) <- NULL
return(d)
}) %>%
## Finally, bind all the rows together
bind_rows
## Now create a list of the '10th percentile training presence Logistic threshold'.
## This is used in step 8 to threshold
## the maps to just areas above the threshold.
message ("MAXENT.RESULTS columns")
message (paste (colnames (MAXENT.RESULTS)))
message (paste (nrow (MAXENT.RESULTS)))
summary(MAXENT.RESULTS["Logistic_threshold"])
percent.10.log <- as.list(MAXENT.RESULTS["Logistic_threshold"]) %>%
.[["Logistic_threshold"]]
## Create a list of the omission files - again, don't do this for all the files, just the intersection
omission.tables = lapply (map_spp_list, FUN = function (x) {paste(results_dir , x, "full/species_omission.csv", sep = "/")})
message (head (omission.tables))
## Only process the existing files
om.exists = lapply (omission.tables, FUN = function (x) {file.exists (x)}) %>% unlist()
omission.tables = omission.tables[om.exists]
message(head(omission.tables))
## Get the maximum TSS value using the omission data : use _training_ omission data only
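## TSS = sensitivity + specificity - 1. Training omission gives 1 - sensitivity,
## and the fractional predicted area is used here as a proxy for 1 - specificity,
## so the maximum TSS is approximated as 1 - min(omission + fractional area).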
Max_tss <- sapply(omission.tables, function(file) {
## For each taxon, read in the training omission data
d <- read.csv(file)
i <- which.min(d$Training.omission + d$Fractional.area)
c(Max_tss = 1 - min(d$Training.omission + d$Fractional.area),
thr = d$Corresponding.logistic.value[i])
})
## Transpose the TSS results so each taxon is a row, then bind onto the maxent results
Max_tss = as.data.frame(t(Max_tss))
MAXENT.RESULTS = cbind(MAXENT.RESULTS, Max_tss)
summary(MAXENT.RESULTS$Max_tss)
summary(MAXENT.RESULTS$Omission_rate)
## This is a summary of maxent output for current conditions
## All taxa should have AUC > 0.7
dim(MAXENT.RESULTS)
head(MAXENT.RESULTS, 20)[1:5]
## Now check the match between the taxa list, and the results list.
length(intersect(map_spp_list, MAXENT.RESULTS$searchTaxon))
MAXENT.RESULTS = MAXENT.RESULTS[MAXENT.RESULTS$searchTaxon %in% map_spp_list , ]
map_spp = unique(MAXENT.RESULTS$searchTaxon)
length(map_spp);setdiff(sort(map_spp_list), sort(map_spp))
## Then make a list of all the directories containing the individual GCM rasters. This is used for combining the rasters
SDM.RESULTS.DIR <- map_spp %>%
## Pipe the list into lapply
lapply(function(taxa) {
## Create the character string...
m <- sprintf('%s/%s/full/', results_dir, taxa) ## path.backwards.sel
m
}) %>%
## Bind the list together
c() %>% unlist()
length(SDM.RESULTS.DIR)
## Change the taxa column
MAXENT.RESULTS$searchTaxon = gsub("_", " ", MAXENT.RESULTS$searchTaxon)
MAXENT.RESULTS$results_dir = SDM.RESULTS.DIR
##
if(save_data == TRUE) {
## If saving, save
saveRDS(MAXENT.RESULTS, paste0(data_path, 'MAXENT_RESULTS_', save_run, '.rds'))
write_csv(MAXENT.RESULTS, paste0(data_path, 'MAXENT_RESULTS_', save_run, '.csv'))
return(MAXENT.RESULTS)
} else {
## Or return to the global environment
message('Skipping file saving - not many taxa analysed')
return(MAXENT.RESULTS)
}
}
#########################################################################################################################
#################################################### TBC ##############################################################
#########################################################################################################################