R/predImportTrainModels.r

Defines functions predImportTrainModels

Documented in predImportTrainModels

#' Train SDMs across multiple iterations of a scenario
#'
#' This function trains species distribution models on simulate data. Typical implementation is to use \code{predImportMakeData} to create simulated data sets, then \code{predImportTrainModels} to train SDMs on those data sets, then \code{predImportEval} to evaluate the models.
#' @param simDir Character, path name of directory in which scenario data files are saved.
#' @param modelDir Character, path name of directory in which model files are saved. Depending on whether multivariate, reduced, and/or univariate models are trained, inside this folder will be subfolders named "multivariate", "reduced", and/or "univariate" followed by the name of the algorithm (e.g., "multivariate brt").
#' @param vars Character vector, names of variables to use in model training. These should match the names in the \code{geography} argument supplied to the \code{predImportMakeData} function. See \code{\link[enmSdmPredImport]{genesis}} function for more details on \code{geography}.
#' @param algos Character list of model algorithms to implement. Options include \code{omniscient}, \code{brt} (boosted regression trees), \code{gam} (generalized additive models), \code{glm} (generalized linear models), \code{maxent} (Maxent, using version 3.3.3k or before), \code{maxnet} (Maxent, version 3.4 or higher), or \code{rf} (random forests).
#' @param type Character, type of models to train. Options include \code{multivariate} (use all variables in \code{vars}, \code{reduced} (a series of models, each using all but one variable in \code{vars}), and/or \code{univariate} (a series of models, one per variable in \code{vars}).
#' @param iters Vector of positive integers, data iterations to train models for.
#' @param numBg Positive integer, vector of positive integers, or \code{NULL}. This is the number of background sites used to train the model. If this is \code{NULL} (default), then the number of background sites will be equal to the number of sites available in the "sim" object created by \code{predImportMakeData}. If this is a single integer, then the background sites used for training will be taken from the first \code{numBg} sites available in the "sim" object. If this is a vector, then it must be the same length as \code{algos}, and each algorithm will then be presented with the respective number of background sites. If the latter, the vector must have names that match the algorithm(s) being used.
#' @param fileFlag Either \code{NULL} or a character string. If a character string then this is included in the simulated data file name and each model file name. If \code{NULL} (default), nothing is added, so file names will be as "model XXX.RData". If a character string, then the file name will be as "ALGORITHM FLAG model XXX.RData" where "XXX" is the iteration number, "FLAG" the string in \code{fileFlag}, and "ALGORITHM" the model algorithm name.
#' @param overwrite Logical, if \code{TRUE} then overwrite existing model results files. Default is \code{FALSE}.
#' @param tempDir Character, path of temporary directory. Used to store ancillary modeling files generated by Maxent 3.3.3k and earlier. Not used for any other modeling algorithm. Maxent can generate \emph{a lot} of these files which eventually fill up a disc. By specifying this folder you can have some control over where they are saved and thus if they fill up a hard drive (e.g., send temp files to a fast secondary drive with lots of space).
#' @param verbose Numeric, if 0 then show minimal output, 1 more output, 2 even more, >2 all of it.
#' @param ... Other arguments to pass to "train~~~" functions in the \pkg{enmSdm} package.
#' @return Nothing (writes models to disc).
#' @seealso \code{\link[enmSdmPredImport]{predImportMakeData}}, \code{\link[enmSdmPredImport]{predImportEval}}
#' @export

predImportTrainModels <- function(
	simDir,
	modelDir,
	vars,
	algos=c('omniscient', 'bioclim', 'brt', 'gam', 'glm', 'maxent', 'maxnet', 'rf'),
	type=c('multivariate', 'reduced', 'univariate'),
	iters=1:100,
	numBg=NULL,
	fileFlag=NULL,
	overwrite=FALSE,
	tempDir=raster::rasterOptions()$tmpdir,
	verbose=1,
	...
) {

	# file prefix
	fileAppendStartSpace <- if (!is.null(fileFlag)) { paste0(' ', fileFlag) } else { '' }
	fileAppendEndSpace <- if (!is.null(fileFlag)) { paste0(fileFlag, ' ') } else { '' }

	### for each SDM algorithm
	##########################
	
	for (algo in algos) {

		if (verbose >= 0) omnibus::say(date(), ' | Modeling with ', toupper(algo), post=0)

		thisNumBg <- if (is.null(numBg)) {
			NULL
		} else if (length(numBg) == 1) {
			numBg
		} else {
			numBg[[algo]]
		}
	
		# by ITERATION
		for (iter in iters) {

			if (verbose==1) {
				omnibus::say('| ', iter, post=0)
			} else if (verbose > 1) {
				omnibus::say(date(), ' | ', toupper(algo), ' | iteration ', iter, post=0, pre=1)
			}

			### load training/test data
			###########################
			
			load(paste0(simDir, '/', fileAppendEndSpace, 'sim ', omnibus::prefix(iter, 4), '.Rdata'))
			
			# select background sites
			if (is.null(thisNumBg)) thisNumBg <- sim$stats$numBg
			
			trainPres <- sim$trainDataPresBg[sim$trainDataPresBg$presBg == 1, c('presBg', vars)]
			trainBg <- sim$trainDataPresBg[sim$trainDataPresBg$presBg == 0, c('presBg', vars)]
			
			if (thisNumBg > nrow(trainBg)) stop('The "sim" object does not contain the desired number of training background sites (see argument "numBg").')
			
			trainBg <- trainBg[1:thisNumBg, ]
			trainDataPresBg <- rbind(trainPres, trainBg)
			
			### train MULTIVARIATE models
			#############################
			if ('multivariate' %in% type) {
				
				if (verbose > 0) { omnibus::say('multi', post=0) }
				
				# if overwriting models is OK, OR if model doesn't exist
				fileMissing <- !file.exists(paste0(modelDir, '/multivariate ', algo, '/', algo, ' ', fileAppendEndSpace, 'model ', omnibus::prefix(iter, 4), '.RData'))
				
				if (overwrite | fileMissing) {
					
					if (algo=='omniscient') {

						out <- response
						attr(out, 'modelType') <- 'full'

					} else if (algo=='bioclim') {
					
						bioclimData <- trainDataPresBg[trainDataPresBg$presBg == 1, vars, drop=FALSE]
						out <- dismo::bioclim(bioclimData, ...)

					} else if (algo=='glm') {
					
						out <- enmSdm::trainGlm(
							data=trainDataPresBg,
							resp='presBg',
							preds=vars,
							construct=FALSE,
							verbose=verbose > 2,
							...
						)

					} else if (algo=='maxent') {
					
						out <- enmSdm::trainMaxEnt(
							data=trainDataPresBg,
							resp='presBg',
							preds=vars,
							scratchDir=tempDir,
							verbose=(verbose > 2),
							...
						)

					} else if (algo=='maxnet') {
					
						out <- enmSdm::trainMaxNet(
							data=trainDataPresBg,
							resp='presBg',
							preds=vars,
							verbose=(verbose > 2),
							...
						)

					} else if (algo=='brt') {

						out <- enmSdm::trainBrt(
							data=trainDataPresBg,
							resp='presBg',
							preds=vars,
							w=TRUE,
							verbose=(verbose > 2),
							...
						)
					
					} else if (algo=='gam') {
			
						out <- enmSdm::trainGam(
							data=trainDataPresBg,
							resp='presBg',
							preds=vars,
							construct=FALSE,
							verbose=(verbose > 2),
							...
						)
						
					} else if (algo=='rf') {
			
						trainDataPresBg$presBg <- as.factor(trainDataPresBg$presBg)

						out <- enmSdm::trainRf(
							data=trainDataPresBg,
							resp='presBg',
							preds=vars,
							importance=TRUE,
							verbose=(verbose > 2),
							...
						)
						
					}

					model <- if (is.null(out)) {
						FALSE
					} else {
						out
					}

					if (!(algo %in% c('omniscient', 'maxent', 'bioclim'))) {
						if (!is.na(model)) model$stats$numTrainBg <- thisNumBg
					}

					omnibus::dirCreate(modelDir, '/multivariate ', algo)
					fileName <- paste0(modelDir, '/multivariate ', algo, '/', algo, ' ', fileAppendEndSpace, 'model ', omnibus::prefix(iter, 4), '.RData')
					save(model, file=fileName)
					rm(model); gc()

				} # if overwriting models OK OR model doesn't exist
				
				if (verbose > 0) omnibus::say('\U2713', post=0)
			
			} # if wanting multivariate models
			
			### train REDUCED models
			########################
			if ('reduced' %in% type) {
				
				if (length(vars) > 2) {
				
					if (verbose > 0) { omnibus::say('reduced', post=0) }
						
					# if overwriting models OK OR model doesn't exist
					fileMissing <- !file.exists(paste0(modelDir, '/reduced ', algo, '/', algo, fileAppendStartSpace, ' model ', omnibus::prefix(iter, 4), '.Rdata'))
					if (overwrite | fileMissing) {
						
						model <- list()

						# for EACH variable
						for (count in seq_along(vars)) {

							reducedVars <- vars[-count]
							reducedTrainData <- trainDataPresBg[ , reducedVars, drop=FALSE]
							
							if (algo=='omniscient') {
							
								out <- response
								attr(out, 'modelType') <- 'reduced'
								attr(out, 'reducedSans') <- names(sim$geography)[count]
							
							} else if (algo=='bioclim') {
							
								bioclimData <- reducedTrainData[reducedTrainData$presBg == 1, reducedVars, drop=FALSE]
								out <- dismo::bioclim(bioclimData, ...)

							} else if (algo=='glm') {
							
								out <- enmSdm::trainGlm(
									data=reducedTrainData,
									resp='presBg',
									preds=reducedVars,
									construct=FALSE,
									verbose=verbose > 2,
									...
								)

							} else if (algo=='maxent') {
			
								out <- enmSdm::trainMaxEnt(
									data=reducedTrainData,
									resp='presBg',
									preds=reducedVars,
									scratchDir=tempDir,
									verbose=(verbose > 2),
									...
								)
								
							} else if (algo=='maxnet') {
							
								out <- enmSdm::trainMaxNet(
									data=reducedTrainData,
									resp='presBg',
									preds=reducedVars,
									verbose=(verbose > 2),
									...
								)

							} else if (algo=='brt') {
						
								out <- enmSdm::trainBrt(
									data=reducedTrainData,
									resp='presBg',
									preds=reducedVars,
									verbose=(verbose > 2),
									...
								)
							
							} else if (algo=='gam') {

								out <- enmSdm::trainGam(
									data=reducedTrainData,
									resp='presBg',
									preds=reducedVars,
									construct=FALSE,
									verbose=(verbose > 2),
									...
								)
							
							} else if (algo=='rf') {

								reducedTrainData$presBg <- as.factor(reducedTrainData$presBg)
							
								out <- enmSdm::trainRf(
									data=reducedTrainData,
									resp='presBg',
									preds=reducedVars,
									importance=TRUE,
									verbose=(verbose > 2),
									...
								)
							
							}
						
							model[[count]] <- if (!is.null(out)) {
								out
							} else {
								FALSE
							}
							
							names(model)[[count]] <- paste0('sans', vars[count])
							
							if (!(algo %in% c('omniscient', 'maxent', 'bioclim'))) {
								if (!is.na(model[[count]])) model[[count]]$stats$numTrainBg <- thisNumBg
							}

						} # next reduced model
					
						omnibus::dirCreate(modelDir, '/reduced ', algo)
						fileName <- paste0(modelDir, '/reduced ', algo, '/', algo, fileAppendStartSpace, ' model ', omnibus::prefix(iter, 4), '.Rdata')
						save(model, file=fileName)
						rm(model); gc()
						
					} # if overwriting models OK OR model doesn't exist

				} # if enough layers to do reduced models
			
				if (verbose > 0) omnibus::say('\U2713', post=0)
			
			} # if wanting reduced models
			
			### train UNIVARIATE models
			###########################
			if ('univariate' %in% type) {
				
				if (verbose > 0) { omnibus::say('univar', post=0) }
				
				# if overwriting models OK OR model doesn't exist
				if (overwrite | !file.exists(paste0(modelDir, '/univariate ', algo, '/', algo, ' ', fileAppendEndSpace, 'model ', omnibus::prefix(iter, 4), '.RData'))) {

					model <- list()
					
					for (count in seq_along(vars)) {
					
						if (verbose > 1) { omnibus::say(vars[count], post=0) }
					
						univarVar <- vars[count]
						univarTrainData <- trainDataPresBg[ , c('presBg', univarVar)]

						# omniscient model
						if (algo=='omniscient') {
						
							out <- response
							attr(out, 'modelType') <- 'univariate'
							attr(out, 'univarWith') <- names(sim$geography)[count]

						# BIOCLIM
						} else if (algo=='bioclim') {
						
							bioclimData <- univarTrainData[univarTrainData$presBg == 1, univarVar, drop=FALSE]
							out <- dismo::bioclim(bioclimData, ...)

						# GLM
						} else if (algo=='glm') {
						
							out <- enmSdm::trainGlm(
								data=univarTrainData,
								resp='presBg',
								preds=univarVar,
								construct=FALSE,
								select=TRUE,
								verbose=verbose > 2,
								...
							)
								
						# maxent
						} else if (algo=='maxent') {

							out <- enmSdm::trainMaxEnt(
								data=univarTrainData,
								resp='presBg',
								preds=univarVar,
								scratchDir=tempDir,
								verbose=(verbose > 2),
								...
							)

						# maxnet
						} else if (algo=='maxnet') {
						
							out <- enmSdm::trainMaxNet(
								data=univarTrainData,
								resp='presBg',
								preds=univarVar,
								verbose=(verbose > 2),
								...
							)

						# BRTs
						} else if (algo=='brt') {

							out <- enmSdm::trainBrt(
								data=univarTrainData,
								resp='presBg',
								preds=univarVar,
								w=TRUE,
								verbose=(verbose > 2),
								...
							)

						### GAMs
						} else if (algo=='gam') {

							out <- enmSdm::trainGam(
								data=univarTrainData,
								resp='presBg',
								preds=univarVar,
								construct=FALSE,
								select=FALSE,
								verbose=(verbose > 2),
								...
							)

						### RFs
						} else if (algo == 'rf') {

							univarTrainData$presBg <- as.factor(univarTrainData$presBg)
						
							out <- enmSdm::trainRf(
								data=univarTrainData,
								resp='presBg',
								preds=univarVar,
								importance=FALSE,
								verbose=(verbose > 2),
								...
							)

						}
						
						model[[count]] <- if (!is.null(out)) {
							out
						} else {
							FALSE
						}
						
						names(model)[[count]] <- paste0('only', univarVar)

						if (!(algo %in% c('omniscient', 'maxent', 'bioclim'))) {
							if (!is.na(model[[count]])) model[[count]]$stats$numTrainBg <- thisNumBg
						}

					} # next univariate model

					omnibus::dirCreate(modelDir, '/univariate ', algo)
					fileName <- paste0(modelDir, '/univariate ', algo, '/', algo, ' ', fileAppendEndSpace, 'model ', omnibus::prefix(iter, 4), '.RData')
					save(model, file=fileName)
					rm(model); gc()
					
				} # if overwriting models OK OR model doesn't exist
					
				if (verbose > 0) omnibus::say('\U2713', post=0)
					
			} # if wanting univariate models
			
		} # next iteration

		omnibus::say('')
		
	} # next algorithm

}
adamlilith/enmSdmPredImport documentation built on Dec. 31, 2022, 5:40 p.m.