#' @title Make a species distribution model
#'
#' @description Returns the model, the performance metrics, and/or distribution maps depending on arguments.
#'
#' @param data \code{SpatialPointsDataFrame} containing response and predictors
#' @param method SDM methos used: "gam", "rf", "gbm", "max", or "gbm.step". See details for details
#' @param responsetype Type of the response: "count", "continuous", or "presence". Presence denotes bimodal responses [0|1]
#' @param response Column name of response in data argument
#' @param predictors Column names of predictors to use in data argument
#' @param secondary Column name in data. Use this to calculate model performance metrics from instead of response. Default is NULL.
#' @param enviStack \code{RasterStack} of predictors. Used to calculate SD map
#' @param enviPix \code{SpatialPixelsDataFrame} of predictors. enviPix<-as(enviStack,"SpatialPixelsDataFrame"). Only for performance.
#' @param seed Integer. For reproduceability.
#' @param aggregated logical or \code{NULL} Default is \code{NULL}. This is for ensemble calculations and added to metrics in case of not being \code{NULL}
#' @param pseudoabsence logical or \code{NULL} Default is \code{NULL}. Same as above
#' @param gbm.trees gbm.trees param for dismo::gbm
#' @param maxargs argument to pass tp maxent
#' @param model \code{logical} return ONLY the model. Default is \code{FALSE}. Overrides args below.
#' @param prediction return ONLY the prediction. Default is \code{FALSE}. Overrides args below.
#' @param moran add spatial autocorrelation metric to model metric. Default is \code{FALSE}. See details.
#' @param flat return model performance metrics only (as \code{data.frame})
#' @param rast return \code{list} of metric and raster
#' @param rast.occ additionally adds bimodal occurence map. see details for threshold calculation.
#' @param ... ellipsis is used to pass arguments to subsequent functions like \code{threshold.def} and \code{moRange}. See \code{\link{metrics}} \code{\link{moranii}} for details
#' @return A \code{model}, or a \code{data.frame} or a \code{list} depending on arguments
#' @details SDM methos used are "gam", "rf", "gbm", "max", or "gbm.step".
#' If you want spatial autocorrelation metrics you probably need to pass additional arguments to \code{\link{moranii}}. Note that calculations may take very long depending on number of points and parametrization #FIXME
#' @examples #FIXME
#'
model <- function(data,
method,
responsetype,
response,
predictors,
secondary=NULL,
enviStack,
enviPix,
seed=NULL,
aggregated=NULL,
pseudoabsence=NULL,
gbm.trees=2000,
maxargs=c("outputformat=logistic", "defaultprevalence=0.5"),
model=FALSE,
prediction=FALSE,
flat=FALSE,
moran=FALSE,
rast=TRUE,
rast.occ=TRUE,
...)
{
# Metrics and prediction map of Species Distribution Model specified as list
#
# Args:
# data [<SpatialPointsDataFrame>]
# method [<character>] modeling method
# responsetype [<character>]
# response [<character>] column name of data
# predictors [<character>] column names of data
# secondary [<character>] column name of data; compare with prediction for cor() in metrics()
# enviStack [<RasterStack>]
# enviPix [<SpatialPixelsDataFrame>]
# seed [<numeric>] seed
# aggregated [<logical>] is aggregated
# pseudoabsence [<logical>] absences are pseudoabsences
# gbm.trees [<numeric>]
# OUTPUT OPTIONS
# model [<logical>] output model only (overides options below)
# prediction [<logical>] output prediction only (overides options below)
# moran [<logical>] add spatial autocorrelation metric to model metric
# flat [<logical>] output model metric as data.frame (overides options below)
# rast [<logical>] output list of metric and raster
# rast.occ [<logical>] raster becomes list of of rasters of model and absence/presence prediction
# ... arguments for moranii() & makefn.free.model (moParams=c("response","residuals"),moRange=list(c(0,0.5),c(0.5,1)),check.names=TRUE)
#dots <- list(...)
if(is.null(seed)){
seed <- as.integer(runif(1)*100000)
}
if(!is.null(secondary)){
secondary <- data[[secondary]]
}
Model <- makefn.free.model(method=method, response=response, responsetype=responsetype, predictors=predictors, Predictor=enviPix, data.names=names(data), gbm.trees=gbm.trees, ...)
Predict <-makefn.free.predict(responsetype=responsetype, method=method, environ=enviStack, maxargs=maxargs)
set.seed(seed)
fit <- Model(data=data)
if(model){
return(fit)
}
P <- Predict(fit,data)
if(prediction){
data$prediction<-P
data$residuals<-data[[response]]-P
return(data)
}
M <- metrics(response=data[[response]], prediction=P, secondary=secondary, ...)
if(moran){
dat <- data[,response]
dat$response <- data[[response]]
dat$residuals <- data[[response]]-P
M <- c(M,moranii(dat, ...))
}
metric <- data.frame(list(
as.list(c(response=response, method=method, responsetype=responsetype, predictors=paste0(predictors,collapse="."))),
as.list(c(aggregated=ifelse(class(aggregated)=="logical",aggregated,FALSE),pseudoabsence=ifelse(class(pseudoabsence)=="logical",pseudoabsence,FALSE))),
as.list(M)),stringsAsFactors=FALSE)
if(flat){
return(metric)
}
if(rast){
enviPix$full <- Predict(fit,enviPix)
full <- raster(enviPix,match("full",names(enviPix)))
if(rast.occ){
t<-M[["threshold"]]
ap <- full
ap@data@values<-presence.absence(ap@data@values,t=t)
full <- list(full=full,ap=ap)
}
} else {
full <- NULL
}
return(list(metric=metric,raster=full))
}
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#
# crossvalid :: cross validation routine
#
#' @title Cross validation routine for species distribution models
#'
#' @description repeated k-fold cross-validation. Calculate model preformance metrics, disribution, standard deviation and occurence maps.
#'
#' @param Ncv Number of cv repititions.
#' @param Kfold Number of folds. Usually take 10 or 5.
#' @param data \code{SpatialPointsDataFrame} containing response and predictors
#' @param method SDM methos used: "gam", "rf", "gbm", "max", or "gbm.step". See details for details
#' @param responsetype Type of the response: "count", "continuous", or "presence". Presence denotes bimodal responses [0|1]
#' @param response Column name of response in data argument
#' @param predictors Column names of predictors to use in data argument
#' @param secondary Column name in data. Use this to calculate model performance metrics from instead of response. Default is NULL.
#' @param strata criterion for gereation of folds
#' @param enviStack \code{RasterStack} of predictors. Used to calculate SD map
#' @param enviPix \code{SpatialPixelsDataFrame} of predictors. enviPix<-as(enviStack,"SpatialPixelsDataFrame"). Only for performance.
#' @param seed Integer. You probably want reproduceability. Note that Maxent's pseudoabsence generation can't be seeded this way--so expect those results to vary
#' @param aggregated logical or \code{NULL} Default is \code{NULL}. This is for ensemble calculations and added to metrics in case of not being \code{NULL}
#' @param pseudoabsence logical or \code{NULL} Default is \code{NULL}. Same as above
#' @param gbm.trees gbm.trees param for dismo::gbm
#' @param maxargs argument to pass tp maxent
#' @param flat return model performance metrics only (as \code{data.frame})
#' @param rast return \code{list} of metric and raster
#' @param ... ellipsis is used to pass arguments to subsequent functions like \code{threshold.def}. See \code{\link{metrics}} for details
#
crossvalid <- function(Ncv,
Kfold,
data,
method,
responsetype,
response,
predictors,
secondary=NULL,
strata,
enviStack,
enviPix,
seed,
check.names=TRUE,
aggregated=NULL,
pseudoabsence=NULL,
gbm.trees=2000,
maxargs=c("outputformat=logistic", "defaultprevalence=0.5"),
rast=TRUE,
flat=FALSE,
...)
{
# Metrics and prediction map of Species Distribution Model specified as list
#
# Args:
# Ncv [<numeric>] Number of cross-validation runs
# Kfold [<numeric>] Number of folds
# data [<SpatialPointsDataFrame>]
# method [<character>] modeling method
# responsetype [<character>]
# response [<character>] column name of data
# predictors [<character>] column names of data
# secondary [<character>] column name of data; compare with prediction for cor() in metrics()
# strata [<character>] column name of data; stratum used to create folds: a vector or factor with sub-groups
# enviStack [<RasterStack>]
# enviPix [<SpatialPixelsDataFrame>]
# seed [<numeric>] seed
# aggregated [<logical>] is aggregated
# pseudoabsence [<logical>] absences are pseudoabsences
# gbm.trees [<numeric>]
# rast [<logical>] output raster
# flat [<logical>] no list, just metric as data.frame
# ... arguments for makefn.free.model
sec<-NULL
if(flat){
rast<-FALSE
}
if(is.null(strata)){
strata<-"presence"
if("presence" %in% names(data)){
stop("presence exists in data - use strata=\"presence\"")
} else {
data$presence<-presence.absence(data[[response]])
}
}
if(rast){
rbin<-raster(enviStack,layer=1)*0
}
tmpfiles<-vector(mode = "character",length = Ncv)
set.seed(seed)
seed<-as.integer(runif(Ncv,min=10000,max=99999)*1000)
Model<-makefn.free.model(method=method, response=response, responsetype=responsetype, predictors=predictors, Predictor=enviPix, data.names=names(data), gbm.trees=gbm.trees, ...)
Predict<-makefn.free.predict(responsetype=responsetype, method=method, environ=enviStack, maxargs=maxargs)
for(n in 1:Ncv){
set.seed(seed[n])
group <- kfold(x=data,k=Kfold,by=data[[strata]])
for(k in 1:Kfold){
#cat(n," ",k,"\n")
test <- data[group == k,]
train <- data[group != k,]
set.seed(seed[n])
fit<-Model(data=train)
P<-Predict(fit,test)
if(!is.null(secondary)){
sec <- data[[secondary]][group == k]
}
M<-metrics(response=test[[response]],prediction=P,secondary=sec, ...)
metric<-data.frame(list(
as.list(c(response=response,method=method,responsetype=responsetype,predictors=paste0(predictors,collapse="."))),
as.list(c(aggregated=ifelse(class(aggregated)=="logical",aggregated,FALSE),pseudoabsence=ifelse(class(pseudoabsence)=="logical",pseudoabsence,FALSE))),
as.list(M)),stringsAsFactors=FALSE)
if (n+k<3){ # init
result<-allocateDf(metric,Ncv*Kfold)
if(rast){
Ra<-stack(replicate(Kfold,rbin))
}
}
result[(n-1)*Kfold+k,]<-metric[1,]
if(rast){
t<-M[["threshold"]]
enviPix$fit<-Predict(fit,enviPix)
ro<-ra<-raster(enviPix,layer=match("fit",names(enviPix)))
ro@data@values<-presence.absence(ro@data@values,t=t)
rbin<-rbin+ro #certainty
Ra[[k]]<-ra
}
}
if(rast){
tmpfiles[n]<-tempfile(pattern="tmp",fileext = ".bin")
save(Ra,file=tmpfiles[n])
if(n==1){
rmean<-sum(Ra)/(Ncv*Kfold)
} else {
rmean<-rmean+sum(Ra)/(Ncv*Kfold)
}
}
}
if(rast){
# raster
rbin<-(rbin/(Ncv*Kfold/2))-1
sd<-sd.raster(N=Ncv,K=Kfold,tmpfiles=tmpfiles,rmean=rmean)
names(rbin)<-"rbin"
names(rmean)<-"rmean"
names(sd)<-"sd"
} else {
rbin <- rmean <- sd <- NULL
}
if(flat){
return(result)
} else {
return(list(metric.agg=aggregate.results(result),metric=result,rbin=rbin,rmean=rmean,sd=sd))
}
}
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#
# nill models
#
# Metrics of null model cross validation
#
# Args:
# data [<SpatialPointsDataFrame>]
# method [<character>] modeling method
# predictors [<character>] column names of data
# enviStack [<RasterStack>]
# enviPix [<SpatialPixelsDataFrame>]
# Ncv [<numeric>] Number of cross-validation runs
# Ncv [<numeric>] Number of cross-validation runs
# Kfold [<numeric>] Number of folds
# seed [<numeric>] seed
# prob [<numeric> or <character>] if <char> the prevalence of data[[prob]] is used as prob
nullModel <- function (data,
method="rf",
predictors,
enviStack,
enviPix,
N0=100,
Ncv=10,
Kfold=5,
seed,
prob=0.5) {
set.seed(seed)
Seed<-as.integer(runif(N0,min=10000,max=99999)*1000)
if (class(prob)=="character"){
prob<-sum(presence.absence(data[[prob]]))/nrow(data)
}
for(i in 1:N0){
set.seed(Seed[i])
data$presence<-rbinom(nrow(data),1,prob)
m <-crossvalid(Ncv=Ncv,Kfold=Kfold,data=data,method=method,responsetype="presence",response="presence",predictors=predictors,strata="presence",secondary="presence",enviStack=enviStack,enviPix=enviPix,seed=Seed[i],maxargs=c("outputformat=logistic", "defaultprevalence=0.5"),gbm.trees=2000,rast=FALSE,flat=TRUE)
cat <- c(rep(FALSE,(i-1)*Ncv*Kfold),rep(TRUE,Ncv*Kfold),rep(FALSE,Ncv*Kfold*(N0-i)))
if (i<2){ # init
nullmodelmetric<-allocateDf(m,Ncv*Kfold*N0)
}
nullmodelmetric[cat,]<-m
}
return(aggregate.results(nullmodelmetric))
}
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#
moranii <- function(sp,moParams=c("response","residuals"),moRange=list(c(0,0.5),c(0.5,1)), ...)
{
# moran() calculates spatial autocorrelation (MoranII) of response and residuals for given ranges
# only for lonlat projs. FIXME
# Args:
# sp: spatialPoints or SPDF
# response,residuals [<numeric>] input for spdep::sp.correlogram
# range [<list>] range as for spdep::dnearneigh
#
# Uses:
# clean.null.list()
#browser()
library(spdep)
library(rgdal)
stopifnot(class(sp)=="SpatialPointsDataFrame")
result<-NULL
p.adj.method="holm"
spcor<-function(){
m <- sp.correlogram(nb, as.vector(sp[[P]]), order=2, method="I", zero.policy=TRUE)
res <- as.matrix(m$res)
ZI <- (res[, 1] - res[, 2])/sqrt(res[, 3])
pv <- p.adjust(2 * pnorm(abs(ZI), lower.tail = FALSE), method = p.adj.method)
mo<-res[1,1]
p<-pv[1]/2
return(c(mo,p))
}
for(R in moRange){
nb <- dnearneigh(sp, R[1], R[2])
for(P in moParams){
m <- tryCatch(spcor(),error=function(x){return(c(NA,NA))})
result <- c(result, m[1])
names(result)[length(result)] <- paste("M",P,paste(R,collapse="_"),sep=".")
result <- c(result, m[2])
names(result)[length(result)] <- paste("p",P,paste(R,collapse="_"),sep=".")
}
}
# if(any(sapply(result,is.na))){stop("NA in metrics")}
return(result)
}
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#
# code testing
if(FALSE){
nb<-dnearneigh(data, 0, 50)
sp.correlogram(nb, as.vector(data$density), order=2, method="I")
print(subs.correlog, p.adj.method="holm")
#quartz(title="Correlogram of substrate density")
plot(subs.correlog)
# check catchment area of nb
nn<-sapply(nb,FUN=function(x){length(x)}) # number of neighbors
n<-match(max(nn),nn) # point with most neighbors
load(file=file.path(WD,"data.geo/v.l.europeS.shape.bin"))
plot(data,pch=20,cex=0.2)
plot(data[n,],pch=20,cex=1,col="green",add=T)
plot(data[nb[n][[1]],],pch=20,cex=0.2,col="red",add=T)
plot(europeS.shape,add=T)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.