#' Select the k best models from a model-comparison table
#'
#' @param tab_model data.frame with one row per fitted model; must contain an
#'   'index' column, plus 'looic' (if \code{use_loo = TRUE}) or 'AIC'.
#' @param k Number of models to keep; capped at \code{nrow(tab_model)} so no
#'   NA indices are returned when fewer than k models are available.
#' @param use_loo If TRUE rank by looic, otherwise by AIC. Default FALSE.
#'
#' @return Vector of the 'index' values of the (at most) k best models,
#'   ordered from best to worst.
#' @export
get_k_best_models <- function(tab_model, k = 5, use_loo = FALSE) {
  if (use_loo) {
    writeLines("\t* Using leave-one out for model selection")
    tab_model <- tab_model[order(tab_model$looic, decreasing = FALSE), ]
  } else {
    writeLines("\t* Using AIC for model selection")
    tab_model <- tab_model[order(tab_model$AIC, decreasing = FALSE), ]
  }
  ## cap at the number of available models to avoid NA rows when k > nrow
  tab_model[seq_len(min(k, nrow(tab_model))), "index"]
}
#' Fit all possible models with the covariates given.
#'
#' @param distFit \code{distfit} object. One of \code{plot_detection} output or directly an output of \code{ds}
#' @param segdata_obs segdata data.frame with observation added with \code{ajout_obs}.
#' @param obsdata Observation data.frame prepared with \code{prepare_data_obs}.
#' @param response Response variable to choose between "ind" number of individuals or "obs" number of observations.
#' @param predictors vector containing all covariates as character string.
#' @param likelihood Likelihood must be one of "negbin", "poisson" or "tweedie". Default is "negbin".
#' @param esw Value of \emph{effective-strip width}.
#' @param max_cor Maximum correlation threshold between two covariates tolerated.
#' @param nb_max_pred Maximum number of covariates in a model.
#' @param complexity k argument in smoothers.
#' @param smooth_xy Whether to include a bivariate smooth on x and y in the intercept:
#' for prediction inside the prospected polygon it should be set to TRUE to obtain stable estimates;
#' for prediction outside, it MUST be set to FALSE to keep extrapolation under control.
#' @param k Number of models to return for inference.
#' @param splines_by Interaction with splines given by one variable of segdata.
#' @param weighted Whether to compute weights or not.
#' @param random variable of \code{segdata_obs} to include random effect.
#' @param soap list to pass in order to use a soap-film smooth: must be prepared outside this function.
#' @param use_loo Model selection with leave-one-out cross validation if TRUE, if FALSE it uses AIC to select models.
#'
#' @note By default, use cubic B-splines with shrinkage (bs = 'cs').
#'
#' @return This function return a list containing:
#' \enumerate{
#' \item all_fits_binded : \code{data.frame} Containing all fitting infos on models.
#' \item best_models : \code{list} k best models with standardized value.
#' \item best_models4plotting : \code{list} k best models with raw values (for plotting).
#' }
#'
#' @importFrom arm rescale
#' @importFrom utils combn
#' @importFrom mgcv nb tw
#' @importFrom dsm dsm
#' @importFrom mvtnorm rmvnorm
#' @importFrom loo loo.matrix stacking_weights
#' @importFrom tweedie dtweedie
#' @export
fit_all_dsm <- function(distFit = NULL,
segdata_obs,
obsdata,
response = "ind",
predictors,
likelihood = "negbin",
esw = NULL,
max_cor = 0.5,
nb_max_pred = 3,
complexity = 4,
smooth_xy = TRUE,
k = 5,
splines_by = NULL,
weighted = FALSE,
random = NULL, # a character vector of random-effect column names
soap = list(xt = NULL, knots = NULL), # a list with xt = list(bnd = ) and knots
use_loo = FALSE # model selection with leave-one-out
) {
  ## likelihood must be one of "negbin", "poisson" or "tweedie" (default "negbin")
  ## smooth_xy: include (TRUE) or not (FALSE) a bivariate smooth on x and y
  ##   (TRUE for prediction inside the prospected polygon; FALSE to keep
  ##    extrapolation under control outside of it)
  ## soap: list to use a soap-film smooth, prepared outside this function
  ## k: number of models to return for inference (ranked by AIC or looic)
  ## by default, use cubic B-splines with shrinkage (bs = 'cs')
  ## NOTE: this local helper shadows arm::rescale — here "rescale" means
  ## standardize to zero mean and unit standard deviation
  rescale <- function(x) { (x - mean(x)) / sd(x) }
  ## keep a copy of the raw (unstandardized) data for the plotting fits
  X <- segdata_obs
  ## prepare smooth terms, with an optional 'by' interaction
  if(is.null(splines_by)) {
    smoothers <- paste("s(", predictors, ", k = ", complexity, ", bs = 'cs')", sep = "")
  } else {
    if(!is.character(splines_by)) {
      stop("Arg. 'splines_by' must be character")
    }
    if(!any(names(segdata_obs) == splines_by)) {
      stop(paste("No column named '", splines_by, "' in dataframe 'segdata_obs'", sep = ""))
    }
    smoothers <- paste("s(", predictors, ", k = ", complexity, ", bs = 'cs', by = ", splines_by, ")", sep = "")
  }
  ## need soap?
  if(is.null(soap$xt) && is.null(soap$knots)) {
    intercept <- ifelse(smooth_xy, "~ te(longitude, latitude, bs = 'cs')", "~ 1")
    ## standardize covariates and coordinates
    segdata_obs[, c(predictors, "longitude", "latitude", "X", "Y")] <- apply(segdata_obs[, c(predictors, "longitude", "latitude", "X", "Y")], 2, rescale)
  } else {
    intercept <- ifelse(smooth_xy, "~ s(longitude, latitude, bs = 'so', xt = list(bnd = bnd))", "~ 1")
    ## do not standardize long/lat with soap (boundary/knots are on the raw scale)
    segdata_obs[, predictors] <- apply(segdata_obs[, predictors], 2, rescale)
  }
  ## include random effects: must be factors for mgcv; an extra "new_level"
  ## is appended so prediction on unseen levels remains possible
  if(!is.null(random)) {
    if(!all(random %in% names(segdata_obs))) {
      stop("Check random effect: no matching column in table 'segdata_obs'")
    }
    ## FIX: the original loop used a misspelled lenght(), shadowed the
    ## function argument 'k' with its loop index, only ever factor-converted
    ## random[1], and duplicated the s(random[1], bs = 're') term
    for(j in seq_along(random)) {
      if(!is.factor(segdata_obs[, random[j]])) {
        segdata_obs[, random[j]] <- factor(as.character(segdata_obs[, random[j]]), levels = c(unique(segdata_obs[, random[j]]), "new_level"))
        X[, random[j]] <- factor(as.character(X[, random[j]]), levels = c(unique(X[, random[j]]), "new_level"))
      }
      intercept <- paste(intercept, " + s(", random[j], ", bs = 're')", sep = "")
    }
  }
  ## all covariate combinations with at most nb_max_pred terms
  all_x <- lapply(1:nb_max_pred, combn, x = length(predictors))
  ## check whether cross-correlation needs to be evaluated
  if(nb_max_pred == 1) {
    rm_combn <- c(rep(0, length(predictors) + 1))
  } else {
    ## max absolute pairwise correlation within each combination
    rm_combn <- lapply(all_x[-1], function(mat) {
      sapply(1:ncol(mat), function(i) {
        rho <- cor(X[, predictors[mat[, i]]])
        diag(rho) <- 0
        max(abs(as.numeric(rho)))
      })
    })
    rm_combn <- c(c(rep(0, length(predictors) + 1)), unlist(rm_combn))
  }
  ## build all model formulas (intercept-only model first)
  mlist <- function(n, y, predictors) {
    paste(y,
          apply(X = combn(predictors, n), MARGIN = 2, paste, collapse = " + "),
          sep = paste(intercept, "+", sep = " ")
          )
  }
  all_mods <- c(paste("count", intercept, sep = " "),
                unlist(lapply(1:nb_max_pred, mlist, y = "count", predictors = smoothers))
                )
  ## remove combinations of variables that are too correlated
  keep <- which(rm_combn < max_cor)
  all_mods <- all_mods[keep]
  ## compute weights
  if(weighted) {
    covariable <- predictors
    if(is.null(covariable)) {
      stop("covariable must not be NULL if weighted is TRUE")
    }
    if(!all(covariable %in% colnames(segdata_obs))) {
      stop(paste0(covariable[!(covariable %in% colnames(segdata_obs))], " are not in segdata_obs"))
    }
    w <- lapply(all_x, function(tab) {
      sapply(1:ncol(tab), function(j) {
        make_cfact_2(calibration_data = segdata_obs,
                     test_data = segdata_obs,
                     var_name = covariable[tab[, j]],
                     percent = FALSE,
                     near_by = TRUE
                     )
      })
    })
    w <- cbind(rep(1, nrow(segdata_obs)), do.call('cbind', w))
    w <- w[, keep]
  } else {
    w <- matrix(1, nrow = nrow(segdata_obs), ncol = length(keep))
  }
  ## relabel Sample.Label to have only a unique match per segment
  X$Sample.Label <- paste(X$Sample.Label, X$Seg, sep = "_")
  segdata_obs$Sample.Label <- paste(segdata_obs$Sample.Label, segdata_obs$Seg, sep = "_")
  obsdata$Sample.Label <- paste(obsdata$Sample.Label, obsdata$Seg, sep = "_")
  ## response variable is either the nb of observations or the nb of individuals
  if(response != "ind") {
    writeLines("\t* Response variable is the number of observations")
    obsdata$size <- 1
  } else {
    writeLines("\t* Response variable is the number of individuals")
  }
  ## detection: either a fitted detection function or a fixed esw
  if(is.null(distFit) && is.null(esw)) {
    stop("Must provide either a detection function as 'distFit', or esw")
  }
  if(!is.null(distFit)) {
    writeLines("\t* Detection function provided")
    esw <- NULL
  } else if(length(esw) == nrow(segdata_obs)) {
    writeLines("\t* esw provided for each segment")
  } else {
    esw <- esw[1]
    writeLines(paste("\t* esw set to", esw, sep = " "))
  }
  ## fit one model (by its position in all_mods); if loo = TRUE, return a
  ## PSIS-loo object instead of the fit / summary table
  my_dsm_fct <- function(x, tab = TRUE, segdata_obs, loo = FALSE, bnd = soap$xt, knots = soap$knots) {
    model <- dsm(as.formula(all_mods[x]),
                 ddf.obj = distFit,
                 strip.width = esw,
                 segment.data = segdata_obs,
                 observation.data = obsdata,
                 family = switch(likelihood,
                                 negbin = nb(),
                                 poisson = poisson(),
                                 tweedie = tw()
                                 ),
                 method = "REML",
                 weights = w[, x],
                 knots = knots
                 )
    ### leave-one-out cross-validation
    if(loo) {
      # if loo, do not return tab
      tab <- FALSE
      # approximate the posterior distribution with a MV normal
      beta <- mvtnorm::rmvnorm(1e3, mean = model$coefficients, sigma = model$Vp)
      Z <- predict(model,
                   newdata = model$model,
                   off.set = model$offset,
                   type = "lpmatrix"
                   )
      mu <- exp(as.matrix(beta %*% t(Z)))
      # response variable from fitted dsm object
      y <- model$model$count
      # log pointwise posterior density (weighted per segment)
      lppd <- switch(likelihood,
                     negbin = { apply(mu, 1, function(iter) { w[, x] * dnbinom(y, size = model$family$getTheta(trans = TRUE), mu = exp(model$offset) * iter, log = TRUE) }) },
                     poisson = { apply(mu, 1, function(iter) { w[, x] * dpois(y, lambda = exp(model$offset) * iter, log = TRUE) }) },
                     tweedie = { apply(mu, 1, function(iter) { w[, x] * log(tweedie::dtweedie(y, xi = model$family$getTheta(trans = TRUE), mu = exp(model$offset) * iter, phi = model$sig2)) }) }
                     )
      out <- loo::loo.matrix(t(lppd), save_psis = TRUE)
    } else {
      out <- model
    }
    ### store some results in a data frame
    if(tab) {
      return(
        data.frame(model = all_mods[x],
                   index = x,
                   Convergence = ifelse(model$converged, 1, 0),
                   AIC = model$aic,
                   ResDev = model$deviance,
                   NulDev = model$null.deviance,
                   ExpDev = 100 * round(1 - model$deviance/model$null.deviance, 3)
                   )
      )
    } else {
      return(out)
    }
  }
  writeLines("\t* Fitting all possible models, please wait")
  all_fits <- lapply(seq_along(all_mods), my_dsm_fct, segdata_obs = segdata_obs)
  ## collapse to a data frame (row position equals model 'index')
  all_fits <- do.call('rbind', all_fits)
  # k best looic else k best AIC
  if(use_loo) {
    ## leave-one-out cross-validation using Pareto Smoothed Importance Sampling
    writeLines("\t* Estimating loocv on all models: please wait")
    all_psis <- lapply(seq_along(all_mods), my_dsm_fct, segdata_obs = segdata_obs, loo = TRUE)
    # get looic and se_looic
    loo_ic_coefs <- do.call(rbind, lapply(all_psis, function(x) { x$estimates["looic", ] }))
    colnames(loo_ic_coefs) <- c("looic", "se_looic")
    all_fits <- cbind(all_fits, loo_ic_coefs)
    # initiate stacking_weights column
    all_fits$stacking_weights <- NA
    # all_fits filtered to the k best
    all_fits_best <- all_fits %>%
      slice_min(looic, n = k)
    # get best loo order
    index_order_best <- get_k_best_models(tab_model = all_fits_best, k = k, use_loo = TRUE)
    # estimating stacking weights
    get_elpd_loo <- do.call('cbind', lapply(index_order_best, function(l) { all_psis[[l]]$pointwise[, "elpd_loo"] }))
    loow <- as.numeric(loo::stacking_weights(get_elpd_loo))
    all_fits$stacking_weights[index_order_best] <- loow
    all_fits_best_sw <- all_fits %>%
      slice_min(looic, n = k)
    index_order_sw <- all_fits_best_sw$index
    ## refit the n-best models: on raw data (plotting) and standardized data
    best <- lapply(index_order_sw, my_dsm_fct, tab = FALSE, segdata_obs = X)
    best_std <- lapply(index_order_sw, my_dsm_fct, tab = FALSE, segdata_obs = segdata_obs)
    # order all_fits by looic
    all_fits <- all_fits %>%
      arrange(looic)
  } else {
    # all_fits filtered to the k best
    all_fits_best <- all_fits %>%
      slice_min(AIC, n = k)
    # initiate stacking_weights column
    all_fits$stacking_weights <- NA
    index_order_best <- get_k_best_models(tab_model = all_fits_best, k = k, use_loo = FALSE)
    # estimating stacking weights
    writeLines("\t* Estimating loocv on k best models: please wait")
    all_psis <- lapply(index_order_best, my_dsm_fct, segdata_obs = segdata_obs, loo = TRUE)
    # FIX: iterate over the models actually fitted (there may be fewer than k)
    get_elpd_loo <- do.call('cbind', lapply(seq_along(all_psis), function(l) { all_psis[[l]]$pointwise[, "elpd_loo"] }))
    loow <- as.numeric(loo::stacking_weights(get_elpd_loo))
    all_fits$stacking_weights[index_order_best] <- loow
    all_fits_best_sw <- all_fits %>%
      slice_min(AIC, n = k)
    index_order_sw <- all_fits_best_sw$index
    ## refit the n-best models: on raw data (plotting) and standardized data
    best <- lapply(index_order_sw, my_dsm_fct, tab = FALSE, segdata_obs = X)
    best_std <- lapply(index_order_sw, my_dsm_fct, tab = FALSE, segdata_obs = segdata_obs)
    # order all_fits by AIC
    all_fits <- all_fits %>%
      arrange(AIC)
  }
  ## wrap-up with the outputs
  return(
    list(all_fits_binded = all_fits,
         best_models = best,
         best_models4plotting = best_std # for the spline predictions
         )
  )
}