#' Fit and select an Explicit Interaction Community Model (EICM)
#'
#' Given species occurrence data and (optionally) measured environmental predictors,
#' fits and selects an EICM that models species occurrence probability as a function of
#' measured predictors, unmeasured predictors (latent variables) and direct species interactions.
#'
#' An Explicit Interaction Community Model (EICM) is a simultaneous equation linear model in which each
#' species model integrates all the other species as predictors, along with measured and latent variables.
#'
#' This is the main function for fitting EICM models, and is preferred over using \code{\link{eicm.fit}} directly.
#'
#' This function conducts the fitting and network topology selection workflow, which includes three stages:
#' 1) estimate latent variable values; 2) make preliminary estimates for species interactions;
#' 3) conduct network topology selection over a reduced model (based on the preliminary estimates).
#'
#' The selection stage is optional. If not conducted, the species interactions are estimated
#' (all or a subset according to the user-provided constraints), but not selected.
#' See \code{vignette("eicm")} for commented examples on a priori excluding interactions.
#'
#' Missing data in the response matrix is allowed.
#'
#' @inheritParams eicm.fit
#' @param rotate.latents logical. Rotate the estimated latent variable values (the values of the
#' latents at each sample) in the first step with PCA? Defaults to FALSE.
#' @param scale.latents logical. Standardize the estimated latent variable values (the values of the
#' latents at each sample) in the first step? Defaults to TRUE.
#' @param penalty the penalty applied to the number of species interactions to include, during variable selection.
#' @param theta.threshold exclude species interactions (from network selection) whose preliminary coefficient (in absolute value)
#' is lower than this value. This exclusion criterion is cumulative with the other user-defined exclusions.
#' @param latent.lambda the regularization applied to latent variables and respective coefficients
#' when estimating their values in samples.
#' @param fit.all.with.latents logical. Whether to use the previously estimated latent variables
#' when estimating the preliminary species interactions.
#' @param true.model for validation purposes only: the true model that has generated the data, to which
#' the estimated coefficients will be compared in each selection algorithm iteration.
#' @param popsize.sel the population size for the genetic algorithm, expressed as the factor to multiply
#' by the recommended minimum. Ignored if \code{do.selection=FALSE}.
#' @param n.cores the number of CPU cores to use in the variable selection stage and in the optimization.
#' @param parallel logical. Whether to use \code{\link[optimParallel]{optimParallel}} during optimizations
#' instead of \code{\link[stats]{optim}}.
#' @param do.selection logical. Conduct the variable selection stage, over species interaction network topology?
#' @param do.plots logical. Plot diagnostic and trace plots?
#' @param refit.selected logical. Refit with exact estimates the best model after network selection? Note that,
#' for performance reasons, the models fit during the network selection stage use an approximate likelihood.
#'
#' @return A \code{eicm.list} with the following components:
#' \describe{
#' \item{true.model:}{a copy of the \code{true.model} argument.}
#' \item{latents.only:}{the model with only the latent variables estimated.}
#' \item{fitted.model}{the model with only the species interactions estimated.}
#' \item{selected.model:}{the final model with all coefficients estimated, after network topology selection.
#' This is the "best" model given the selection criterion (which depends on
#' \code{regularization} and \code{penalty}.}
#' }
#' When accessing the results, remember to pick the model you want (usually, \code{selected.model}).
#' \code{\link{plot}} automatically picks \code{selected.model} or, if NULL, \code{fitted.model}.
#' @seealso \code{\link{eicm-package}}, \code{\link{eicm.fit}}, \code{\link{plot.eicm}}
#'
#' @examples
#' # refer to the vignette for a more detailed explanation
#' \donttest{
#' # This can take some time to run
#'
#' # Load the included parameterized model
#' data(truemodel)
#'
#' # make one realization of the model
#' occurrences <- predict(truemodel, nrepetitions=1)
#'
#' # Fit and select a model with 2 latent variables to be estimated and all
#' # interactions possible
#' m <- eicm(occurrences, n.latent=2, penalty=4, theta.threshold=0.5, n.cores=2)
#'
#' plot(m)
#' }
#' @export
#' @importClassesFrom GA ga
#' @useDynLib eicm, .registration = TRUE, .fixes = "SR_"
eicm <- function(occurrences, env=NULL, traits=NULL, intercept=TRUE, # data
n.latent=0, rotate.latents=FALSE, scale.latents=TRUE, # latents
forbidden=NULL, allowed=NULL, mask.sp=NULL, exclude.prevalence=0, # formulation
regularization=c(ifelse(n.latent > 0, 6, 0.5), 1), regularization.type="hybrid", # regularization
penalty=4, theta.threshold=0.5, latent.lambda=1, fit.all.with.latents=TRUE,
popsize.sel=2, n.cores=parallel::detectCores(), parallel=FALSE,
true.model=NULL, do.selection=TRUE, do.plots=TRUE, fast=FALSE,
refit.selected=TRUE) {
if(!(regularization.type %in% c("ridge", "lasso", "hybrid")))
stop("Regularization type must be one of: ridge, lasso, hybrid")
# TODO adjust regularization defaults, which apparently depend of the # of samples
attr(regularization, "type") <- regularization.type
options <- eicm.options(mask=list(sp=mask.sp))
if(theta.threshold < 0) theta.threshold <- 0
if(n.latent > 0) {
message("Fit latents, no interactions")
time0 <- system.time(latents.only <- fit.latents(env=env, occurrences=occurrences,
regularization=c(latent.lambda, 0), regularization.type="ridge",
nlat=n.latent, fast=FALSE, n.cores=ifelse(parallel, n.cores, 1)))
if(scale.latents) {
# standardize latents and counter-standardize their coefficients
f <- apply(latents.only$model$samples, 2, stats::sd)
latents.only$data$env[, names(f)] <- sweep(latents.only$data$env[, names(f), drop=FALSE], 2, f, "/")
latents.only$model$env[, names(f)] <- sweep(latents.only$model$env[, names(f), drop=FALSE], 2, f, "*")
latents.only$model$samples <- sweep(latents.only$model$samples, 2, f, "/")
}
if(rotate.latents) {
# rotate latent variable values
pc <- stats::prcomp(latents.only$model$samples)
# now rotate their responses with the same matrix
latents.only$model$env[, colnames(latents.only$model$samples)] <-
latents.only$model$env[, colnames(latents.only$model$samples)] %*% pc$rotation
# and re-copy the latent values to the env table
latents.only$data$env[, colnames(latents.only$model$samples)] <- latents.only$model$samples
}
if(!is.null(true.model) && do.plots) {
grDevices::dev.new(width=12, height=4)
coefficientComparisonPlot(latents.only$model, true.model, nenv.to.plot=n.latent, nlatent.to.plot=n.latent,
plot.intercept=TRUE, plot.interactions=FALSE)
}
# By the way, remember that we discard the estimated responses to latent variables.
# This first step is just to estimate the latent variable values at each site, everything
# else is discarded.
}
# fit the full network, so that weak interactions may be discarded
# TODO: if theta.threshold==0, we don't need to fit all interactions
if(n.latent > 0) {
model <- latents.only
model$model$samples <- NULL
if(fit.all.with.latents) {
# fix latent values
message("Fit the model with all interactions, with latent values fixed")
model$model$sp <- matrix(stats::runif(ncol(model$model$sp) * ncol(model$model$sp), -0.5, 0.5), ncol=ncol(model$model$sp))
# here we use model$data$env, which has the estimated latent values appended to it
time1 <- system.time(fitted.model <- eicm.fit(
model$data$occurrences, env=model$data$env, traits=traits, intercept=FALSE,
n.latent=0, initial.values=model$model,
options=options, forbidden=forbidden, allowed=allowed, exclude.prevalence=exclude.prevalence,
fast=TRUE, n.cores=ifelse(parallel, n.cores, 1),
optim.method="L-BFGS-B",
regularization=regularization, regularization.type=regularization.type))
} else {
message("Fit the model with all interactions, with no latents")
# remove estimated latent coefficients from environmental matrix
model$model$env <- model$model$env[, -((ncol(model$model$env) - n.latent + 1):ncol(model$model$env)), drop=FALSE]
model$model$sp <- matrix(stats::runif(ncol(model$model$sp) * ncol(model$model$sp), -0.5, 0.5), ncol=ncol(model$model$sp))
# here we use the original "env"
time1 <- system.time(fitted.model <- eicm.fit(
occurrences, env=env, traits=traits, intercept=TRUE,
n.latent=0, initial.values=model$model,
options=options, forbidden=forbidden, allowed=allowed, exclude.prevalence=exclude.prevalence,
fast=TRUE, n.cores=ifelse(parallel, n.cores, 1),
optim.method="L-BFGS-B",
regularization=regularization, regularization.type=regularization.type))
}
if(!is.null(true.model) && nrow(fitted.model$model$env) != nrow(true.model$model$env)) warning("Some species are missing from the estimated model")
} else { # No latents
message("Fit the model with all interactions, with no latents")
# fit the model with everything
time1 <- system.time(fitted.model <- eicm.fit(occurrences, env=env, traits=traits, intercept=TRUE
, n.latent=0,
options=options, forbidden=forbidden, allowed=allowed, exclude.prevalence=exclude.prevalence,
fast=TRUE, n.cores=ifelse(parallel, n.cores, 1),
optim.method="L-BFGS-B",
regularization=regularization, regularization.type=regularization.type))
latents.only <- fitted.model
latents.only$model$sp[, ] <- 0
}
if(!do.selection) {
# don't select interactions, just return full network estimates
out <- list(true.model=true.model, latents.only=latents.only, fitted.model=fitted.model)
if(exists("time0")) attr(out, "time.fit.latents") <- time0
if(exists("time1")) attr(out, "time.fit.all") <- time1
attr(out, "regularization") <- regularization
class(out) <- "eicm.list"
return(out)
}
if(!is.null(true.model)) {
if(do.plots) {
monitor.function <- function(model, plot.interactions) {
return(
coefficientComparisonPlot(model, true.model, nenv.to.plot=ncol(fitted.model$model$env) - 1,
nlatent.to.plot=0, plot.intercept=TRUE,
excluded.interactions=abs(fitted.model$model$sp) < theta.threshold,
plot.interactions=plot.interactions)
)
}
selection.monitor <- function(object, bestmodel, worstmodel) {
stats <- monitor.function(bestmodel, plot.interactions=TRUE)
if(!is.null(object)) {
fitness.stats <- stats::quantile(stats::na.exclude(object@fitness), probs=c(0, 0.5, 1))
message(sprintf("\rIt %d (%.0fs, ciT %d+%d/%d) | Fit %.1f %.1f %.1f | %d TP %d FN %d FP",
object@iter, object@time.took, object@cached, object@informed, object@cache.size,
fitness.stats[3], fitness.stats[2], fitness.stats[1],
stats["correct"], stats["missed"], stats["spurious"]))
utils::flush.console()
}
}
} else {
selection.monitor <- function(object, bestmodel, worstmodel) {
stats <- coefficientComparisonPlot(bestmodel, true.model, nenv.to.plot=ncol(fitted.model$model$env) - 1,
nlatent.to.plot=0, plot.intercept=TRUE,
excluded.interactions=abs(fitted.model$model$sp) < theta.threshold,
plot.interactions=TRUE, noplot=TRUE)
if(!is.null(object)) {
fitness.stats <- stats::quantile(stats::na.exclude(object@fitness), probs=c(0, 0.5, 1))
message(sprintf("\rIt %d (%.0fs, ciT %d+%d/%d) | Fit %.1f %.1f %.1f | %d TP %d FN %d FP",
object@iter, object@time.took, object@cached, object@informed, object@cache.size,
fitness.stats[3], fitness.stats[2], fitness.stats[1],
stats["correct"], stats["missed"], stats["spurious"]))
utils::flush.console()
}
}
}
if(do.plots && !is.null(latents.only)) {
grDevices::dev.new(width=12, height=4)
monitor.function(fitted.model$model, plot.interactions=TRUE)
graphics::abline(h=c(-theta.threshold, theta.threshold), lty=2)
}
message(sprintf("Discarded %d true interactions (out of %d) from the start."
, sum(true.model$model$sp != 0 & (abs(fitted.model$model$sp) <= theta.threshold | !fitted.model$model$options$mask$sp))
, sum(true.model$model$sp != 0)
))
} else {
if(do.plots) {
selection.monitor <- function(object, bestmodel, worstmodel) {
if(is.null(bestmodel)) return(invisible(NULL))
plot.eicm.matrix(bestmodel, type="network")
if(!is.null(object)) {
gaMonitor.eicm(object, bestmodel, worstmodel)
}
}
} else selection.monitor <- NULL
}
# now hard threshold over interactions
# NOTE this mask already includes user-specified exclusions (options, prevalence and forbidden)
# because they were not estimated in the fitted.model
masksp <- abs(fitted.model$model$sp) > theta.threshold
tmp <- !masksp & fitted.model$model$options$mask$sp
diag(tmp) <- FALSE
message(sprintf("Discarded %d parameters (out of %d) by hard thresholding with |theta| < %.2f.",
sum(tmp), sum(fitted.model$model$sp != 0), theta.threshold))
# sum(tmp), sum(fitted.model$model$options$mask$sp), theta.threshold))
# if(do.plots)
# grDevices::dev.new(width=12, height=4)
latents.only.copy <- latents.only
latents.only.copy$model$samples <- NULL
latents.only.copy$model$sp <- fitted.model$model$sp
latents.only.copy$model$options$mask$sp <- fitted.model$model$options$mask$sp
time2 <- system.time(var.selection <- model.selection.network(latents.only.copy, regularization=regularization
, regularization.type=regularization.type
, penalty=penalty, masksp=masksp, select.direction=TRUE, exclude.prevalence=exclude.prevalence
, fast=TRUE, optim.method="ucminf", optim.control=list(trace=0, maxeval=10000, gradstep=c(0.001, 0.001), grtol=0.1)
#, fast=FALSE, optim.method = "L-BFGS-B", optim.control = list(trace=0, maxit=10000, ndeps=0.001)
, parallel=n.cores
, maxit.stagnated=50, pmutation = 0.01, popsize.factor=popsize.sel
, monitor=selection.monitor
))
bits <- var.selection@solution[1, ]
selected.model <- var.selection@allEvaluations[[paste(bits, collapse="")]]
selected.model <- list(
data=eicm.data(occurrences=occurrences, env=latents.only.copy$data$env)
, model=eicm.matrix(selected.model$env, sp.coefs=selected.model$sp, latent=selected.model$samples, options=selected.model$options)
)
attr(selected.model, "regularization") <- regularization
class(selected.model) <- "eicm"
if(refit.selected) {
selected.model <- eicm.fit(occurrences=selected.model$data$occurrences, env=selected.model$data$env,
traits=selected.model$data$traits, options=selected.model$model$options, n.latent=0,
regularization=attr(selected.model, "regularization"),
regularization.type=attr(attr(selected.model, "regularization"), "type"),
fast=FALSE, n.cores=n.cores)
}
if(is.function(selection.monitor))
selection.monitor(NULL, selected.model$model, NULL)
out <- list(true.model=true.model, fitted.model=fitted.model, latents.only=latents.only, selected.model=selected.model)
if(exists("time0")) attr(out, "time.fit.latents") <- time0
if(exists("time1")) attr(out, "time.fit.all") <- time1
if(exists("time2")) attr(out, "time.selection") <- time2
attr(out, "regularization") <- regularization
attr(out, "penalty") <- penalty
attr(out, "theta.threshold") <- theta.threshold
attr(out, "latent.lambda") <- latent.lambda
class(out) <- "eicm.list"
return(out)
}
# Utility function to estimate latents possibly with given species coefficients
fit.latents <- function(env, occurrences, regularization, regularization.type, nlat,
envcoefs=NULL, spcoefs=NULL, fast=FALSE, n.cores) {
if(is.null(env))
env <- matrix(0, nrow=nrow(occurrences), ncol=0)
# no species interactions
options <- eicm.options(mask=list(env=1L, sp=0L))
# This is to estimate latents in new data, from sparse data! We here fix the species coefficients.
if(!is.null(spcoefs) || !is.null(envcoefs)) {
if(is.null(spcoefs))
spcoefs <- matrix(0, ncol=ncol(occurrences), nrow=ncol(occurrences))
if(is.null(envcoefs))
envcoefs <- matrix(0, ncol=ncol(env) + nlat, nrow=ncol(occurrences))
options <- options + list(offset=list(env=envcoefs, sp=spcoefs))
}
model <- eicm.fit(occurrences, env=env, intercept=TRUE
, options=options
, fast=fast, n.cores=n.cores
, n.latent=nlat
, regularization=regularization, regularization.type=regularization.type)
return(model)
}
gaMonitor.eicm <- function(object, bestmodel, worstmodel) {
fitness.stats <- stats::quantile(stats::na.exclude(object@fitness), probs=c(0, 0.5, 1))
nterms.stats <- stats::quantile(apply(object@population, 1, sum), probs=c(0, 0.5, 1))
message(sprintf("\rIt %d (%.0fs, ciT %d+%d/%d) | Fit %.1f %.1f %.1f | #term %d %d %d",
object@iter, object@time.took, object@cached, object@informed, object@cache.size
, fitness.stats[3], fitness.stats[2], fitness.stats[1], nterms.stats[1], as.integer(nterms.stats[2]), nterms.stats[3]))
utils::flush.console()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.