knitr::opts_chunk$set( fig.align = "center", fig.retina=1, fig.width = 6, fig.height = 6, cache = FALSE, collapse = TRUE, comment = "#>", highlight = TRUE )
We define the following model :
$$ \theta_{ij} =\alpha_i + \beta_{0j}+X_i.\beta_j+ W_i.\lambda_j $$
It can be easily shown that: $y_{ij} = \alpha_i + \beta_{0j} + X_i.\beta_j + W_i.\lambda_j + \epsilon_{i,j}$, with $\forall (i,j) \ \epsilon_{ij} \sim \mathcal{N}(0,V)$ where:
Latent variables: $W_i=(W_i^1,\ldots,W_i^q)$ where $q$ is the number of latent variables considered, which has to be fixed by the user (by default $q=2$). We assume that $W_i \sim \mathcal{N}(0,I_q)$ and we define the associated coefficients: $\lambda_j=(\lambda_j^1,\ldots, \lambda_j^q)'$. We use a prior distribution $\mathcal{N}(0,1)$ for all lambdas not concerned by constraints to $0$ on upper diagonal and to strictly positive values on diagonal.
Explanatory variables: bioclimatic data about each site. $X=(X_i)_{i=1,\ldots,nsite}$ with $X_i=(x_i^1,\ldots,x_i^p)\in \mathbb{R}^p$ where $p$ is the number of bioclimatic variables considered. The corresponding regression coefficients for each species $j$ are noted : $\beta_j=(\beta_j^1,\ldots,\beta_j^p)'$.
$\beta_{0j}$ correspond to the intercept for species $j$ which is assumed to be a fixed effect. We use a prior distribution $\mathcal{N}(0,1)$ for all betas.
$\alpha_i$ represents the random effect of site $i$ such as $\alpha_i \sim \mathcal{N}(0,V_{\alpha})$ and we assume that $V_{\alpha} \sim \mathcal {IG}(\text{shape}=0.5, \text{rate}=0.005)$ as prior distribution by default.
$V$ represents the variance of residuals or overdispersion term and we assume that $V \sim \mathcal {IG}(\text{shape}=0.5, \text{rate}=0.005)$ as prior distribution by default.
We simulate two explanatory variables considering 150 inventory sites.
#== Data simulation #= Number of inventory sites nsite <- 150 #= Set seed for repeatability seed <- 1234 set.seed(seed) #= Ecological process (suitability) x1 <- rnorm(nsite, 0, 1) x2 <- rnorm(nsite, 0, 1) X <- cbind(rep(1, nsite), x1, x2) colnames(X) <- c("Intercept", "Covariate_1", "Covariate_2") np <- ncol(X) head(X)
We simulate the response variable for 20 species on the 150 inventory sites by considering the explanatory variables simulated above and following the model defined previously with two latent axes.
#= Number of species nsp <- 20 #= Number of latent variables n_latent <- 2 #= Latent variables W W <- matrix(rnorm(nsite*n_latent,0,1), nsite, n_latent) #= Fixed species effect beta beta.target <- t(matrix(runif(nsp*np,-1,1), byrow=TRUE, nrow=nsp)) #= Factor loading lambda lambda.target <- matrix(0, n_latent, nsp) mat <- t(matrix(runif(nsp*n_latent, -1, 1), byrow=TRUE, nrow=nsp)) lambda.target[upper.tri(mat, diag=TRUE)] <- mat[upper.tri(mat, diag=TRUE)] diag(lambda.target) <- runif(n_latent, 0, 2) #= Variance of random site effect V_alpha.target <- 0.2 #= Random site effect alpha alpha.target <- rnorm(nsite,0 , sqrt(V_alpha.target)) # Simulation of response data with probit link theta.target <- X%*%beta.target + W%*%lambda.target + alpha.target V.target <- 0.2 Y <- matrix(rnorm(nsite*nsp, theta.target, sqrt(V.target)), nrow=nsite) colnames(Y) <- paste0("sp_", 1:nsp) rownames(Y) <- paste0("site_", 1:nsite) head(Y)
This data-set is available in the jSDM-package
R package. It can be loaded with the data()
command. The frogs
dataset is in "wide" format: each line is a site and the occurrence data (from Species_1 to Species_9) are in columns. A site is characterized by its x-y geographical coordinates, one discrete covariate and two other continuous covariates.
We rearrange the data in two data-sets: a first one for the presence-absence observations for each species (columns) at each site (rows), and a second one for the site characteristics.
We also normalize the continuous explanatory variables to facilitate MCMC convergence.
We use the jSDM_gaussian
function to fit the JSDM (increase the number of iterations to achieve convergence).
library(jSDM) # Increase number of iterations (burnin and mcmc) to get convergence mod <- jSDM_gaussian(# Iteration burnin=1000, mcmc=1000, thin=1, # Response variable response_data=Y, # Explanatory variables site_formula =~ Covariate_1 + Covariate_2, site_data=X, n_latent=2, site_effect="random", # Starting values alpha_start=0, beta_start=0, lambda_start=0, W_start=0, V_alpha=1, V_start=1 , # Priors shape_Valpha=0.5, rate_Valpha=0.0005, shape_V=0.5, rate_V=0.0005, mu_beta=0, V_beta=1, mu_lambda=0, V_lambda=1, seed=1234, verbose=1)
np <- nrow(mod$model_spec$beta_start) # =================================================== # Result analysis # =================================================== #========== #== Outputs #= Parameter estimates # Species effects beta and factor loadings lambda ## Trace and density of beta_j for the first two species mean_beta <- matrix(0,nsp,ncol(X)) par(mfrow=c(2,2)) for (j in 1:nsp){ mean_beta[j,] <- apply(mod$mcmc.sp[[j]] [,1:ncol(X)], 2, mean) if(j<=2){ for (p in 1:ncol(X)){ coda::traceplot(mod$mcmc.sp[[j]][,p]) coda::densplot(mod$mcmc.sp[[j]][,p], main = paste(colnames(mod$mcmc.sp[[j]])[p],", species : ",j)) abline(v=beta.target[p,j],col='red') } } } ## Trace and density of lambda_j for the first two species mean_lambda <- matrix(0,nsp,n_latent) par(mfrow=c(2,2)) for (j in 1:nsp){ mean_lambda[j,] <- apply(mod$mcmc.sp[[j]] [,(ncol(X)+1):(ncol(X)+n_latent)], 2, mean) if(j<=2){ for (l in 1:n_latent) { coda::traceplot(mod$mcmc.sp[[j]][,ncol(X)+l]) coda::densplot(mod$mcmc.sp[[j]][,ncol(X)+l], main=paste(colnames(mod$mcmc.sp[[j]]) [ncol(X)+l],", species : ",j)) abline(v=lambda.target[l,j],col='red') } } } # Representation of fitted values according to expected ones par(mfrow=c(1,2)) plot(t(beta.target), mean_beta, main="species effect beta", xlab ="obs", ylab ="fitted") abline(a=0,b=1,col='red') plot(t(lambda.target), mean_lambda, main="factor loadings lambda", xlab ="obs", ylab ="fitted") abline(a=0,b=1,col='red') ## W latent variables ## Trace and density of latent variables W_i for the first two sites par(mfrow=c(2,2)) for (l in 1:n_latent) { for (i in 1:2) { coda::traceplot(mod$mcmc.latent[[paste0("lv_",l)]][,i], main = paste0("Latent variable W_", l, ", site ", i)) coda::densplot(mod$mcmc.latent[[paste0("lv_",l)]][,i], main = paste0("Latent variable W_", l, ", site ", i)) abline(v=W[i,l],col='red') } } # Representation of fitted values according to expected ones par(mfrow=c(1,2)) for (l in 1:n_latent) { plot(W[,l], summary(mod$mcmc.latent[[paste0("lv_",l)]])[[1]][,"Mean"], main = paste0("Latent variable W_", l), xlab ="obs", ylab ="fitted") abline(a=0,b=1,col='red') } ## alpha ## Trace and density of alpha_i for the first two sites for (i in 1:2){ coda::traceplot(mod$mcmc.alpha[,i], main = paste0("Site effect alpha_", i)) coda::densplot(mod$mcmc.alpha[,i], main = paste0("Site effect alpha_", i)) abline(v=alpha.target[i],col='red') } # Representation of fitted values according to expected ones par(mfrow=c(1,3)) plot(alpha.target, summary(mod$mcmc.alpha)[[1]][,"Mean"], xlab ="obs", ylab ="fitted", main="site effect alpha") abline(a=0,b=1,col='red') ## Valpha coda::traceplot(mod$mcmc.V_alpha) coda::densplot(mod$mcmc.V_alpha) abline(v=V_alpha.target,col='red') ## Variance of residuals par(mfrow=c(1,2)) coda::traceplot(mod$mcmc.V) coda::densplot(mod$mcmc.V, main="Variance of residuals") abline(v=V.target, col='red') ## Deviance summary(mod$mcmc.Deviance) plot(mod$mcmc.Deviance) #= Predictions par(mfrow=c(1,1)) plot(Y, mod$Y_pred, main="Response variable", xlab="obs", ylab="fitted") abline(a=0,b=1,col='red')
After fitting the jSDM with latent variables, the full species residual correlation matrix $R=(R_{ij})^{i=1,\ldots, nspecies}{j=1,\ldots, nspecies}$ can be derived from the covariance in the latent variables such as : $$\Sigma{ij} = \begin{cases} \lambda_i^T .\lambda_j + V, & \text{if } i=j \ \lambda_i^T .\lambda_j, & \text{else.} \end{cases}$$, then we compute correlations from covariances : $$R_{i,j} = \frac{\Sigma_{ij}}{\sqrt{\Sigma {ii}\Sigma {jj}}}$$.
We use the plot_residual_cor()
function to compute and display the residual correlation matrix :
plot_residual_cor(mod)
We use the predict.jSDM()
S3 method on the mod
object of class jSDM
to compute the mean (or expectation) of the posterior distributions obtained and get the expected values of response variable.
# Sites and species concerned by predictions : ## 50 sites among the 150 Id_sites <- sample.int(150, 50) ## All species Id_species <- colnames(Y) # Simulate new observations of covariates on those sites simdata <- matrix(nrow=50, ncol = ncol(mod$model_spec$site_data)) colnames(simdata) <- colnames(mod$model_spec$site_data) rownames(simdata) <- Id_sites simdata <- as.data.frame(simdata) simdata$Covariate_1 <- rnorm(50) simdata$Covariate_2 <- rnorm(50) # Predictions Y_pred <- predict(mod, newdata=simdata, Id_species=Id_species, Id_sites=Id_sites, type="mean") hist(Y_pred, main="Predicted response with simulated data", xlab="Y_pred")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.