This document describes how the different models were fit to the simulated data from Appendix 2 and how each model's performance was evaluated.
Note that the pairs
program was run separately (outside of R) with the following options:
Initialization:
mc.cores = 8 library(dplyr) # For manipulating data structures library(corpcor) # For regularized partial covariances library(rosalia) # For Markov networks library(arm) # For regularized logistic regression library(BayesComm) # For joint species distribution modeling library(parallel) # for mclapply set.seed(1)
A function to import the data file and run each method on it:
fit_all = function(identifier){ ######## Import ######## data_filename = paste0("fakedata/matrices/", identifier, ".csv") truth_filename = paste0("fakedata/truths/", identifier, ".txt") # first column is row numbers; drop it raw_obs = as.matrix(read.csv(data_filename)[ , -1]) # Identify species that are never present (or never absent) so they # can be dropped species_is_variable = diag(var(raw_obs)) > 0 pair_is_variable = tcrossprod(species_is_variable) > 0 x = raw_obs[ , species_is_variable] truth = unlist(read.table(truth_filename))[pair_is_variable[upper.tri(pair_is_variable)]] splitname = strsplit(identifier, "/|-|\\.")[[1]] n_sites = as.integer(splitname[[1]]) rep_name = splitname[[2]] # Species IDs sp1 = combn(colnames(x), 2)[1, ] sp2 = combn(colnames(x), 2)[2, ] ######## Partial correlations ######## p_corr = pcor.shrink(x) ######## Correlations ######## corr = cor(x) ######## GLM ######## coef_matrix = matrix(0, ncol(x), ncol(x)) for(i in 1:ncol(x)){ if(var(x[,i]) > 0){ coefs = coef(bayesglm(x[,i] ~ x[ , -i], family = binomial))[-1] coef_matrix[i, -i] = coefs } } coef_matrix = (coef_matrix + t(coef_matrix)) / 2 ######## Markov network ######## rosie = rosalia(x, maxit = 200, trace = 0, prior = make_logistic_prior(scale = 2), hessian = TRUE) rosie_point = rosie$beta[upper.tri(rosie$beta)] rosie_se = sqrt(diag(solve(rosie$opt$hessian)))[-(1:sum(species_is_variable))] ######## BayesComm and partial BayesComm ######## bc = BC(Y = x, model = "community", its = 1000) bc_pcors = sapply( 1:nrow(bc$trace$R), function(i){ Sigma = matrix(0, nrow = ncol(x), ncol = ncol(x)) Sigma[upper.tri(Sigma)] <- bc$trace$R[i, ] # Fill in upper triangle Sigma <- Sigma + t(Sigma) # Fill in lower triangle diag(Sigma) <- 1 # Diagonal equals 1 in multivariate probit model pcor = cor2pcor(Sigma) pcor[upper.tri(pcor)] } ) bind_rows( data_frame( rep_name = rep_name, n_sites = n_sites, sp1 = sp1, sp2 = sp2, method = "Markov network", truth = truth, estimate = rosie_point, lower = qnorm(.025, rosie_point, rosie_se), upper = qnorm(.975, rosie_point, rosie_se) ), data_frame( rep_name = rep_name, n_sites = n_sites, sp1 = sp1, sp2 = sp2, method = "GLM", truth = truth, estimate = coef_matrix[upper.tri(coef_matrix)], lower = NA, upper = NA ), data_frame( rep_name = rep_name, n_sites = n_sites, sp1 = sp1, sp2 = sp2, method = "correlation", truth = truth, estimate = corr[upper.tri(corr)], lower = NA, upper = NA ), data_frame( rep_name = rep_name, n_sites = n_sites, sp1 = sp1, sp2 = sp2, method = "partial correlation", truth = truth, estimate = p_corr[upper.tri(p_corr)], lower = NA, upper = NA ), data_frame( rep_name = rep_name, n_sites = n_sites, sp1 = sp1, sp2 = sp2, method = "partial BayesComm", truth = truth, estimate = rowMeans(bc_pcors), lower = apply(bc_pcors, 1, quantile, .025), upper = apply(bc_pcors, 1, quantile, .975) ), data_frame( rep_name = rep_name, n_sites = n_sites, sp1 = sp1, sp2 = sp2, method = "BayesComm", truth = truth, estimate = colMeans(bc$trace$R), lower = apply(bc$trace$R, 2, quantile, .025), upper = apply(bc$trace$R, 2, quantile, .975) ) ) }
Run the above function on all the files:
# Find all the csv files in the fakedata/matrices folder, # then drop .csv identifiers = dir("fakedata/matrices", pattern = "\\.csv$") %>% gsub("\\.csv$", "", .)
mclapply(identifiers, fit_all, mc.cores = mc.cores, mc.preschedule = FALSE, mc.silent = TRUE) %>% bind_rows() %>% write.csv(file = "estimates.csv")
From the R help file for the logistic distribution:
The Logistic distribution with location = m and scale = s has... density
f(x) = 1/s exp((x-m)/s) (1 + exp((x-m)/s))^-2.
It is a long-tailed distribution with mean m and variance $\pi$^2 /3 s^2.
As noted in the main text, I used location zero and scale 2 (plotted in
red below). The "true" parameter distribution for $\beta$ is drawn in black,
for reference. These distributions have different means and the prior is
substantially wider than the distribution of true parameters (mean absolute
deviation from zero about r round(mean(abs(rlogis(1E7, scale = 2))), 1)
times larger).
curve(ifelse(x < 0, .75 * dexp(abs(x)), .25 * dexp(x)), from = -15, to = 15, n = 1000, ylab = "Density") curve(dlogis(x, location = 0, scale = 2), add = TRUE, col = "red")
The log of this density was added to the log-likelihood to calculate an
un-normalized log-posterior, which is optimized by the optim
function in the
stats
package.
This prior distribution is "weakly informative" in the sense of Gelman et al. (2008. Annals of Applied Statistics, "A weakly informative default prior distribution for logistic and other regression models"), as shown below: the logistic regularizer pulls the estimated values toward zero less strongly than Gelman et al.'s weakly-informative Cauchy prior does.
x = as.matrix( read.csv("fakedata/matrices/25-no_env1.csv")[,-1] ) # Don't estimate parameters for species that were never observed x = x[ , colMeans(x) != 0] # Estimate parameters using a weakly informative logistic prior logistic_fit = rosalia(x, prior = make_logistic_prior(scale = 2), trace = FALSE) # Estimate parameters using Gelman et al.'s weakly informative Cauchy prior cauchy_fit = rosalia(x, prior = make_cauchy_prior(scale = 2.5), trace = FALSE) plot(logistic_fit$beta, cauchy_fit$beta) abline(0,1, h = 0, v = 0, col = "darkgray")
The GLM method produces two estimates for each species pair. These estimates tend to be very similar, so most reasonable methods for generating a consensus estimate should produce very similar results.
get_glm_cor = function(identifier){ data_filename = paste0("fakedata/matrices/", identifier, ".csv") # first column is row numbers; drop it raw_obs = as.matrix(read.csv(data_filename)[ , -1]) # Identify species that are never present (or never absent) so they # can be dropped species_is_variable = diag(var(raw_obs)) > 0 pair_is_variable = tcrossprod(species_is_variable) > 0 x = raw_obs[ , species_is_variable] coef_matrix = matrix(0, ncol(x), ncol(x)) # Fill in the coefficient matrix, column-by-column for(i in 1:ncol(x)){ if(var(x[,i]) > 0){ coefs = coef(bayesglm(x[,i] ~ x[ , -i], family = binomial))[-1] coef_matrix[i, -i] = coefs } } # Compare the estimates from the upper triangle with the estimates from # the estimates in the lower triangle (which becomes the upper triangle after # transposing) cor(coef_matrix[upper.tri(coef_matrix)], t(coef_matrix)[upper.tri(coef_matrix)]) } glm_correlations = unlist(mclapply(identifiers, get_glm_cor)) mean(glm_correlations) hist(glm_correlations)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.