In this tutorial we describe the steps for obtaining the results of the epidemiology application of Section 4 of the paper, in order to make them fully reproducible.
All the analyses are performed with a MacBook Air (OS X Sierra, version 10.12.6), using a R version 3.4.1. Notice that matrix decompositions involved in this code might differ across operating systems.
The code described below requires the installation of the LSBP
R package, available in this repository. See the README for instructions on the installation.
As a preliminary step, we load on a clean the environment all the required libraries.
rm(list=ls()) # Clean the current session
library(LSBP) # Load the LSBP package
library(ggplot2) # Graphical library
library(coda) # For MCMC analysis
library(splines) # For computing the natural B-splines basis
load("dde.RData") # Load the dataset in memory
The dde
dataset can be downloaded from this respository. It contains a data.frame
having two columns:
DDE
: the level of the Dichlorodiphenyldichloroethylene.GAD
: the gestational age at delivery, in days.The dataset comprises a total of 2312
observations. The DDE
is clearly related to the gestational age at delivery, as suggested by the scatterplot shown below; the smooth line is a loess estimate.
ggplot(data=dde, aes(x=DDE,y=GAD)) + geom_point(alpha=.5, cex=.5) + geom_smooth( method="loess", span = 1, col=1) + xlab("DDE (mg/L)") + ylab("Gestational age at delivery") + theme_bw()
To fit the LSBP model we first define some fixed quantities, like the number MCMC replications, the burn-in period and the number of mixture components H
.
n <- nrow(dde) # Number of observations
p <- 2 # Row and colums of the design for the kernel
p_splines <- 5 # Number of splines components
R <- 30000 # Number of replications
burn_in <- 5000 # Burn-in period
H <- 5 # Number of mixture components
Using the function prior_LSBP
we can specify the prior distribution, in the correct format. Also, notice that both GAD
and DDE
have been normalized.
For the mixing components, we use a natural cubic splines basis with $4$ equally spaced inner knots, obtained through the ns
command of the splines
package. As a result, for each mixture component we have $5$ parameters that have to be estimated.
prior <- prior_LSBP(p,p,
b_mixing = rep(0,p_splines+1), B_mixing=diag(1,p_splines+1),
b_kernel = c(0,0), B_kernel=diag(1,p),
a_tau = .1, b_tau= .1)
# Creation of the normalized dataset
dde_scaled <- data.frame(scale(dde))
Basis <- ns(dde_scaled$DDE,p_splines)
dde_scaled <- data.frame(dde_scaled, BS=Basis)
# Formula for the model
model_formula <- Formula::as.Formula(GAD ~ DDE | BS.1 + BS.2 + BS.3 + BS.4 + BS.5)
We first run the Gibbs sampling using the command LSBP_Gibbs
of the LSBP
package.
set.seed(10) # The seed is setted so that the Gibbs sampler is reproducible.
fit_Gibbs <- LSBP_Gibbs(model_formula, data=dde_scaled, H=H, prior=prior,
control=control_Gibbs(R=R,burn_in=burn_in), verbose=FALSE)
To alleviate the issue of local maxima, we run the ECM algorithm 10
times through the command LSBP_ECM
of the LSBP
package, and we select the model that reached the highest value of the log-posterior distribution.
logposterior <- rep(0,10)
for(i in 1:10){
set.seed(i) # Every time we run the algorithm, we set a seed varying with i
fit_ECM <- LSBP_ECM(model_formula, data=dde_scaled, H=H, prior=prior,
control=control_ECM(method_init = "random"), verbose=FALSE)
logposterior[i] <- fit_ECM$logposterior
}
# Then, we run the algorithm again selecting the seed having the maximum log-posterior
set.seed(which.max(logposterior))
fit_ECM <- LSBP_ECM(model_formula, data=dde_scaled, H=H, prior=prior,
control=control_ECM(method_init = "random"), verbose=FALSE)
As for the ECM algorithm, also the variational Bayes approach suffers the issue of local maxima. Therefore, we run the algorithm 10
times, selecting the one having the highest lower bound.
# VB algorithm
lower_bound <- rep(0,10)
for(i in 1:10){
set.seed(i)
fit_VB <- LSBP_VB(model_formula, data=dde_scaled, H=H, prior=prior,
control_VB(tol=1e-2,method_init="random"),verbose=FALSE)
lower_bound[i] <- fit_VB$lowerbound
}
set.seed(which.max(lower_bound))
fit_VB <- LSBP_VB(model_formula, data=dde_scaled, H=H, prior=prior,
control_VB(tol=1e-2,method_init="random"),verbose=FALSE)
We first create some auxiliary quantities that will be useful during this
# Points for which we will evaluate
DDE.points <- (round(quantile(dde$DDE,c(0.1,0.6,0.9,0.99)),2) - mean(dde$DDE))/sd(dde$DDE)
# And the correspondings design matrices
X1 <- cbind(1,DDE.points) # Design matrix for the kernel
X2 <- cbind(1,ns(DDE.points, # Design matrix for the stick-breaking weights
knots=attr(Basis,"knots"),
Boundary.knots=attr(Basis,"Boundary.knots")))
# Sequence for AGE and DDE
sequenceGAD <- seq(from=min(dde_scaled$GAD),to=max(dde_scaled$GAD),length=100)
sequenceDDE <- seq(from=min(dde_scaled$DDE),to=max(dde_scaled$DDE),length=100)
# Create a new dataset containing the values to be predicted
newdata <- data.frame(GAD=0, DDE=sequenceDDE,
BS= ns(sequenceDDE,knots=attr(Basis,"knots"),Boundary.knots=attr(Basis,"Boundary.knots")))
In the following chunks it is reported the code necessary for obtaining the conditional density for the three algorithms, using the function LSBP_density
of the LSBP
package, which evaluates the conditional density of a logit stick-breaking model.
For the Gibbs sampler, the conditional density is evaluated for different GAD
values, for each MCMC replication.
# Posterior density - Gibbs sampling
pred_Gibbs <- array(0,c(R,length(sequenceGAD),4))
for(r in 1:R){ # Cycle over the iterations of the MCMC chain
for(i in 1:100){ # Cycle over the GAD grid
pred_Gibbs[r,i,] <- c(LSBP_density(sequenceGAD[i],X1,X2,
fit_Gibbs$param$beta_mixing[r,,],
fit_Gibbs$param$beta_kernel[r,,],
fit_Gibbs$param$tau[r,]))/sd(dde$GAD)
}
}
# Computing posterior means and posterior quantiles
estimate_Gibbs <- apply(pred_Gibbs,c(2,3),mean)
lower_Gibbs <- apply(pred_Gibbs,c(2,3),function(x) quantile(x,0.025))
upper_Gibbs <- apply(pred_Gibbs,c(2,3),function(x) quantile(x,0.975))
Similarly, for the ECM algorithm we plug-in the MAP estimate into the conditional density
# Posterior density estimate for the ECM model
estimate_ECM <- matrix(0,length(sequenceGAD),4)
for(i in 1:100){ # Cycle over the GAD grid
estimate_ECM[i,] <- c(LSBP_density(sequenceGAD[i],X1,X2,
fit_ECM$param$beta_mixing,
fit_ECM$param$beta_kernel,
fit_ECM$param$tau))/sd(dde$GAD)
}
Finally, we compute the posterior conditional density also for the VB approximation. We need first to sample values from the variational approximation: we will then compute the conditional density at each sampled value, thus obtaining a sample from the conditional density.
set.seed(123)
# Posterior density estimate for the VB model
fit_VB$beta_mixing_sim <- array(0, c(R, H - 1, p_splines+1))
fit_VB$beta_kernel_sim <- array(0, c(R, H, p))
fit_VB$tau_sim <- matrix(0, R, H)
# Generating values from the VB approximation
for (h in 1:H) {
if (h < H) {
eig <- eigen(fit_VB$param$Sigma_mixing[h, , ], symmetric = TRUE)
A1 <- t(eig$vectors) * sqrt(eig$values)
fit_VB$beta_mixing_sim[, h, ] <- t(fit_VB$param$mu_mixing[h, ] + t(matrix(rnorm(R * (p_splines+1)), R, p_splines+1) %*% A1))
}
eig <- eigen(fit_VB$param$Sigma_kernel[h, , ], symmetric = TRUE)
A1 <- t(eig$vectors) * sqrt(eig$values)
fit_VB$beta_kernel_sim[, h, ] <- t(fit_VB$param$mu_kernel[h, ] + t(matrix(rnorm(R * p), R, p) %*% A1))
fit_VB$tau_sim[, h] <- rgamma(R, fit_VB$param$a_tilde[h], fit_VB$param$b_tilde[h])
}
# Posterior density - VB
pred_VB <- array(0,c(R,length(sequenceGAD),4))
for(r in 1:R){ # Cycle over the R simulations of the previous step
for(i in 1:100){ # Cycle over the GAD grid
pred_VB[r,i,] <- c(LSBP_density(sequenceGAD[i],X1,X2,
fit_VB$beta_mixing_sim[r,,],
fit_VB$beta_kernel_sim[r,,],
fit_VB$tau_sim[r,]))/sd(dde$GAD)
}
}
# Posterior mean and quantiles
estimate_VB <- apply(pred_VB,c(2,3),mean)
lower_VB <- apply(pred_VB,c(2,3),function(x) quantile(x,0.025))
upper_VB <- apply(pred_VB,c(2,3),function(x) quantile(x,0.975))
The construction of Figure 4 of the paper proceeds as follows
# Construction of the data_frame - Notice that the values are reconducted to the original scale.
data.plot <- data.frame(
prediction = c(c(estimate_Gibbs),c(estimate_ECM),c(estimate_VB)),
lower = c(c(lower_Gibbs),rep(NA,100*4),c(lower_VB)),
upper = c(c(upper_Gibbs),rep(NA,100*4),c(upper_VB)),
sequenceGAD = rep(sequenceGAD,3*4)*sd(dde$GAD) + mean(dde$GAD),
DDE.points = rep(rep(DDE.points,each=100),3)*sd(dde$DDE)+ mean(dde$DDE),
Algorithm = c(rep("Gibbs sampler",4*100),rep("ECM",4*100),rep("Variational Bayes",4*100))
)
data.plot2 <- data.frame(
GAD = rep(c(dde$GAD[which(dde$DDE < 20.505)],
dde$GAD[which(dde$DDE >= 20.505 & dde$DDE < 41.08)],
dde$GAD[which(dde$DDE >= 41.08 & dde$DDE < 79.6)],
dde$GAD[which(dde$DDE > 79.6)]),3),
DDE.points = rep(c(rep(12.57,sum(dde$DDE < 20.505)),
rep(28.44,sum(dde$DDE >= 20.505 & dde$DDE < 41.08)),
rep(53.72,sum(dde$DDE >= 41.08 & dde$DDE < 79.6)),
rep(105.47,sum(dde$DDE > 79.6))),3),
Algorithm = c(rep("Gibbs sampler",nrow(dde)),rep("ECM",nrow(dde)),rep("Variational Bayes",nrow(dde)))
)
ggplot(data=data.plot) + geom_line(aes(x=sequenceGAD,y=prediction)) + facet_grid(Algorithm~ DDE.points,scales="free_y") + ylab("Density") + geom_ribbon(alpha=0.4,aes(x=sequenceGAD,ymin=lower,ymax=upper)) +xlab("Gestational age at delivery") + geom_histogram(data=data.plot2,aes(x=GAD,y=..density..),alpha=0.2,bins=25) + theme_bw()
Finally, we produce the conditional probability of being under a threshold T of Figure 5 of the paper
# Notice that the GAD and DDE values are reconducted to the original scale.
gibbs_cdf <- cbind(predict(fit_Gibbs,type="cdf",threshold=(33*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata),
predict(fit_Gibbs,type="cdf",threshold=(35*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata),
predict(fit_Gibbs,type="cdf",threshold=(37*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata),
predict(fit_Gibbs,type="cdf",threshold=(40*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata))
vb_cdf <- cbind(predict(fit_VB,type="cdf",threshold=(33*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata),
predict(fit_VB,type="cdf",threshold=(35*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata),
predict(fit_VB,type="cdf",threshold=(37*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata),
predict(fit_VB,type="cdf",threshold=(40*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata))
ECM_cdf <- c(predict(fit_ECM,type="cdf",threshold=(33*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata),
predict(fit_ECM,type="cdf",threshold=(35*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata),
predict(fit_ECM,type="cdf",threshold=(37*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata),
predict(fit_ECM,type="cdf",threshold=(40*7 - mean(dde$GAD))/sd(dde$GAD),newdata=newdata))
data.cdf <- data.frame(DDE=rep(sequenceDDE,3*4)*sd(dde$DDE) + mean(dde$DDE),
Algorithm=rep(c("ECM","Gibbs sampler","Variational Bayes"),each=4*length(sequenceDDE)),
CDF = c(ECM_cdf,colMeans(gibbs_cdf),colMeans(vb_cdf)),
Threshold = rep(rep(c("T = 33","T = 35","T = 37","T = 40"),each=length(sequenceDDE)),3),
Upper = c(rep(NA,4*length(sequenceDDE)),
apply(gibbs_cdf,2,function(x) quantile(x,0.975)),
apply(vb_cdf,2,function(x) quantile(x,0.975))),
Lower = c(rep(NA,4*length(sequenceDDE)),
apply(gibbs_cdf,2,function(x) quantile(x,0.025)),
apply(vb_cdf,2,function(x) quantile(x,0.025))))
ggplot(data=data.cdf,aes(x=DDE,y=CDF,ymin=Lower,ymax=Upper)) + geom_line() + facet_grid(Algorithm~Threshold)+ xlab("DDE (mg/L)")+ylab("Pr(Gestational Length < T)") + geom_ribbon(alpha=0.2,col="white") + theme_bw()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.