`refitME`: Vignette for fitting measurement error models using Monte Carlo Expectation Maximization in `R`

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(fig.pos = 'H')

This vignette documents fitting an MCEM algorithm via the refitME R-package. For more specific details see: refitME: Measurement Error Modelling using Monte Carlo Expectation Maximization in R. Also, see ?refitME for further details on the fitting function, input arguments and output.

Example 1: A simple GLM example taken from Carroll et al. (2006).

We use the Framingham heart study data set. In addition to the naive model, we also fit a simex model and compare it with MCEM. Computational times for both models are also reported.

Load data and R-packages.

suppressWarnings(suppressMessages(library(refitME)))
suppressWarnings(suppressMessages(library(simex)))

set.seed(2020)

data(Framinghamdata)

Fit the naive model.

The first stored variable w1 is the error contaminated predictor as used in the original Carroll et al. (2006) analysis.

glm_naiv <- glm(Y ~ w1 + z1 + z2 + z3, x = TRUE, family = binomial, 
                 data = Framinghamdata)

Setup measurement error (ME) variance and all tuning parameters.

sigma.sq.u <- 0.006295 # ME variance, as obtained from Carroll et al. (2006).
B <- 100  # The number of Monte Carlo replication values/SIMEX simulations.

\clearpage

Fit the SIMEX model.

start <- Sys.time()
glm_simex <- simex(glm_naiv, SIMEXvariable = c("w1"),
                  measurement.error = cbind(sqrt(sigma.sq.u)), B = B) # SIMEX.
end <- Sys.time()
t1 <- difftime(end, start, units = "secs")
comp.time <- c(t1)

Fit the MCEM model.

start <- Sys.time()
glm_MCEM <- refitME(glm_naiv, sigma.sq.u, B)
end <- Sys.time()
t2 <- difftime(end, start, units = "secs")
comp.time <- c(comp.time, t2)

Report model estimates and compare computational times.

est.beta <- rbind(coef(glm_naiv), coef(glm_simex), coef(glm_MCEM))
est.beta.se <- rbind(sqrt(diag(vcov(glm_naiv))),
                   sqrt(diag(glm_simex$variance.jackknife)), sqrt(diag(vcov(glm_MCEM))))
row.names(est.beta) = row.names(est.beta.se) <- c("Naive GLM", "SIMEX", "MCEM")
colnames(est.beta) = colnames(est.beta.se) <- c("(Intercept)", "SBP", "chol. level", 
                                                "age", "smoke")
round(est.beta, digits = 3)
round(est.beta.se, digits = 3)  # Standard error estimates.

names(comp.time) <- c("SIMEX", "MCEM")
comp.time  # SIMEX and MCEM.

Example 2: A GAM example taken from Ganguli et al. (2005).

The Milan mortality air pollution data set is part of the SemiPar package. Here, we fit GAM models via the mgcv package where one predictor (daily total suspended particles measurements) is error-contaminated.

Load data and R-packages.

suppressWarnings(suppressMessages(library(refitME)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(SemiPar)))
suppressWarnings(suppressMessages(library(mgcv)))

set.seed(2021)

data(milan.mort)
dat.air <- sample_n(milan.mort, 100) # Take a random sample of 100.

Setup all variables.

Y <- dat.air[, 6]  # Mortality counts.
n <- length(Y)

z1 <- (dat.air[, 1])
z2 <- (dat.air[, 4])
z3 <- (dat.air[, 5])
w1 <- log(dat.air[, 9]) # The error-contaminated predictor (total suspended particles).
dat <- data.frame(cbind(Y, w1, z1, z2, z3))

Fit the naive model.

gam_naiv <- gam(Y ~ s(w1) + s(z1, k = 25) + s(z2) + s(z3), family = "poisson", data = dat)
plot_gam_naiv <- plot(gam_naiv, select = 1)

\clearpage

Fit the MCEM model.

sigma.sq.u <- 0.0915 # This gives a reliability ratio of 0.7.
rel.rat <- round((1 - sigma.sq.u/var(dat$w1))*100, digits = 0)

gam_MCEM1 <- refitME(gam_naiv, sigma.sq.u)

```r Plots of smooths against each predictor. TSP (top left is the error contaminated variable)."} xlab.names <- c("log(TSP)", "Day", "Temp", "Humidity")

op <- par(mfrow = c(2, 2), las = 1)

for(i in 1:4) { if (i == 1) { plot(gam_MCEM1, select = i, ylim = c(-0.35, 0.2), xlim = range(plot_gam_naiv[[1]]$x), rug = FALSE, col = "blue", all.terms = TRUE, xlab = xlab.names[i], ylab = "s(Mortality counts)", lwd = 2, cex.lab = 1.3, cex.axis = 1.3, cex.main = 2, font.lab = 1.1, cex = 1.4, shade = T) lines(plot_gam_naiv[[1]]$x, plot_gam_naiv[[1]]$fit, type = "l", col = "red", lwd = 2, lty = 2) title(main = bquote("Reliability ratio of predictor is"~.(rel.rat) ~ "%"), outer = F, line = 1, cex = 1.4) legend("bottomright", c("Naive GAM", "MCEM GAM"), col = c("red", "blue"), lty = c(2, 1), lwd = 2, bty = "n") for(j in 1:2) { axis(j, labels = FALSE) } } if (i == 2) { plot(gam_MCEM1, select = i, ylim = c(-0.25, 0.3), rug = FALSE, col = "blue", all.terms = TRUE, xlab = xlab.names[i], ylab = "s(Mortality counts)", lwd = 2, cex.lab = 1.3, cex.axis = 1.3, cex.main = 2, font.lab = 1.1, cex = 1.4, shade = T) lines(plot_gam_naiv[[2]]$x, plot_gam_naiv[[2]]$fit, type = "l", col = "red", lwd = 2, lty = 2) for(j in 1:2) { axis(j, labels = FALSE) } } if (i == 3) { plot(gam_MCEM1, select = i, ylim = c(-0.4, 0.4), rug = FALSE, col = "blue", all.terms = TRUE, xlab = xlab.names[i], ylab = "s(Mortality counts)", lwd = 2, cex.lab = 1.3, cex.axis = 1.3, cex.main = 2, font.lab = 1.1, cex = 1.4, shade = T) lines(plot_gam_naiv[[3]]$x, plot_gam_naiv[[3]]$fit, type = "l", col = "red", lwd = 2, lty = 2) for(j in 1:2) { axis(j, labels = FALSE) } } if (i == 4) { plot(gam_MCEM1, select = i, ylim = c(-0.1, 0.1), rug = FALSE, col = "blue", all.terms = TRUE, xlab = xlab.names[i], ylab = "s(Mortality counts)", lwd = 2, cex.lab = 1.1, cex.axis = 1.1, cex.main = 2, font.lab = 1.1, cex = 1.4, shade = T) lines(plot_gam_naiv[[4]]$x, plot_gam_naiv[[4]]$fit, type = "l", col = "red", lwd = 2, lty = 2) for(j in 1:2) { axis(j, labels = FALSE) } } }

title(main = "MCEM (Poisson GAM) fitted to the air pollution data.", outer = T, line = -2) par(op)

detach(package:mgcv)

\clearpage

## Example 3: A point-process model using presence-only data

We use the _Corymbia eximia_ presence-only data set from Renner and Warton (2013). Here, we fit a naive models point-process model (PPM) and an MCEM PPM.

### Load data and `R`-packages.

```r
suppressWarnings(suppressMessages(library(refitME)))
suppressWarnings(suppressMessages(library(caret)))

data(Corymbiaeximiadata)

dat <- Corymbiaeximiadata

Setup all variables.

Y1 <- dat$Y.obs

Rain <- dat$Rain
D.Main <- dat$D.Main
MNT <- dat$MNT

p.wt <- rep(1.e-6, length(Y1))
p.wt[Y1 == 0] <- 1

Y <- Y1/p.wt

X <- cbind(rep(1, length(Y)), poly(MNT, degree = 2, raw = TRUE),
          poly(Rain, degree = 2, raw = TRUE),
            poly(sqrt(D.Main), degree = 2, raw = TRUE))

colnames(X) <- c("(Intercept)", "X1", "X2", "Z1", "Z2", "Z3", "Z4")

dat1 <- data.frame(cbind(Y, X, p.wt))
colnames(dat1)[c(1, ncol(dat1))] <- c("Y", "p.wt")

Fit the naive (PPM) model.

PPM_naiv1 <- glm(Y ~ X1 + X2 + Z1 + Z2 + Z3 + Z4, family = "poisson",
 weights = p.wt, data = dat1)

\clearpage

Fit the MCEM PPM.

sigma.sq.u <- 0.25

B <- 20 # Consider increasing `B` if you want a more accurate answer.

PPM_MCEM1 <- refitME(PPM_naiv1, sigma.sq.u, B) # Should take about 2 mins.

Plot the predicted presences of Corymbia eximia

```r Plot of predicted presences of Corymbia eximia using presence-only data when fitting the MCEM model. Here the max temperature predictor is assumed to be error-contaminated."} coord.dat <- cbind(Corymbiaeximiadata$X, Corymbiaeximiadata$Y)

colnames(coord.dat) <- c("Longitude", "Latitude")

pred.dats <- as.data.frame(cbind(coord.dat[, 1], coord.dat[, 2], PPM_MCEM1$fitted.values))

colnames(pred.dats) <- c("x", "y", "preds")

levelplot(preds ~ x + y, data = pred.dats, asp = "iso", ylab = "Latitude", xlab = "Longitude", col.regions = heat.colors(1024)[900:1], cuts = 900, main = list("", cex = 5), scales = list(y = list(draw = FALSE), x = list(draw = FALSE)), colorkey = list(labels = list(cex = 0.8)))

## Example 4: A `VGAM` example using the Prinia flaviventris capture-recapture data.

We use the _Prinia flaviventris_ capture-recapture data set from Hwang, Huang and Wang (2007). Here, we fit naive `vglm` and `vgam` capture-recapture models, and the MCEM capture-recapture model. For all models we used the `posbinomial()` family provided in `VGAM`.

### Load data and `R`-packages.

```r
suppressWarnings(suppressMessages(library(refitME)))
suppressMessages(library(VGAM))
data(Priniadata)

Setup all variables.

tau <- 17   # No. of capture occasions.
w2 <- c(scale(Priniadata$w1)) # Standardize the wing length predictor.
Priniadata$w2 <- w2

Fit the naive vglm and vgam capture-recapture models.

CR_naiv1 <- vglm(cbind(cap, noncap) ~ w2,
                 VGAM::posbinomial(omit.constant = TRUE, 
                                   parallel = TRUE ~ w2),
                 data = Priniadata, trace = FALSE)
CR_naiv2 <- vgam(cbind(cap, noncap) ~ s(w2, df = 2),
                 VGAM::posbinomial(omit.constant = TRUE, 
                                   parallel = TRUE ~ s(w2, df = 2)),
                 data = Priniadata, trace = FALSE)

Fit the MCEM capture-recapture model.

sigma.sq.u <- 0.37/var(Priniadata$w1) # Measurement error variance.
B <- 100
CR_MCEM <- refitME(CR_naiv2, sigma.sq.u, B)

detach(package:VGAM)


Try the refitME package in your browser

Any scripts or data that you put into this service are public.

refitME documentation built on Aug. 3, 2021, 5:07 p.m.