### function defining the EM algorithm for the CFR estimates
##' A function to estimate the relative case fatality ratio when reporting rates
##' are time-varying and deaths are lagged because of survival time.
##'
##' This function implements an EM algorithm to estimate the relative case
##' fatality ratio between two groups when reporting rates are time-varying and
##' deaths are lagged because of survival time.
##' @usage EMforCFR(assumed.nu, alpha.start.values, full.data, max.iter = 50,
##' verb = FALSE, tol = 1e-10, SEM.var = TRUE)
##'
##' @param assumed.nu a vector of probabilities corresponding to the survival
##' distribution, i.e. nu[i]=Pr(surviving i days | fatal case)
##' @param alpha.start.values a vector starting values for the reporting rate
##' parameter of the GLM model. This must have length which corresponds to one
##' less than the number of unique integer values of full.dat[,"new.times"].
##' @param full.data A matrix of observed data. See description below.
##' @param max.iter The maximum number of iterations for the EM algorithm and
##' the accompanying SEM algorithm (if used).
##' @param verb An indicator for whether the function should print results as it
##' runs.
##' @param tol A tolerance to use to test for convergence of the EM algorithm.
##' @param SEM.var If TRUE, the SEM algorithm will be run in addition to the EM
##' algorithm to calculate the variance of the parameter estimates.
##'
##' @details The data matrix full.data must have the following columns:
##' \describe{ \item{grp}{a 1 or a 2 indicating which of the two groups, j,
##' the observation is for.} \item{new.times}{an integer value representing
##' the time, t, of observation.} \item{R}{the count of recovered cases with
##' onset at time t in group j.} \item{D}{the count of deaths which occurred at
##' time t in groupo j (note that these deaths did not have disease onset at
##' time t but rather died at time t).} \item{N}{the total cases at t, j, or
##' the sum of R and D columns.} }
##'
##' @return A list with the following elements \describe{ \item{naive.rel.cfr
##' }{the naive estimate of the relative case fatality ratio}
##' \item{glm.rel.cfr }{the reporting-rate-adjusted estimate of the relative
##' case fatality ratio} \item{EM.rel.cfr }{the lag-adjusted estimate of the
##' relative case fatality ratio} \item{EM.re.cfr.var }{the variance for the
##' log-scale lag-adjusted estimator taken from the final M-step}
##' \item{EM.rel.cfr.var.SEM }{ the Supplemented EM algorithm variance for the
##' log-scale lag-adjusted estimator} \item{EM.rel.cfr.chain }{a vector of the
##' EM algorithm iterates of the lag-adjusted relative CFR estimates}
##' \item{EMiter}{the number of iterations needed for the EM algorithm to
##' converge} \item{EMconv}{indicator for convergence of the EM algorithm. 0
##' indicates all parameters converged within max.iter iterations. 1 indicates
##' that the estimate of the relative case fatality ratio converged but other
##' did not. 2 indicates that the relative case fatality ratio did not
##' converge.} \item{SEMconv}{indicator for convergence of SEM algorithm.
##' Same scheme as EMconv.} \item{ests}{ the coefficient estimates for the
##' model } \item{ests.chain}{ a matrix with all of the coefficient estimates,
##' at each EM iteration} \item{DM}{the DM matrix from the SEM algorithm}
##' \item{DMiter}{a vector showing how many iterations it took for the
##' variance component to converge in the SEM algorithm} }
##' @examples
##' ## This is code from the CFR vignette provided in the documentation.
##'
##' data(simulated.outbreak.deaths)
##' min.cases <- 10
##' N.1 <- simulated.outbreak.deaths[1:60, "N"]
##' N.2 <- simulated.outbreak.deaths[61:120, "N"]
##' first.t <- min(which(N.1 > min.cases & N.2 > min.cases))
##' last.t <- max(which(N.1 > min.cases & N.2 > min.cases))
##' idx.for.Estep <- first.t:last.t
##' new.times <- 1:length(idx.for.Estep)
##' simulated.outbreak.deaths <- cbind(simulated.outbreak.deaths, new.times = NA)
##' simulated.outbreak.deaths[c(idx.for.Estep, idx.for.Estep + 60), "new.times"] <- rep(new.times, + 2)
##' assumed.nu = c(0, 0.3, 0.4, 0.3)
##' alpha.start <- rep(0, 22)
##'
##' ## caution! this next line may take several minutes (5-10, depanding on
##' ## the speed of your machine) to run.
##' \dontrun{cfr.ests <- EMforCFR(assumed.nu = assumed.nu,
##' alpha.start.values = alpha.start,
##' full.data = simulated.outbreak.deaths,
##' verb = FALSE,
##' SEM.var = TRUE,
##' max.iter = 500,
##' tol = 1e-05)}
##' @keywords coarse data
##' @keywords incomplete data
##' @keywords case fatality ratio
##' @keywords infectious disease
##' @export
EMforCFR <- function(assumed.nu, alpha.start.values, full.data,
max.iter = 50, verb = FALSE, tol = 1e-10, SEM.var = TRUE) {
## full.data is the data output from the observe.epidemic() function
## with a column specifying the rows to be used for analysis
#######################################
## calculate naive and GLM estimates ##
#######################################
D.1 <- sum(full.data[full.data[, "grp"] == 1 & !is.na(full.data[, "new.times"]), "D"])
D.2 <- sum(full.data[full.data[, "grp"] == 2 & !is.na(full.data[, "new.times"]), "D"])
N.1 <- sum(full.data[full.data[, "grp"] == 1 & !is.na(full.data[, "new.times"]), "N"])
N.2 <- sum(full.data[full.data[, "grp"] == 2 & !is.na(full.data[, "new.times"]), "N"])
naive.rel.cfr <- sum(D.2) / sum(N.2) / (sum(D.1) / sum(N.1))
dat <- data.frame(full.data)
unadj.glm.fit <- glm(dat[, "D"] ~ factor(dat[, "new.times"]) + factor(dat[, "grp"]),
offset = log(dat[, "N"]), family = poisson
)
glm.rel.cfr <- exp(coef(unadj.glm.fit))[length(coef(unadj.glm.fit))]
#########################
## set up EM algorithm ##
#########################
proposed.rel.cfr.chain <- rep(0, max.iter)
n.params <- max(dat[, "new.times"], na.rm = TRUE) + 1
phi.chain <- matrix(nrow = n.params, ncol = max.iter)
nlag <- length(assumed.nu)
## convergence codes:
## 0 = all parameters converge
## 1 = rCFR converges, but some other parameters do not
## 2 = rCFR does not converge
EMconv <- SEMconv <- 0
## looping variables
eps <- 1
j <- 0
alpha <- alpha.start.values
while (eps > tol) {
j <- j + 1
############
## E STEP ##
############
dat <- run.Estep(alpha,
full.data = full.data,
nlag = nlag, assumed.nu = assumed.nu
)
############
## M STEP ##
############
maxLik <- run.Mstep(dat)
phi.chain[, j] <- maxLik$phi
alpha <- phi.chain[2:(n.params - 1), j]
proposed.rel.cfr.chain[j] <- exp(phi.chain[n.params, j])
## check convergence
if (j > 1) eps <- max((phi.chain[, j] - phi.chain[, j - 1])^2)
if (j == max.iter) {
## message("*** WARNING: EM agorithm didn't converge ***")
if ((phi.chain[n.params, j] - phi.chain[n.params, j - 1])^2 < eps) {
EMconv <- 1
} else {
EMconv <- 2
}
break
}
}
var.joint <- maxLik$Var
phi <- phi.chain[, j]
proposed.rel.cfr <- proposed.rel.cfr.chain[j]
EMiter <- j
if (verb) {
print(paste("naive estimator =", round(naive.rel.cfr, 3)))
print(paste("GLM estimate unadjusted for lags =", round(glm.rel.cfr, 3)))
print(paste(EMiter, "iterations for EM convergence"))
}
if (SEM.var) {
DM.out <- SEM.variance(full.data = full.data, dat, phi, max.iter, tol, nlag = nlag, alpha.start.values, assumed.nu = assumed.nu)
DM <- DM.out$DM
DMiter <- DM.out$DMiter
if (length(DM.out$loop.idx) > 0) {
loop.idx <- DM.out$loop.idx
var.joint <- var.joint[-loop.idx, -loop.idx]
}
## set SEM convergence codes
if (any(DMiter == 0)) {
if (DMiter[length(DMiter)] == 0) {
SEMconv <- 2
} else {
SEMconv <- 1
}
}
## calculate CFR variance if it converged
if (SEMconv < 2) {
variance.SEM <- var.joint +
var.joint %*% DM %*% solve(diag(1, ncol(DM)) - DM)
proposed.rel.cfr.var.SEM <- variance.SEM[nrow(DM), nrow(DM)]
if (proposed.rel.cfr.var.SEM < 0) {
proposed.rel.cfr.var.SEM <- NA
SEMconv <- 2
}
} else {
proposed.rel.cfr.var.SEM <- NA
}
if (verb) {
print("Estimate with SEM")
print(paste(
"proposed CFR =", round(proposed.rel.cfr, 3),
"; 95% CI (",
round(
exp(log(proposed.rel.cfr) - 2 * sqrt(proposed.rel.cfr.var.SEM)),
3
),
",",
round(
exp(log(proposed.rel.cfr) + 2 * sqrt(proposed.rel.cfr.var.SEM)),
3
),
")"
))
if (any(DM.out$DMiter == 0)) {
print("non-convergent DM indices:")
print(which(DM.out$DMiter == 0))
}
}
} else {
SEMconv <- NA
DM <- NULL
DMiter <- NULL
proposed.rel.cfr.var.SEM <- NULL
}
## ##############
## OUTPUT LIST ##
## ##############
out <- list(
naive.rel.cfr = naive.rel.cfr,
glm.rel.cfr = glm.rel.cfr,
EM.rel.cfr = proposed.rel.cfr,
EM.rel.cfr.var = var.joint[nrow(var.joint), nrow(var.joint)],
EM.rel.cfr.var.SEM = proposed.rel.cfr.var.SEM,
EM.rel.cfr.chain = proposed.rel.cfr.chain,
EMiter = EMiter,
EMconv = EMconv,
SEMconv = SEMconv,
ests = phi,
ests.chain.EM = phi.chain,
DM = DM,
DMiter = DMiter
)
if (verb) {
print("Estimate with just GLM variance")
print(paste(
"proposed CFR =", round(proposed.rel.cfr, 3),
"; 95% CI (",
round(
exp(log(proposed.rel.cfr) - 2 * sqrt(out$EM.rel.cfr.var)),
3
),
",",
round(
exp(log(proposed.rel.cfr) + 2 * sqrt(out$EM.rel.cfr.var)),
3
),
")"
))
}
return(out)
}
## This function is meant to be run only through the function EMforCFR()
## and is used to calculate the variance via the Supplemented EM
## algorithm (see Meng and Rubin, 1991)
## params:
## \item{full.data}{ A matrix of observed data. See description in EMforCFR helpfile.}
## \item{dat}{A data frame.}
## \item{phi}{ A vector of fitted parameters from the final EM iteration.}
## \item{max.iter}{ The maximum number of iterations for SEM algorithm. }
## \item{tol}{ A tolerance to use to test for convergence of the EM algorithm. }
## \item{nlag}{ The number of time units for lagged data. Corresponds to length(assumed.nu).}
## \item{alpha.start.values}{ a vector starting values for the reporting rate parameter of the GLM model. This must have length which corresponds to one less than the number of unique integer values of full.dat[,"new.times"].}
## \item{assumed.nu}{ a vector of probabilities corresponding to the survival distribution, i.e. nu[i]=Pr(surviving i days | fatal case) }
## returned list:
## \item{DM}{The estimate of the variance-covariance matrix for the model parameters. Only converged rows are returned.}
## \item{DMiter}{A vector whose ith entry is the number of iterations needed for convergence of the ith row of the DM matrix.}
## \item{loop.idx}{If not NULL, the values correspond to the original indices of DM which have been omitted because of lack of convergence.}
SEM.variance <- function(full.data, dat, phi, max.iter, tol, nlag,
alpha.start.values, assumed.nu) {
## algorithm parameters
iter <- 0
phi.hat <- phi ## using phi from notation in Bayesian Data Analysis (Gelman, p323)
phi0 <- c(1, alpha.start.values, 1) ## the starting values for the parameters: c(beta0, alpha2-T, gamma2)
n.params <- length(phi0)
## sequence of matrices which will converge to DM
Rt <- array(dim = c(n.params, n.params, max.iter))
DM <- matrix(nrow = n.params, ncol = n.params)
DMiter <- rep(0, n.params)
## sequence of coefficient estimates
phi.t <- matrix(nrow = n.params, ncol = max.iter + 1)
alpha.t <- alpha.start.values
phi.t[, 1] <- phi0
loop.idx <- 1:n.params
while (length(loop.idx) > 0) {
iter <- iter + 1
## THIS EM CALCULATES PHI^(T+1)
## E step
dat <- run.Estep(alpha.t,
nlag = nlag,
full.data = full.data, assumed.nu = assumed.nu
)
## M step
phi.t[, iter + 1] <- run.Mstep(dat)$phi
## generate Rt given phi.t
for (i in loop.idx) {
## defining phi.tmp here as phi^t(i) from Gelman
phi.t.i <- phi.hat
phi.t.i[i] <- phi.t[i, iter]
alpha.t.i <- phi.t.i[2:(n.params - 1)]
## E step
dat <- run.Estep(alpha.t.i,
nlag = nlag,
full.data = full.data, assumed.nu = assumed.nu
)
## M step
phi.tplus1.i <- run.Mstep(dat)$phi
## fis the ith row of the current Rt matrix
Rt[i, , iter] <- (phi.tplus1.i - phi.hat) / (phi.t[i, iter] - phi.hat[i])
if (iter == 1) next
if (all((Rt[i, , iter] - Rt[i, , iter - 1])^2 < sqrt(tol))) {
loop.idx <- loop.idx[-which(loop.idx == i)]
DM[i, ] <- Rt[i, , iter]
DMiter[i] <- iter
}
}
## renew alpha.t
alpha.tplus1 <- phi.t[2:(n.params - 1), iter + 1]
alpha.t <- alpha.tplus1
if (iter == max.iter) break("maximum iterations reached")
}
if (length(loop.idx) > 0) DM <- DM[-loop.idx, -loop.idx]
DM.out <- list(DM = DM, DMiter = DMiter, loop.idx = loop.idx)
return(DM.out)
}
## E-step of the EM algorithm.
# \arguments{
# \item{alpha}{Current estimates of the alpha parameters from the GLM model.}
# \item{full.data}{ A matrix of observed data. See description in EMforCFR helpfile.}
# \item{nlag}{ The number of time units for lagged data. Corresponds to length(assumed.nu).}
# \item{assumed.nu}{ a vector of probabilities corresponding to the survival distribution, i.e. nu[i]=Pr(surviving i days | fatal case) }
# }
# \value{ A data matrix with the same format as full.data from the EMforCFR() documentation.
# }
run.Estep <- function(alpha, full.data, nlag, assumed.nu) {
## storage for reconstructed deaths
n.times <- nrow(full.data) / 2
idx.to.reconstruct <- which(full.data[, "grp"] == 1 & !is.na(full.data[, "new.times"]))
times.to.reconstruct <- full.data[idx.to.reconstruct, "time"]
Dhat.1 <- Dhat.2 <- rep(0, n.times)
## define the data
N.1 <- full.data[1:n.times, "N"]
N.2 <- full.data[(n.times + 1):(2 * n.times), "N"]
D.1 <- full.data[1:n.times, "D"]
D.2 <- full.data[(n.times + 1):(2 * n.times), "D"]
## set the alpha coefficients
alpha.long <- rep(0, n.times)
alpha.long[times.to.reconstruct[-1]] <- alpha
alpha.long[(max(times.to.reconstruct) + 1):n.times] <- alpha[length(alpha)]
## ########
## loops for calculating expected value of the deaths
for (t in times.to.reconstruct) {
denom.1 <- denom.2 <- rep(0, nlag)
for (i in 1:nlag) {
N.idx <- (t + i - 1):(t + i - nlag)
nu.idx <- seq_along(N.idx)
## group 1
denom.1[i] <- sum(assumed.nu[nu.idx] * N.1[N.idx] * exp(alpha.long[N.idx]))
## group 2
denom.2[i] <- sum(assumed.nu[nu.idx] * N.2[N.idx] * exp(alpha.long[N.idx]))
}
## exclude 0s from denominator
denom.1[denom.1 == 0] <- NA
denom.2[denom.2 == 0] <- NA
## sum them up
D.idx <- (t + 1):(t + nlag)
Dhat.1[t] <- N.1[t] * exp(alpha.long[t]) * sum(D.1[D.idx] * assumed.nu[nu.idx] / denom.1, na.rm = TRUE)
Dhat.2[t] <- N.2[t] * exp(alpha.long[t]) * sum(D.2[D.idx] * assumed.nu[nu.idx] / denom.2, na.rm = TRUE)
}
dat <- cbind(
Dhat = c(Dhat.1, Dhat.2),
new.times = full.data[, "new.times"],
grp = full.data[, "grp"],
N = full.data[, "N"]
)
return(dat)
}
## the M-step
# \arguments{
# \item{dat}{data matrix passed from EMforCFR().}
# }
# \value{ A list with two components
# \item{phi }{fitted vector of parameters}
# \item{Var }{variance-covariance matrix from the fitted model}
# }
run.Mstep <- function(dat) {
subset <- which(dat[, "N"] > 0)
fit <- glm(dat[, "Dhat"] ~ factor(dat[, "new.times"]) + factor(dat[, "grp"]),
offset = log(dat[, "N"]), family = poisson, subset = subset
)
phi <- coef(fit)
out <- list(phi = phi, Var = vcov(fit))
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.