Nothing
### function defining the EM algorithm for the CFR estimates
##' A function to estimate the relative case fatality ratio when reporting rates
##' are time-varying and deaths are lagged because of survival time.
##'
##' This function implements an EM algorithm to estimate the relative case
##' fatality ratio between two groups when reporting rates are time-varying and
##' deaths are lagged because of survival time.
##' @usage EMforCFR(assumed.nu, alpha.start.values, full.data, max.iter = 50,
##' verb = FALSE, tol = 1e-10, SEM.var = TRUE)
##'
##' @param assumed.nu a vector of probabilities corresponding to the survival
##' distribution, i.e. nu[i]=Pr(surviving i days | fatal case)
##' @param alpha.start.values a vector starting values for the reporting rate
##' parameter of the GLM model. This must have length which corresponds to one
##' less than the number of unique integer values of full.dat[,"new.times"].
##' @param full.data A matrix of observed data. See description below.
##' @param max.iter The maximum number of iterations for the EM algorithm and
##' the accompanying SEM algorithm (if used).
##' @param verb An indicator for whether the function should print results as it
##' runs.
##' @param tol A tolerance to use to test for convergence of the EM algorithm.
##' @param SEM.var If TRUE, the SEM algorithm will be run in addition to the EM
##' algorithm to calculate the variance of the parameter estimates.
##'
##' @details The data matrix full.data must have the following columns:
##' \describe{ \item{grp}{a 1 or a 2 indicating which of the two groups, j,
##' the observation is for.} \item{new.times}{an integer value representing
##' the time, t, of observation.} \item{R}{the count of recovered cases with
##' onset at time t in group j.} \item{D}{the count of deaths which occurred at
##' time t in groupo j (note that these deaths did not have disease onset at
##' time t but rather died at time t).} \item{N}{the total cases at t, j, or
##' the sum of R and D columns.} }
##'
##' @return A list with the following elements \describe{ \item{naive.rel.cfr
##' }{the naive estimate of the relative case fatality ratio}
##' \item{glm.rel.cfr }{the reporting-rate-adjusted estimate of the relative
##' case fatality ratio} \item{EM.rel.cfr }{the lag-adjusted estimate of the
##' relative case fatality ratio} \item{EM.re.cfr.var }{the variance for the
##' log-scale lag-adjusted estimator taken from the final M-step}
##' \item{EM.rel.cfr.var.SEM }{ the Supplemented EM algorithm variance for the
##' log-scale lag-adjusted estimator} \item{EM.rel.cfr.chain }{a vector of the
##' EM algorithm iterates of the lag-adjusted relative CFR estimates}
##' \item{EMiter}{the number of iterations needed for the EM algorithm to
##' converge} \item{EMconv}{indicator for convergence of the EM algorithm. 0
##' indicates all parameters converged within max.iter iterations. 1 indicates
##' that the estimate of the relative case fatality ratio converged but other
##' did not. 2 indicates that the relative case fatality ratio did not
##' converge.} \item{SEMconv}{indicator for convergence of SEM algorithm.
##' Same scheme as EMconv.} \item{ests}{ the coefficient estimates for the
##' model } \item{ests.chain}{ a matrix with all of the coefficient estimates,
##' at each EM iteration} \item{DM}{the DM matrix from the SEM algorithm}
##' \item{DMiter}{a vector showing how many iterations it took for the
##' variance component to converge in the SEM algorithm} }
##' @examples
##' ## This is code from the CFR vignette provided in the documentation.
##'
##' data(simulated.outbreak.deaths)
##' min.cases <- 10
##' N.1 <- simulated.outbreak.deaths[1:60, "N"]
##' N.2 <- simulated.outbreak.deaths[61:120, "N"]
##' first.t <- min(which(N.1 > min.cases & N.2 > min.cases))
##' last.t <- max(which(N.1 > min.cases & N.2 > min.cases))
##' idx.for.Estep <- first.t:last.t
##' new.times <- 1:length(idx.for.Estep)
##' simulated.outbreak.deaths <- cbind(simulated.outbreak.deaths, new.times = NA)
##' simulated.outbreak.deaths[c(idx.for.Estep, idx.for.Estep + 60), "new.times"] <- rep(new.times, + 2)
##' assumed.nu = c(0, 0.3, 0.4, 0.3)
##' alpha.start <- rep(0, 22)
##'
##' ## caution! this next line may take several minutes (5-10, depanding on
##' ## the speed of your machine) to run.
##' \dontrun{cfr.ests <- EMforCFR(assumed.nu = assumed.nu,
##' alpha.start.values = alpha.start,
##' full.data = simulated.outbreak.deaths,
##' verb = FALSE,
##' SEM.var = TRUE,
##' max.iter = 500,
##' tol = 1e-05)}
##' @keywords coarse data
##' @keywords incomplete data
##' @keywords case fatality ratio
##' @keywords infectious disease
##' @export
EMforCFR <- function(assumed.nu, alpha.start.values, full.data,
max.iter=50, verb=FALSE, tol=1e-10, SEM.var=TRUE){
## full.data is the data output from the observe.epidemic() function
## with a column specifying the rows to be used for analysis
#######################################
## calculate naive and GLM estimates ##
#######################################
D.1 <- sum(full.data[full.data[,"grp"]==1&!is.na(full.data[,"new.times"]),"D"])
D.2 <- sum(full.data[full.data[,"grp"]==2&!is.na(full.data[,"new.times"]),"D"])
N.1 <- sum(full.data[full.data[,"grp"]==1&!is.na(full.data[,"new.times"]),"N"])
N.2 <- sum(full.data[full.data[,"grp"]==2&!is.na(full.data[,"new.times"]),"N"])
naive.rel.cfr <- sum(D.2)/sum(N.2)/(sum(D.1)/sum(N.1))
dat <- data.frame(full.data)
unadj.glm.fit <- glm(dat[,"D"] ~ factor(dat[,"new.times"]) + factor(dat[,"grp"]),
offset=log(dat[,"N"]), family=poisson)
glm.rel.cfr <- exp(coef(unadj.glm.fit))[length(coef(unadj.glm.fit))]
#########################
## set up EM algorithm ##
#########################
proposed.rel.cfr.chain <- rep(0, max.iter)
n.params <- max(dat[,"new.times"], na.rm=TRUE)+1
phi.chain <- matrix(nrow=n.params, ncol=max.iter)
nlag <- length(assumed.nu)
## convergence codes:
## 0 = all parameters converge
## 1 = rCFR converges, but some other parameters do not
## 2 = rCFR does not converge
EMconv <- SEMconv <- 0
## looping variables
eps <- 1
j <- 0
alpha <- alpha.start.values
while(eps>tol){
j <- j+1
############
## E STEP ##
############
dat <- run.Estep(alpha,full.data=full.data,
nlag=nlag, assumed.nu=assumed.nu)
############
## M STEP ##
############
maxLik <- run.Mstep(dat)
phi.chain[,j] <- maxLik$phi
alpha <- phi.chain[2:(n.params-1),j]
proposed.rel.cfr.chain[j] <- exp(phi.chain[n.params,j])
## check convergence
if(j>1) eps <- max((phi.chain[,j]-phi.chain[,j-1])^2)
if(j==max.iter) {
## message("*** WARNING: EM agorithm didn't converge ***")
if((phi.chain[n.params,j]-phi.chain[n.params,j-1])^2 < eps){
EMconv <- 1
} else {
EMconv <- 2
}
break
}
}
var.joint <- maxLik$Var
phi <- phi.chain[,j]
proposed.rel.cfr <- proposed.rel.cfr.chain[j]
EMiter <- j
if(verb) {
print(paste("naive estimator =", round(naive.rel.cfr, 3)))
print(paste("GLM estimate unadjusted for lags =", round(glm.rel.cfr, 3)))
print(paste(EMiter, "iterations for EM convergence"))
}
if(SEM.var){
DM.out <- SEM.variance(full.data=full.data, dat, phi, max.iter, tol, nlag=nlag, alpha.start.values, assumed.nu=assumed.nu)
DM <- DM.out$DM
DMiter <- DM.out$DMiter
if(length(DM.out$loop.idx)>0) {
loop.idx <- DM.out$loop.idx
var.joint <- var.joint[-loop.idx,-loop.idx]
}
## set SEM convergence codes
if(any(DMiter==0)) {
if(DMiter[length(DMiter)]==0) {
SEMconv <- 2
} else { SEMconv <- 1 }
}
## calculate CFR variance if it converged
if(SEMconv<2){
variance.SEM <- var.joint +
var.joint %*% DM %*% solve(diag(1, ncol(DM))-DM)
proposed.rel.cfr.var.SEM <- variance.SEM[nrow(DM), nrow(DM)]
if(proposed.rel.cfr.var.SEM<0) {
proposed.rel.cfr.var.SEM <- NA
SEMconv <- 2
}
} else { proposed.rel.cfr.var.SEM <- NA }
if(verb){
print("Estimate with SEM")
print(paste("proposed CFR =", round(proposed.rel.cfr, 3),
"; 95% CI (",
round(exp(log(proposed.rel.cfr)
-2*sqrt(proposed.rel.cfr.var.SEM)),3),
",",
round(exp(log(proposed.rel.cfr)
+2*sqrt(proposed.rel.cfr.var.SEM)),3),
")"))
if(any(DM.out$DMiter==0)) {
print("non-convergent DM indices:")
print(which(DM.out$DMiter==0))
}
}
} else {
SEMconv <- NA
DM <- NULL
DMiter <- NULL
proposed.rel.cfr.var.SEM <- NULL
}
## ##############
## OUTPUT LIST ##
## ##############
out <- list(naive.rel.cfr=naive.rel.cfr,
glm.rel.cfr=glm.rel.cfr,
EM.rel.cfr=proposed.rel.cfr,
EM.rel.cfr.var=var.joint[nrow(var.joint), nrow(var.joint)],
EM.rel.cfr.var.SEM = proposed.rel.cfr.var.SEM,
EM.rel.cfr.chain=proposed.rel.cfr.chain,
EMiter=EMiter,
EMconv=EMconv,
SEMconv=SEMconv,
ests=phi,
ests.chain.EM=phi.chain,
DM=DM,
DMiter=DMiter)
if(verb) {
print("Estimate with just GLM variance")
print(paste("proposed CFR =", round(proposed.rel.cfr, 3),
"; 95% CI (",
round(exp(log(proposed.rel.cfr)
-2*sqrt(out$EM.rel.cfr.var)),3),
",",
round(exp(log(proposed.rel.cfr)
+2*sqrt(out$EM.rel.cfr.var)),3),
")"))
}
return(out)
}
## This function is meant to be run only through the function EMforCFR()
## and is used to calculate the variance via the Supplemented EM
## algorithm (see Meng and Rubin, 1991)
## params:
## \item{full.data}{ A matrix of observed data. See description in EMforCFR helpfile.}
## \item{dat}{A data frame.}
## \item{phi}{ A vector of fitted parameters from the final EM iteration.}
## \item{max.iter}{ The maximum number of iterations for SEM algorithm. }
## \item{tol}{ A tolerance to use to test for convergence of the EM algorithm. }
## \item{nlag}{ The number of time units for lagged data. Corresponds to length(assumed.nu).}
## \item{alpha.start.values}{ a vector starting values for the reporting rate parameter of the GLM model. This must have length which corresponds to one less than the number of unique integer values of full.dat[,"new.times"].}
## \item{assumed.nu}{ a vector of probabilities corresponding to the survival distribution, i.e. nu[i]=Pr(surviving i days | fatal case) }
## returned list:
## \item{DM}{The estimate of the variance-covariance matrix for the model parameters. Only converged rows are returned.}
## \item{DMiter}{A vector whose ith entry is the number of iterations needed for convergence of the ith row of the DM matrix.}
## \item{loop.idx}{If not NULL, the values correspond to the original indices of DM which have been omitted because of lack of convergence.}
SEM.variance <- function(full.data, dat, phi, max.iter, tol, nlag,
alpha.start.values, assumed.nu){
## algorithm parameters
eps <- 1
iter <- 0
phi.hat <- phi ## using phi from notation in Bayesian Data Analysis (Gelman, p323)
phi0 <- c(1, alpha.start.values, 1) ## the starting values for the parameters: c(beta0, alpha2-T, gamma2)
n.params <- length(phi0)
## sequence of matrices which will converge to DM
Rt <- array(dim=c(n.params, n.params, max.iter))
DM <- matrix(nrow=n.params, ncol=n.params)
DMiter <- rep(0, n.params)
## sequence of coefficient estimates
phi.t <- matrix(nrow=n.params, ncol=max.iter+1)
alpha.t <- alpha.start.values
phi.t[,1] <- phi0
loop.idx <- 1:n.params
while(length(loop.idx)>0) {
iter <- iter + 1
##THIS EM CALCULATES PHI^(T+1)
## E step
dat <- run.Estep(alpha.t, nlag=nlag,
full.data=full.data, assumed.nu=assumed.nu)
## M step
phi.t[,iter+1] <- run.Mstep(dat)$phi
## generate Rt given phi.t
for(i in loop.idx){
## defining phi.tmp here as phi^t(i) from Gelman
phi.t.i <- phi.hat
phi.t.i[i] <- phi.t[i,iter]
alpha.t.i <- phi.t.i[2:(n.params-1)]
## E step
dat <- run.Estep(alpha.t.i, nlag=nlag,
full.data=full.data, assumed.nu=assumed.nu)
## M step
phi.tplus1.i <- run.Mstep(dat)$phi
## fis the ith row of the current Rt matrix
Rt[i,,iter] <- (phi.tplus1.i-phi.hat)/(phi.t[i,iter]-phi.hat[i])
if(iter==1) next
if(all((Rt[i,,iter]-Rt[i,,iter-1])^2 < sqrt(tol))){
loop.idx <- loop.idx[-which(loop.idx==i)]
DM[i,] <- Rt[i,,iter]
DMiter[i] <- iter
}
}
## renew alpha.t
alpha.tplus1 <- phi.t[2:(n.params-1), iter+1]
alpha.t <- alpha.tplus1
if(iter == max.iter) break("maximum iterations reached")
}
if(length(loop.idx)>0) DM <- DM[-loop.idx, -loop.idx]
DM.out <- list(DM=DM, DMiter=DMiter, loop.idx=loop.idx)
return(DM.out)
}
## E-step of the EM algorithm.
# \arguments{
# \item{alpha}{Current estimates of the alpha parameters from the GLM model.}
# \item{full.data}{ A matrix of observed data. See description in EMforCFR helpfile.}
# \item{nlag}{ The number of time units for lagged data. Corresponds to length(assumed.nu).}
# \item{assumed.nu}{ a vector of probabilities corresponding to the survival distribution, i.e. nu[i]=Pr(surviving i days | fatal case) }
# }
# \value{ A data matrix with the same format as full.data from the EMforCFR() documentation.
# }
run.Estep <- function(alpha, full.data, nlag, assumed.nu){
## storage for reconstructed deaths
n.times <- nrow(full.data)/2
idx.to.reconstruct <- which(full.data[,"grp"]==1 &!is.na(full.data[,"new.times"]))
times.to.reconstruct <- full.data[idx.to.reconstruct,"time"]
Dhat.1 <- Dhat.2 <- rep(0, n.times)
## define the data
N.1 <- full.data[1:n.times,"N"]
N.2 <- full.data[(n.times+1):(2*n.times),"N"]
D.1 <- full.data[1:n.times,"D"]
D.2 <- full.data[(n.times+1):(2*n.times),"D"]
## set the alpha coefficients
alpha.long <- rep(0, n.times)
alpha.long[times.to.reconstruct[-1]] <- alpha
alpha.long[(max(times.to.reconstruct)+1):n.times] <- alpha[length(alpha)]
## ########
## loops for calculating expected value of the deaths
for(t in times.to.reconstruct) {
denom.1 <- denom.2 <- rep(0, nlag)
for(i in 1:nlag){
N.idx <- (t+i-1):(t+i-nlag)
nu.idx <- 1:length(N.idx)
## group 1
denom.1[i] <- sum(assumed.nu[nu.idx]*N.1[N.idx]*exp(alpha.long[N.idx]))
## group 2
denom.2[i] <- sum(assumed.nu[nu.idx]*N.2[N.idx]*exp(alpha.long[N.idx]))
}
## exclude 0s from denominator
denom.1[denom.1==0] <- NA
denom.2[denom.2==0] <- NA
## sum them up
D.idx <- (t+1):(t+nlag)
Dhat.1[t] <- N.1[t] * exp(alpha.long[t]) * sum(D.1[D.idx] * assumed.nu[nu.idx]/denom.1, na.rm=TRUE)
Dhat.2[t] <- N.2[t] * exp(alpha.long[t]) * sum(D.2[D.idx] * assumed.nu[nu.idx]/denom.2, na.rm=TRUE)
}
dat <- cbind(Dhat=c(Dhat.1, Dhat.2),
new.times=full.data[,"new.times"],
grp=full.data[,"grp"],
N=full.data[,"N"])
return(dat)
}
## the M-step
# \arguments{
# \item{dat}{data matrix passed from EMforCFR().}
# }
# \value{ A list with two components
# \item{phi }{fitted vector of parameters}
# \item{Var }{variance-covariance matrix from the fitted model}
# }
run.Mstep <- function(dat){
subset <- which(dat[,"N"]>0)
fit <- glm(dat[,"Dhat"] ~ factor(dat[,"new.times"]) + factor(dat[,"grp"]),
offset=log(dat[,"N"]),family=poisson, subset=subset)
phi <- coef(fit)
out <- list(phi=phi, Var=vcov(fit))
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.