# R/CFR_estimation.R In nickreich/coarseDataTools: A Collection of Functions to Help with Analysis of Coarsely Observed Data

#### Documented in EMforCFR

### function defining the EM algorithm for the CFR estimates

##' A function to estimate the relative case fatality ratio when reporting rates
##' are time-varying and deaths are lagged because of survival time.
##'
##' This function implements an EM algorithm to estimate the relative case
##' fatality ratio between two groups when reporting rates are time-varying and
##' deaths are lagged because of survival time.
##' @usage EMforCFR(assumed.nu, alpha.start.values, full.data, max.iter = 50,
##'   verb = FALSE, tol = 1e-10, SEM.var = TRUE)
##'
##' @param assumed.nu  a vector of probabilities corresponding to the survival
##'   distribution, i.e. nu[i]=Pr(surviving i days | fatal case)
##' @param alpha.start.values a vector starting values for the reporting rate
##'   parameter of the GLM model. This must have length which corresponds to one
##'   less than the number of unique integer values of full.dat[,"new.times"].
##' @param full.data  A matrix of observed data. See description below.
##' @param max.iter The maximum number of iterations for the EM algorithm and
##'   the accompanying SEM algorithm (if used).
##' @param verb An indicator for whether the function should print results as it
##'   runs.
##' @param tol A tolerance to use to test for convergence of the EM algorithm.
##' @param SEM.var If TRUE, the SEM algorithm will be run in addition to the EM
##'   algorithm to calculate the variance of the parameter estimates.
##'
##' @details The data matrix full.data must have the following columns:
##'   \describe{ \item{grp}{a 1 or a 2 indicating which of the two groups, j,
##'   the observation is for.} \item{new.times}{an integer value representing
##'   the time, t, of observation.} \item{R}{the count of recovered cases with
##'   onset at time t in group j.} \item{D}{the count of deaths which occured at
##'   time t in groupo j (note that these deaths did not have disease onset at
##'   time t but rather died at time t).} \item{N}{the total cases at t, j, or
##'   the sum of R and D columns.} }
##'
##' @return A list with the following elements \describe{ \item{naive.rel.cfr
##'   }{the naive estimate of the relative case fatality ratio}
##'   \item{glm.rel.cfr }{the reporting-rate-adjusted estimate of the relative
##'   case fatality ratio} \item{EM.rel.cfr }{the lag-adjusted estimate of the
##'   relative case fatality ratio} \item{EM.re.cfr.var }{the variance for the
##'   log-scale lag-adjusted estimator taken from the final M-step}
##'   \item{EM.rel.cfr.var.SEM }{ the Supplemented EM algorithm variance for the
##'   log-scale lag-adjusted estimator} \item{EM.rel.cfr.chain }{a vector of the
##'   EM algorithm iterates of the lag-adjusted relative CFR estimates}
##'   \item{EMiter}{the number of iterations needed for the EM algorithm to
##'   converge} \item{EMconv}{indicator for convergence of the EM algorithm.  0
##'   indicates all paramters converged within max.iter iterations.  1 indicates
##'   that the estimate of the relative case fatality ratio converged but other
##'   did not.  2 indicates that the relative case fatality ratio did not
##'   converge.} \item{SEMconv}{indicator for convergence of SEM algorithm.
##'   Same scheme as EMconv.} \item{ests}{ the coefficient estimates for the
##'   model } \item{ests.chain}{ a matrix with all of the coefficient estimates,
##'   at each EM iteration} \item{DM}{the DM matrix from the SEM algorithm}
##'   \item{DMiter}{a vector showing how many iterations it took for the
##'   variance component to converge in the SEM algorithm} }
##'  @examples
##'     ## This is code from the CFR vignette provided in the documentation.
##'
##' data(simulated.outbreak.deaths)
##' min.cases <- 10
##' N.1 <- simulated.outbreak.deaths[1:60, "N"]
##' N.2 <- simulated.outbreak.deaths[61:120, "N"]
##' first.t <- min(which(N.1 > min.cases & N.2 > min.cases))
##' last.t <- max(which(N.1 > min.cases & N.2 > min.cases))
##' idx.for.Estep <- first.t:last.t
##' new.times <- 1:length(idx.for.Estep)
##' simulated.outbreak.deaths <- cbind(simulated.outbreak.deaths, new.times = NA)
##' simulated.outbreak.deaths[c(idx.for.Estep, idx.for.Estep + 60), "new.times"] <- rep(new.times, + 2)
##' assumed.nu = c(0, 0.3, 0.4, 0.3)
##' alpha.start <- rep(0, 22)
##'
##' ## caution! this next line may take several minutes (5-10, depanding on
##' ##    the speed of your machine) to run.
##' \dontrun{cfr.ests <- EMforCFR(assumed.nu = assumed.nu,
##'                               alpha.start.values = alpha.start,
##'                               full.data = simulated.outbreak.deaths,
##'                               verb = FALSE,
##'                               SEM.var = TRUE,
##'                               max.iter = 500,
##'                               tol = 1e-05)}
##' @keywords coarse data
##' @keywords incomplete data
##' @keywords case fatality ratio
##' @keywords infectious disease
##' @export
EMforCFR <- function(assumed.nu, alpha.start.values, full.data,
max.iter=50, verb=FALSE, tol=1e-10, SEM.var=TRUE){
## full.data is the data output from the observe.epidemic() function
##    with a column specifying the rows to be used for analysis

#######################################
## calculate naive and GLM estimates ##
#######################################

D.1 <- sum(full.data[full.data[,"grp"]==1&!is.na(full.data[,"new.times"]),"D"])
D.2 <- sum(full.data[full.data[,"grp"]==2&!is.na(full.data[,"new.times"]),"D"])
N.1 <- sum(full.data[full.data[,"grp"]==1&!is.na(full.data[,"new.times"]),"N"])
N.2 <- sum(full.data[full.data[,"grp"]==2&!is.na(full.data[,"new.times"]),"N"])
naive.rel.cfr <- sum(D.2)/sum(N.2)/(sum(D.1)/sum(N.1))

dat <- data.frame(full.data)
unadj.glm.fit <- glm(dat[,"D"] ~ factor(dat[,"new.times"]) + factor(dat[,"grp"]),
offset=log(dat[,"N"]), family=poisson)

#########################
## set up EM algorithm ##
#########################
proposed.rel.cfr.chain <- rep(0, max.iter)
n.params <- max(dat[,"new.times"], na.rm=TRUE)+1
phi.chain <- matrix(nrow=n.params, ncol=max.iter)
nlag <- length(assumed.nu)

## convergence codes:
## 0 = all parameters converge
## 1 = rCFR converges, but some other parameters do not
## 2 = rCFR does not converge
EMconv <- SEMconv <- 0

## looping variables
eps <- 1
j <- 0
alpha <- alpha.start.values
while(eps>tol){
j <- j+1
############
## E STEP ##
############
dat <- run.Estep(alpha,full.data=full.data,
nlag=nlag, assumed.nu=assumed.nu)

############
## M STEP ##
############
maxLik <- run.Mstep(dat)
phi.chain[,j] <- maxLik$phi alpha <- phi.chain[2:(n.params-1),j] proposed.rel.cfr.chain[j] <- exp(phi.chain[n.params,j]) ## check convergence if(j>1) eps <- max((phi.chain[,j]-phi.chain[,j-1])^2) if(j==max.iter) { ## message("*** WARNING: EM agorithm didn't converge ***") if((phi.chain[n.params,j]-phi.chain[n.params,j-1])^2 < eps){ EMconv <- 1 } else { EMconv <- 2 } break } } var.joint <- maxLik$Var

phi <- phi.chain[,j]
proposed.rel.cfr <- proposed.rel.cfr.chain[j]
EMiter <- j
if(verb) {
print(paste("naive estimator =", round(naive.rel.cfr, 3)))
print(paste("GLM estimate unadjusted for lags =", round(glm.rel.cfr, 3)))
print(paste(EMiter, "iterations for EM convergence"))
}

if(SEM.var){
DM.out <- SEM.variance(full.data=full.data, dat, phi, max.iter, tol, nlag=nlag, alpha.start.values, assumed.nu=assumed.nu)
DM <- DM.out$DM DMiter <- DM.out$DMiter
if(length(DM.out$loop.idx)>0) { loop.idx <- DM.out$loop.idx
var.joint <- var.joint[-loop.idx,-loop.idx]
}
## set SEM convergence codes
if(any(DMiter==0)) {
if(DMiter[length(DMiter)]==0) {
SEMconv <- 2
} else { SEMconv <- 1 }

}

## calculate CFR variance if it converged
if(SEMconv<2){
variance.SEM <- var.joint +
var.joint %*% DM %*% solve(diag(1, ncol(DM))-DM)
proposed.rel.cfr.var.SEM <- variance.SEM[nrow(DM), nrow(DM)]
if(proposed.rel.cfr.var.SEM<0) {
proposed.rel.cfr.var.SEM <- NA
SEMconv <- 2
}
} else { proposed.rel.cfr.var.SEM <- NA }

if(verb){
print("Estimate with SEM")
print(paste("proposed CFR =", round(proposed.rel.cfr, 3),
"; 95% CI (",
round(exp(log(proposed.rel.cfr)
-2*sqrt(proposed.rel.cfr.var.SEM)),3),
",",
round(exp(log(proposed.rel.cfr)
+2*sqrt(proposed.rel.cfr.var.SEM)),3),
")"))
if(any(DM.out$DMiter==0)) { print("non-convergent DM indices:") print(which(DM.out$DMiter==0))
}
}
} else {
SEMconv <- NA
DM <- NULL
DMiter <- NULL
proposed.rel.cfr.var.SEM <- NULL
}

## ##############
## OUTPUT LIST ##
## ##############
out <- list(naive.rel.cfr=naive.rel.cfr,
glm.rel.cfr=glm.rel.cfr,
EM.rel.cfr=proposed.rel.cfr,
EM.rel.cfr.var=var.joint[nrow(var.joint), nrow(var.joint)],
EM.rel.cfr.var.SEM = proposed.rel.cfr.var.SEM,
EM.rel.cfr.chain=proposed.rel.cfr.chain,
EMiter=EMiter,
EMconv=EMconv,
SEMconv=SEMconv,
ests=phi,
ests.chain.EM=phi.chain,
DM=DM,
DMiter=DMiter)

if(verb) {
print("Estimate with just GLM variance")
print(paste("proposed CFR =", round(proposed.rel.cfr, 3),
"; 95% CI (",
round(exp(log(proposed.rel.cfr)
-2*sqrt(out$EM.rel.cfr.var)),3), ",", round(exp(log(proposed.rel.cfr) +2*sqrt(out$EM.rel.cfr.var)),3),
")"))
}

return(out)

}

## This function is meant to be run only through the function EMforCFR()
##   and is used to calculate the variance via the Supplemented EM
##   algorithm (see Meng and Rubin, 1991)

## params:
##   \item{full.data}{ A matrix of observed data. See description in EMforCFR helpfile.}
##   \item{dat}{A data frame.}
##   \item{phi}{ A vector of fitted parameters from the final EM iteration.}
##   \item{max.iter}{ The maximum number of iterations for SEM algorithm. }
##   \item{tol}{ A tolerance to use to test for convergence of the EM algorithm. }
##   \item{nlag}{ The number of time units for lagged data.  Corresponds to length(assumed.nu).}
##   \item{alpha.start.values}{ a vector starting values for the reporting rate parameter of the GLM model. This must have length which corresponds to one less than the number of unique integer values of full.dat[,"new.times"].}
##   \item{assumed.nu}{ a vector of probabilities corresponding to the survival distribution, i.e. nu[i]=Pr(surviving i days | fatal case) }

## returned list:
##    \item{DM}{The estimate of the variance-covariance matrix for the model parameters.  Only converged rows are returned.}
##    \item{DMiter}{A vector whose ith entry is the number of iterations needed for convergence of the ith row of the DM matrix.}
##    \item{loop.idx}{If not NULL, the values correspond to the original indices of DM which have been omitted because of lack of convergence.}
SEM.variance <- function(full.data, dat, phi, max.iter, tol, nlag,
alpha.start.values, assumed.nu){
## algorithm parameters
eps <- 1
iter <- 0
phi.hat <- phi ## using phi from notation in Bayesian Data Analysis (Gelman, p323)
phi0 <- c(1, alpha.start.values, 1) ## the starting values for the parameters: c(beta0, alpha2-T, gamma2)
n.params <- length(phi0)

## sequence of matrices which will converge to DM
Rt <- array(dim=c(n.params, n.params, max.iter))
DM <- matrix(nrow=n.params, ncol=n.params)
DMiter <- rep(0, n.params)

## sequence of coefficient estimates
phi.t <- matrix(nrow=n.params, ncol=max.iter+1)
alpha.t <- alpha.start.values
phi.t[,1] <- phi0
loop.idx <- 1:n.params
while(length(loop.idx)>0) {
iter <- iter + 1

##THIS EM CALCULATES PHI^(T+1)
## E step
dat <- run.Estep(alpha.t, nlag=nlag,
full.data=full.data, assumed.nu=assumed.nu)
## M step
phi.t[,iter+1] <- run.Mstep(dat)$phi ## generate Rt given phi.t for(i in loop.idx){ ## defining phi.tmp here as phi^t(i) from Gelman phi.t.i <- phi.hat phi.t.i[i] <- phi.t[i,iter] alpha.t.i <- phi.t.i[2:(n.params-1)] ## E step dat <- run.Estep(alpha.t.i, nlag=nlag, full.data=full.data, assumed.nu=assumed.nu) ## M step phi.tplus1.i <- run.Mstep(dat)$phi

## fis the ith row of the current Rt matrix
Rt[i,,iter] <- (phi.tplus1.i-phi.hat)/(phi.t[i,iter]-phi.hat[i])
if(iter==1) next
if(all((Rt[i,,iter]-Rt[i,,iter-1])^2 < sqrt(tol))){
loop.idx <- loop.idx[-which(loop.idx==i)]
DM[i,] <- Rt[i,,iter]
DMiter[i] <- iter
}

}

## renew alpha.t
alpha.tplus1 <- phi.t[2:(n.params-1), iter+1]
alpha.t <- alpha.tplus1

if(iter == max.iter) break("maximum iterations reached")

}

if(length(loop.idx)>0)	DM <- DM[-loop.idx, -loop.idx]

DM.out <- list(DM=DM, DMiter=DMiter, loop.idx=loop.idx)

return(DM.out)
}

## E-step of the EM algorithm.

# \arguments{
#  \item{alpha}{Current estimates of the alpha parameters from the GLM model.}
#  \item{full.data}{ A matrix of observed data. See description in EMforCFR helpfile.}
#  \item{nlag}{ The number of time units for lagged data.  Corresponds to length(assumed.nu).}
#  \item{assumed.nu}{ a vector of probabilities corresponding to the survival distribution, i.e. nu[i]=Pr(surviving i days | fatal case) }
# }

# \value{ A data matrix with the same format as full.data from the EMforCFR() documentation.
# }

run.Estep <- function(alpha, full.data, nlag, assumed.nu){

## storage for reconstructed deaths
n.times <- nrow(full.data)/2
idx.to.reconstruct <- which(full.data[,"grp"]==1 &!is.na(full.data[,"new.times"]))
times.to.reconstruct <- full.data[idx.to.reconstruct,"time"]
Dhat.1 <- Dhat.2 <- rep(0, n.times)

## define the data
N.1 <- full.data[1:n.times,"N"]
N.2 <- full.data[(n.times+1):(2*n.times),"N"]
D.1 <- full.data[1:n.times,"D"]
D.2 <- full.data[(n.times+1):(2*n.times),"D"]

## set the alpha coefficients
alpha.long <- rep(0, n.times)
alpha.long[times.to.reconstruct[-1]] <- alpha
alpha.long[(max(times.to.reconstruct)+1):n.times] <- alpha[length(alpha)]

## ########
## loops for calculating expected value of the deaths

for(t in times.to.reconstruct) {
denom.1 <- denom.2 <- rep(0, nlag)
for(i in 1:nlag){
N.idx <- (t+i-1):(t+i-nlag)
nu.idx <- 1:length(N.idx)
## group 1
denom.1[i] <- sum(assumed.nu[nu.idx]*N.1[N.idx]*exp(alpha.long[N.idx]))
## group 2
denom.2[i] <- sum(assumed.nu[nu.idx]*N.2[N.idx]*exp(alpha.long[N.idx]))
}

## exclude 0s from denominator
denom.1[denom.1==0] <- NA
denom.2[denom.2==0] <- NA

## sum them up
D.idx <- (t+1):(t+nlag)
Dhat.1[t] <- N.1[t] * exp(alpha.long[t]) * sum(D.1[D.idx] * assumed.nu[nu.idx]/denom.1, na.rm=TRUE)
Dhat.2[t] <- N.2[t] * exp(alpha.long[t]) * sum(D.2[D.idx] * assumed.nu[nu.idx]/denom.2, na.rm=TRUE)
}

dat <- cbind(Dhat=c(Dhat.1, Dhat.2),
new.times=full.data[,"new.times"],
grp=full.data[,"grp"],
N=full.data[,"N"])
return(dat)
}

## the M-step

# \arguments{
#  \item{dat}{data matrix passed from EMforCFR().}
# }

# \value{ A list with two components
#         \item{phi }{fitted vector of parameters}
#         \item{Var }{variance-covariance matrix from the fitted model}
# }

run.Mstep <- function(dat){
subset <- which(dat[,"N"]>0)
fit <- glm(dat[,"Dhat"] ~ factor(dat[,"new.times"]) + factor(dat[,"grp"]),
offset=log(dat[,"N"]),family=poisson, subset=subset)

phi <- coef(fit)

out <- list(phi=phi, Var=vcov(fit))
return(out)
}

nickreich/coarseDataTools documentation built on May 23, 2017, 2:46 p.m.