R/get_indiv.r
In InSilicoVA: Probabilistic Verbal Autopsy Coding with 'InSilicoVA' Algorithm

Documented in get.indiv updateIndiv

#' Get individual COD probabilities from InSilicoVA Model Fits
#' 
#' This function calculates individual probabilities for each death and provide posterior credible intervals for each estimates. The default set up is to calculate the 95% C.I. when running the inSilicoVA model. If a different C.I. is desired after running the model, this function provides faster re-calculation without refitting the model.
#' 
#' 
#' @param object Fitted \code{"insilico"} object.
#' @param data data for the fitted \code{"insilico"} object. The first column of the data should be the ID that matches the \code{"insilico"} fitted model.
#' @param CI Credible interval for posterior estimates.
#' @param is.aggregate logical indicator for constructing aggregated distribution rather than individual distributions.
#' @param by list of column names to group by.
#' @param is.sample logical indicator for returning the posterior samples of individual probabilities instead of posterior summaries.
#' @param java_option Option to initialize java JVM. Default to ``-Xmx1g'', which sets the maximum heap size to be 1GB.
#' @param \dots Not used.
#' @return
#' \item{mean}{ individual mean COD distribution matrix.} 
#' \item{median}{ individual median COD distribution matrix.} 
#' \item{lower}{ individual lower bound for each COD probability.} 
#' \item{upper}{ individual upper bound for each COD probability.} 

#' @author Zehang Li, Tyler McCormick, Sam Clark
#' 
#' Maintainer: Zehang Li <lizehang@uw.edu>
#' @seealso \code{\link{insilico}}, \code{\link{updateIndiv}}, \code{\link{plot.insilico}}
#' @references 
#' Tyler H. McCormick, Zehang R. Li, Clara Calvert, Amelia C. Crampin,
#' Kathleen Kahn and Samuel J. Clark Probabilistic cause-of-death assignment
#' using verbal autopsies, \emph{Journal of the American Statistical
#' Association} (2016), 111(515):1036-1049.
#' @examples
#' \dontrun{
#' data(RandomVA1)
#' fit1<- insilico(RandomVA1, subpop = NULL,
#'                 Nsim = 1000, burnin = 500, thin = 10 , seed = 1,
#'                 auto.length = FALSE)
#' summary(fit1, id = "d199")
#' 
#' # Calculate aggregated COD distributions
#' agg.csmf <- get.indiv(data = RandomVA1, fit1, CI = 0.95, 
#'                      is.aggregate = TRUE, by = NULL)
#' head(agg.csmf) 
#' 
#' agg.by.sex.age <- get.indiv(data = RandomVA1, fit1, CI = 0.95, 
#'                             is.aggregate = TRUE, by = list("sex", "age"))
#' head(agg.by.sex.age$mean) 
#' 
#' # Obtain individual level P(Y|X) posterior draws (N by C by Nitr array)
#' prob <- get.indiv(data = RandomVA1, fit1, is.sample = TRUE)
#' dim(prob)
#' }
#' @export get.indiv
get.indiv <- function(object, data = NULL, CI = 0.95, is.aggregate = FALSE, by = NULL, is.sample = FALSE, java_option = "-Xmx1g", ...){
	if(is.null(java_option)) java_option = "-Xmx1g"
	options( java.parameters = java_option )
	id <- object$id
	obj <- .jnew("sampler/InsilicoSampler2")

	#-----------------------------------------------------------------#
	# if grouping is provided
	if(is.aggregate){
		if(is.null(data)){
			stop("Data not provided for grouping")
		}
		data <- data.frame(data)
		# check by variable
		if(is.null(by)){
			message("No groups specified, aggregate for all deaths\n")
			data$sub <- "All"
			col.index <- which(colnames(data) == "sub")
		}else{
			if(!methods::is(by, "list")){
				stop("Please specify by variable using list, see manual example for details.")
			}else{
				col.index <- match(by, colnames(data))
				if(length(which(is.na(col.index))) > 0){
					stop("Contains unknown grouping variables!")
				}
			}
		}
		# check id
		if(colnames(data)[1] != "ID"){
			colnames(data)[1] <- "ID"
			warning("The first column of data is assumed to be ID")
		}
		n.match <- length(which(!is.na(match(data[, 1], object$id)))) 
		if(length(n.match) == 0){
			stop("No data has the matching ID as in the fitted insilico object")
		}else{
			message(paste(n.match, "deaths matched from data to the fitted object\n"))
		}

		# get grouping vector and names for all data
		datagroup.all <- data[, c(1, col.index)]
		if(length(col.index) > 1){
			datagroup.all$final.group<- apply(datagroup.all[, -1], 1, function(x){paste(x,collapse = ' ')})
		}else{
			datagroup.all$final.group <- datagroup.all[, -1]
		}
		# get the group name for subset in insilico fitted data
		index.tmp <- match(rownames(object$data.final), datagroup.all[, 1])
		if(length(which(!is.na(index.tmp))) == 0){
			stop("no matching ID found in the data")
		}
		datagroup <- datagroup.all$final.group[index.tmp]
		datagroup <- datagroup[!is.na(datagroup)]

		allgroups <- sort(unique(datagroup))
		datagroup <- match(datagroup, allgroups)
		datagroup.all$final.group.num <- match(datagroup.all$final.group, allgroups)
		datagroup.j <- .jarray(as.integer(datagroup), dispatch = TRUE)
		Ngroup <- as.integer(length(allgroups))
	}

	#---------------------------------------------------------------#

	# 3d array of csmf to be Nsub * Nitr * C
	if(is.null(object$subpop)){
		csmf <- array(0, dim = c(1, dim(object$csmf)[1], dim(object$csmf)[2]))
		csmf[1, , ] <- object$csmf
		subpop <- rep(as.integer(1), length(object$id))
	}else{
		csmf <- array(0, dim = c(length(object$csmf), dim(object$csmf[[1]])[1], dim(object$csmf[[1]])[2]))
		for(i in 1:length(object$csmf)){
			csmf[i, , ] <- object$csmf[[i]]
		}
		subpop <- as.integer(pmatch(object$subpop, names(object$csmf), duplicates.ok=T))
	}	


	if(object$external){
		csmf <- csmf[, , -object$external.causes, drop = FALSE]
		csmf <- apply(csmf, c(1, 2), function(x){
			if(sum(x) != 0){
				return(x/sum(x))
			}else{
				return(x)
			}})
		csmf <- aperm(csmf, c(2, 3, 1))
	}

	data.j <- .jarray(object$data.final, dispatch = TRUE)
	subpop.j <- .jarray(subpop, dispatch = TRUE)
	impossible.j <- .jarray(object$impossible.causes, dispatch = TRUE)
	# get condprob to be Nitr * S * C array
	if(object$updateCondProb == FALSE){
		condprob <- array(0, dim = c(dim(csmf)[2], dim(object$probbase)[1], dim(object$probbase)[2]))
		for(itr in 1:dim(csmf)[2]){
			condprob[itr, , ] <- object$probbase
		}
	}else{
		if(object$keepProbbase.level){
			condprob <- array(0, dim = c(dim(object$conditional.probs)[1], dim(object$probbase)[1], dim(object$probbase)[2]))
			for(i in 1:dim(condprob)[1]){
				#fix for interVA probbase
				object$probbase[object$probbase == "B -"] <- "B-"
				object$probbase[object$probbase == ""] <- "N"
				
				temp <- object$conditional.probs[i, match(object$probbase, colnames(object$conditional.probs))]
				condprob[i, , ] <- matrix(as.numeric(temp), 
											dim(object$probbase)[1], 
										    dim(object$probbase)[2])
			}
		}else{
			condprob <- object$conditional.probs
		}
	}

	# rJava package (0.9-8) currently has a bug with multi-dimension array
	# value passed to java contains 0's not in the original array
	# to bypass this open issue, condprob and csmf are transformed 
	# 	into vectors:
	# dim(csmf) = Nsub * Nitr * C 
	# dim(condprob) = Nitr * S * C
	# after transformation (i, j, k) -> (k-1)*d1*d2 + (j-1)*d1 + i
	Nsub <- as.integer(dim(csmf)[1])
	Nitr <- as.integer(dim(csmf)[2])
	C <- as.integer(dim(csmf)[3])
	S <- as.integer(dim(condprob)[2])

	condprob <- as.vector(condprob)
	csmf <- as.vector(csmf)
	condprob.j <- .jarray(condprob, dispatch = TRUE)
	csmf.j <- .jarray(csmf, dispatch = TRUE)
	#-------------------------------------------------------------------#
	if(is.sample){
		message("Calculating individual COD posterior draws\n")
		# return a list with length N, each with dimensional C by Nitr
		indiv <- try( 
			 .jcall(obj, "[[[D", "IndivProbSample", 
					 data.j, impossible.j, csmf.j, subpop.j, condprob.j, 
					 (1 - CI)/2, 1-(1-CI)/2, 
					 Nsub, Nitr, C, S)
			 , FALSE)
		if(is(indiv, "try-error")){
			java_message()	
			stop()
		}
		indivprob <- array(NA, c(length(indiv), C, Nitr))
		for(i in 1:length(indiv)){
			indivprob[i, , ] <- do.call(rbind, lapply(indiv[[i]], .jevalArray))		
		}

		# if not customized probbase, use longer cause names for output
		if(!object$is.customized){
			if(object$data.type == "WHO2012"){
				data("causetext", envir = environment())
				causetext<- get("causetext", envir  = environment())
			}else{
				data("causetextV5", envir = environment())
				causetext<- get("causetextV5", envir  = environment())
			}			
			match.cause <- pmatch(causetext[, 1],  colnames(object$probbase))
			index.cause <- order(match.cause)[1:sum(!is.na(match.cause))]
			dimnames(indivprob)[[2]] <- causetext[index.cause, 2]
		}else{
			dimnames(indivprob)[[2]] <- colnames(object$probbase)
		}
		
		## add back all external cause death 41:51 in standard VA
		if(object$external){
			external.causes <- object$external.causes
			C0 <- dim(indivprob)[2]
			ext.flag <- apply(object$indiv.prob[, external.causes], 1, sum)
			ext.probs <- object$indiv.prob[which(ext.flag > 0), , drop = FALSE]

			indivprob.ext <- array(0, dim = c(dim(indivprob)[1] + sum(ext.flag > 0), dim(indivprob)[2] + length(external.causes), dim(indivprob)[3]))
			for(t in 1:Nitr){
				indivprob.ext[1:dim(indivprob)[1], , t] <- cbind(
										  indivprob[, 1:(external.causes[1] - 1), t], 
										  matrix(0, dim(indivprob)[1], length(external.causes)), 
										  indivprob[, external.causes[1]:C0, t]
										)
				indivprob.ext[(dim(indivprob)[1] + 1):dim(indivprob.ext)[1], , t] <- ext.probs
			}
			dimnames(indivprob.ext)[[2]] <- colnames(object$indiv.prob)
			dimnames(indivprob.ext)[[1]]  <- c(id[match(rownames(object$data.final), id)], id[which(ext.flag > 0)])
		}else{
			indivprob.ext <- indivprob
			dimnames(indivprob.ext)[[2]] <- colnames(object$indiv.prob)
			dimnames(indivprob.ext)[[1]]  <- rownames(object$data.final)
		}
		dimnames(indivprob.ext)[[3]] <- paste0("sample", 1:Nitr)

		return(indivprob.ext)

	}else if(!is.aggregate & !is.sample){
		message("Calculating individual COD distributions...\n")
		# return rbind(mean, median, low, up), (4N) * C matrix
		indiv <- try( 
			 .jcall(obj, "[[D", "IndivProb", 
					 data.j, impossible.j, csmf.j, subpop.j, condprob.j, 
					 (1 - CI)/2, 1-(1-CI)/2, 
					 Nsub, Nitr, C, S)
			, FALSE)
		if(is(indiv, "try-error")){
			java_message()	
			stop()
		}
		indiv <- do.call(rbind, lapply(indiv, .jevalArray))
		
		# if not customized probbase, use longer cause names for output
		if(!object$is.customized){
			if(object$data.type == "WHO2012"){
				data("causetext", envir = environment())
				causetext<- get("causetext", envir  = environment())
			}else{
				data("causetextV5", envir = environment())
				causetext<- get("causetextV5", envir  = environment())
			}			
			match.cause <- pmatch(causetext[, 1],  colnames(object$probbase))
			index.cause <- order(match.cause)[1:sum(!is.na(match.cause))]
			colnames(indiv) <- causetext[index.cause, 2]
		}else{
			colnames(indiv) <- colnames(object$probbase)
		}

		K <- dim(indiv)[1] / 4


		## add back all external cause death 41:51 in standard VA
		if(object$external){
			external.causes <- object$external.causes
			C0 <- dim(indiv)[2]
			ext.flag <- apply(object$indiv.prob[, external.causes], 1, sum)
			ext.probs <- object$indiv.prob[which(ext.flag > 0), , drop = FALSE]

			indiv <- cbind(indiv[, 1:(external.causes[1] - 1)], 
				          matrix(0, dim(indiv)[1], length(external.causes)), 
				          indiv[, external.causes[1]:C0])
			colnames(indiv) <- colnames(object$indiv.prob)
			id.out <- c(id[match(rownames(object$data.final), id)], id[which(ext.flag > 0)])
		}else{
			id.out <- id
			ext.probs <- NULL
		}

		mean <- rbind(indiv[1:K, ], ext.probs)
		median <- rbind(indiv[(K+1):(2*K), ], ext.probs)
		lower <- rbind(indiv[(2*K+1):(3*K), ], ext.probs)
		upper <- rbind(indiv[(3*K+1):(4*K), ], ext.probs)

		rownames(mean) <- rownames(median) <- rownames(lower) <- rownames(upper) <- id.out
		return(list(mean = mean, median = median, 
					lower = lower, upper = upper))

	#-------------------------------------------------------------------#
	}else{
		message("Aggregating individual COD distributions...\n")
		# return (4 x Ngroup) * (C + 1) matrix, last column is sample size

		indiv <- try( 
			.jcall(obj, "[[D", "AggIndivProb", 
					 data.j, impossible.j, csmf.j, subpop.j, condprob.j,
					 datagroup.j, Ngroup, (1 - CI)/2, 1-(1-CI)/2, 
					 Nsub, Nitr, C, S)
			, FALSE)
		if(is(indiv, "try-error")){
			java_message()	
			stop()
		}
		indiv <- do.call(rbind, lapply(indiv, .jevalArray))
		if(object$data.type == "WHO2012"){
				data("causetext", envir = environment())
				causetext<- get("causetext", envir  = environment())
		}else{
			data("causetextV5", envir = environment())
			causetext<- get("causetextV5", envir  = environment())
		}		
		weight <- indiv[, dim(indiv)[2]]
		indiv <- indiv[, -dim(indiv)[2]]

		match.cause <- pmatch(causetext[, 1],  colnames(object$probbase))
		index.cause <- order(match.cause)[1:sum(!is.na(match.cause))]
		colnames(indiv) <- causetext[index.cause, 2]

		K <- dim(indiv)[1] / 4


		## add back all external cause death 41:51 in standard VA
		if(object$external){
			external.causes <- object$external.causes
			C0 <- dim(indiv)[2]
			ext.flag <- apply(object$indiv.prob[, external.causes], 1, sum)
			ext.probs <- data.frame(object$indiv.prob[which(ext.flag > 0), external.causes, drop = FALSE])
			ext.probs$ID <- rownames(ext.probs)
			ext.probs <- merge(ext.probs, datagroup.all[, c("ID", "final.group.num")])
			probcolumns <- which(colnames(ext.probs) %in% c("ID", "final.group.num") == FALSE)

			ext.probs.agg <- matrix(0, length(allgroups), length(probcolumns))
			ext.weight <- rep(0, length(allgroups))

			for(i in 1:length(allgroups)){
				thisgroup <- which(ext.probs$final.group.num == i)
				if(length(thisgroup) > 0){
					ext.probs.agg[i, ] <- apply(ext.probs[thisgroup, probcolumns, drop=FALSE], 2, mean)	
					
					# get weight using the count instead of ratio
					# easier for later manipulation when the total count is divided
					ext.weight[i] <- length(thisgroup)
					ext.probs.agg[i, ] <- ext.probs.agg[i, ] * ext.weight[i] 			
				}
			}
			ext.probs.agg.4fold <- rbind(ext.probs.agg, ext.probs.agg,
										 ext.probs.agg, ext.probs.agg)
			# multiply group size
			indiv <- indiv * weight

			indiv <- cbind(indiv[, 1:(external.causes[1] - 1)], 
				          ext.probs.agg.4fold, 
				          indiv[, external.causes[1]:C0])

			# divide by full size
			indiv <- indiv / (weight + rep(ext.weight, 4))

			colnames(indiv) <- colnames(object$indiv.prob)
		}else{
			id.out <- id
			ext.probs <- NULL
		}

		mean <- indiv[1:K, ]
		median <- indiv[(K+1):(2*K), ]
		lower <- indiv[(2*K+1):(3*K), ]
		upper <- indiv[(3*K+1):(4*K), ]


		
		# if no grouping, return a matrix
		if(is.null(by)){
			out <- data.frame(cbind(mean, median, lower, upper))
			colnames(out) <- c("Mean", "Median", "Lower", "Upper")
			return(out)
		}else{
			rownames(mean) <- rownames(median) <- rownames(lower) <- rownames(upper) <- allgroups
			return(list(mean = t(mean), median = t(median), 
					lower = t(lower), upper = t(upper)))
		}
		
	}


}




#' Update individual COD probabilities from InSilicoVA Model Fits
#' 
#' This function updates individual probabilities for each death and provide posterior credible intervals for each estimates. 
#' 
#' 
#' @param object Fitted \code{"insilico"} object.
#' @param CI Credible interval for posterior estimates.
#' @param java_option Option to initialize java JVM. Default to ``-Xmx1g'', which sets the maximum heap size to be 1GB.
#' @param \dots Not used.
#' @return object Updated \code{"insilico"} object.
#' @author Zehang Li, Tyler McCormick, Sam Clark
#' 
#' Maintainer: Zehang Li <lizehang@@uw.edu>
#' @seealso \code{\link{insilico}}, \code{\link{get.indiv}}
#' @references #' Tyler H. McCormick, Zehang R. Li, Clara Calvert, Amelia C. Crampin,
#' Kathleen Kahn and Samuel J. Clark Probabilistic cause-of-death assignment
#' using verbal autopsies, \emph{Journal of the American Statistical
#' Association} (2016), 111(515):1036-1049.
#' @examples
#' \dontrun{
#' data(RandomVA1)
#' fit1a<- insilico(RandomVA1, subpop = NULL,  
#'                 Nsim = 1000, burnin = 500, thin = 10 , seed = 1,
#'                 auto.length = FALSE)
#' summary(fit1a, id = "d199")
#' 
#' # The following script updates credible interval for individual 
#' fit1b <- updateIndiv(fit1a, CI = 0.95)
#' summary(fit1b, id = "d199")
#' }
#' @export updateIndiv 

updateIndiv <- function(object, CI = 0.95, java_option = "-Xmx1g", ...){
		
	 indiv  <- get.indiv(object, CI = CI, ...)
	 object$indiv.prob.lower <- indiv$lower
	 object$indiv.prob.upper <- indiv$upper
	 object$indiv.prob.median <- indiv$median
	 object$indiv.CI <- CI

	 return(object)
}