Nothing
## implement Tariff method and calculate accuracy metrics
## algorithm last update 2015.03.13
## wrapper last update 2015.12.06
#' Replicate Tariff methods
#'
#' This function implements Tariff method.
#'
#' @param causes.train character vector of causes, or the column name of cause in the training data
#' @param symps.train N.train by S matrix
#' @param symps.test N.test by S matrix
#' @param causes.table list of causes in the data
#' @param use.rank logical indicator for whether using ranks instead of scores
#' @param nboot.rank number of re-sampling for baseline rank comparison. Default to 1, which resamples training data to have a uniform cause distribution of the same size. Set this to 0 removes bootstrapping the training dataset.
#' @param use.sig logical indicator for whether using significant Tariff only
#' @param nboot.sig number of re-sampling for testing significance.
#' @param use.top logical indicator for whether the tariff matrix should be cleaned to have only top symptoms
#' @param ntop number of top tariff kept for each cause
#' @param ... not used
#'
#' @return \item{score}{matrix of score for each cause within each death}
#' \item{causes.train}{vector of most likely causes in training data}
#' \item{causes.test}{vector of most likely causes in testing data}
#' \item{csmf}{vector of CSMF}
#' \item{causes.table}{cause list used for output, i.e., list of existing causes in the training data}
#' \item{use.rank}{logical indicator for whether using ranks instead of scores}
#' @author Zehang Li, Tyler McCormick, Sam Clark
#'
#' Maintainer: Zehang Li <lizehang@@uw.edu>
#' @references James, S. L., Flaxman, A. D., Murray, C. J., & Population Health Metrics Research Consortium. (2011). \emph{Performance of the Tariff Method: validation of a simple additive algorithm for analysis of verbal autopsies.} \emph{Population Health Metrics, 9(1), 1-16.}
#' @references Serina, P., Riley, I., Stewart, A., James, S. L., Flaxman, A. D., Lozano, R., ... & Ahuja, R. (2015). \emph{Improving performance of the Tariff Method for assigning causes of death to verbal autopsies.} \emph{BMC medicine, 13(1), 1.}
#' @references Tyler H. McCormick, Zehang R. Li, Clara Calvert, Amelia C.
#' Crampin, Kathleen Kahn and Samuel J. Clark(2016) \emph{Probabilistic
#' cause-of-death assignment using verbal autopsies},
#' \url{http://arxiv.org/abs/1411.3042} \emph{To appear, Journal of the American Statistical Association}
#' @keywords Tariff
#' @examples
#'\donttest{
#' data("RandomVA3")
#' test <- RandomVA3[1:200, ]
#' train <- RandomVA3[201:400, ]
#' allcauses <- unique(train$cause)
#' fit <- tariff(causes.train = "cause", symps.train = train,
#' symps.test = test, causes.table = allcauses)
#' correct <- which(fit$causes.test[,2] == test$cause)
#' accuracy <- length(correct) / dim(test)[1]
#' }
tariff <- function(causes.train, symps.train, symps.test, causes.table = NULL, use.rank = TRUE, nboot.rank = 1, use.sig = TRUE, nboot.sig = 500, use.top = FALSE, ntop = 40, ...){
# if input cause is the column name
if(class(causes.train) == "character" && length(causes.train) == 1){
colindex <- match(causes.train, colnames(symps.train))
colindex2 <- match(causes.train, colnames(symps.test))
if(is.na(colindex)){
stop("Cannot find the cause-of-death column in training data")
}
causes.train <- symps.train[, colindex]
symps.train <- symps.train[, -colindex]
# also remove this from testing data if it is provided
if(!is.na(colindex2)){
causes.test <- symps.test[, colindex2]
symps.test <- symps.test[, -colindex2]
}
}
if(is.null(causes.table)){
causes.table <- unique(causes.train)
}
id.train <- symps.train[, 1]
symps.train <- symps.train[, -1]
id.test <- symps.test[, 1]
symps.test <- symps.test[, -1]
# make sure train and test has the same columns
joint <- intersect(colnames(symps.train), colnames(symps.test))
symps.test <- symps.test[, joint]
symps.train <- symps.train[, joint]
# function to convert InterVA input back to numeric
# @para
# mat : matrix containing "Y", "" and "."
# missing : impute value for "."
toBinary <- function(mat, missing = 0){
mat <- as.matrix(mat)
mat2 <- matrix(0, dim(mat)[1], dim(mat)[2])
mat2[which(toupper(mat) == "Y")] <- 1
mat2[which(mat == ".")] <- missing
return(mat2)
}
# steps for re-sampling
# function to count combinations from symps and causes
# @para
# symps : N by S matrix
# causes : N matrix
# causelist : C vector
# binary : boolean indicating "Y" or 1
# @return
# count matrix (C by S), normalized!
count.combo <- function(symps, causes, causelist, binary){
cond.count <- matrix(0, length(causelist), dim(symps)[2])
for(i in 1:length(causelist)){
cause <- causelist[i]
list <- which(causes == cause)
if(length(list) == 0){
# cat("!")
next;
}
cases <- symps[list, ,drop=FALSE]
if(length(list) == 1){
}
# remove missing from calculation
if(binary){
count <- apply(cases, 2, function(x){length(which(x == 1))/length(x)})
}else{
count <- apply(cases, 2, function(x){length(which(x == "Y"))/length(x)})
}
cond.count[i, ] <- count
}
return(cond.count)
}
# function to get tariff (not tariff score) from count matrix
# @para
# X: C by S matrix
# @return
# tariff: C by S matrix
getTariff <- function(X){
# find median by symptom
med <- apply(X, 2, median)
# find IQR by symptom, handle zero denominator
iqr <- apply(X, 2, IQR)
# if iqr is 0, replace with range
rr <- apply(X, 2, function(x){max(x) - min(x)})
iqr[which(iqr == 0)] <- rr[which(iqr == 0)]
# if range is also 0, means all the same value,
# since it will be 0 anyway, just to avoid division by 0 here
iqr[which(iqr == 0)] <- 0.05
# rescale to Tariff now (still C by S matrix)
# NOTICE < danger >:
# R perform matrix - vector by column, so it has to be transposed first
tariff <- (t(X) - med) / iqr
tariff <- t(tariff)
# Tariff paper suggests the rounding here, to avoid over-fitting?
tariff <- round(2 * tariff) / 2
return(tariff)
}
# function to convert score to rank compared only to Resampled training set
# @para
# mat : score matrix (C by N)
# all : all score matrix (K by C)
# return
# mat.rank: score rank matrix (C by N)
toRank <- function(mat, all){
cat("Calculating ranks\n")
N <- dim(mat)[2]
C <- dim(mat)[1]
# notice rank from small to large, so take negative
mat.rank <- lapply(seq(1:C),
function(kk){
# cat(".");
return(
sapply(mat[kk, ], function(x, y){rank(-c(x, y))[1]}, all[,kk])
)})
out <- mat
for(i in 1:C){
out[i, ] <- mat.rank[[i]]
}
return(out)
}
# find individual top n causes
# function to get the n-th largest element's index
which.ordern <- function(x, n, isMax){
return(order(x, decreasing = isMax)[n])
}
##
## starts algorithm
##
# remove causes not in the training data
nonexist <- which(causes.table %in% unique(causes.train) == FALSE)
if(length(nonexist) > 0){
causes.table2 <- causes.table[-nonexist]
}else{
causes.table2 <- causes.table
}
S <- dim(symps.train)[2]
C <- length(causes.table2)
N.train <- dim(symps.train)[1]
N.test <- dim(symps.test)[1]
symps.num <- toBinary(symps.train, missing = 0)
##################################################################
# first bootstrap step, removing insignificant cause-symptom combo
if(use.sig){
cat("\nStart re-sampling for significant Tariff cells\n")
all.tariff.boot <- array(0, dim = c(nboot.sig, C, S))
# all.tariff.score.boot <- matrix(0, N.boot*N.train, C)
for(i in 1 : nboot.sig){
sample.boot <- sample(1:N.train, size = N.train, replace = TRUE)
symps.boot <- symps.num[sample.boot, ]
cause.boot <- causes.train[sample.boot]
count.boot <- count.combo(symps.boot, cause.boot, causes.table2, binary=T)
all.tariff.boot[i, , ] <- getTariff(count.boot)
# if(i %% 10 == 0) cat(".")
}
# get the lower bound for tariff, C by S matrix
lower <- apply(all.tariff.boot, c(2,3), function(x){quantile(x, 0.025)})
# get the upper bound for tariff, C by S matrix
upper <- apply(all.tariff.boot, c(2,3), function(x){quantile(x, 1-0.025)})
# check if it covers zero
cover <- sign(lower * upper)
# define the indicator matrix of which tariff to be removed
insig <- cover * 0
insig[which(cover != -1)] <- 1
if(sum(insig) == 0){
warning("No Tariff is significant, remove bootstrapping step")
insig <- matrix(1, C, S)
}
}else{
insig <- matrix(1, C, S)
}
###########################################################################
# calculate Tariff from training data
# function to remove tail tariff values
cleanTariff <- function(tariff, nonzero){
for(i in 1:dim(tariff)[1]){
order.tmp <- order(abs(tariff[i, ]), decreasing = TRUE)
tariff[i, order.tmp[1:nonzero]] <- 0
}
return(tariff)
}
# calculate actual tariff and delete the insignificant combo
X.train <- count.combo(symps.num, causes.train, causes.table2, binary=T)
tariff <- getTariff(X.train) * insig
if(use.top){
tariff <- cleanTariff(tariff, ntop)
}
###########################################################################
# second bootstrap step, getting Tariff score dist using uniform cause dist
# factor of each resampling draw. Truncate N.train to multiple of C
if(use.rank){
if(nboot.rank == 0){
nboot.rank <- 1
resample.rank <- FALSE
all.score.boot <- matrix(0, N.train, C)
}else{
resample.rank <- TRUE
factor <- trunc(N.train / C)
all.score.boot <- matrix(0, nboot.rank * factor * C, C)
}
# calculate which deaths are by which cause
index.by.cause <- lapply(causes.table2, function(k){which(causes.train == k)})
if(resample.rank){
for(i in 1:nboot.rank){
sample.boot <- rep(0, C*factor)
# re-sample stratified by cause
for(j in 1:C){
sample.boot[((j-1)*factor + 1):(j * factor)] <- sample(index.by.cause[[j]]
, factor, replace = TRUE)
}
symps.boot <- symps.num[sample.boot, ]
cause.boot <- causes.train[sample.boot]
# count.boot <- count.combo(symps.boot, cause.boot, causes.table, binary=T)
# tariff.boot <- getTariff(count.boot) * insig
# score.boot <- tariff.boot %*% t(symps.boot)
score.boot <- tariff %*% t(symps.boot)
all.score.boot[((i-1)*factor*C + 1) : (i*factor*C), ] <- t(score.boot)
# if(i %% 10 == 0) cat(".")
}
}else{
symps.boot <- symps.num
cause.boot <- causes.train
score.boot <- tariff %*% t(symps.boot)
all.score.boot <- t(score.boot)
}
}
# convert InterVA input back into numeric form (N by S matrix)
# test.blow <- alldata$symps.test[sample()]
symps.num.test <- toBinary(symps.test, missing = 0)
# calculate tariff score (C by N matrix)
score.num <- tariff %*% t(symps.num.test)
# find individual top cause (max score)
# use the cause names instead of the indexes
causes.test <- causes.table2[apply(score.num, 2, which.max)]
#
score <- NULL
if(use.rank){
# find individual top cause (min rank)
score <- toRank(score.num, all.score.boot)
causes.test <- causes.table2[apply(score, 2, which.min)]
}else{
score <- score.num
}
colnames(score) <- id.test
rownames(score) <- causes.table2
# find CSMF for testing set
CSMF <- (table(c(causes.test, causes.table2)) - 1) / length(causes.test)
if(use.rank){
CSMF <- (table(c(causes.test, causes.table2)) - 1)/length(causes.test)
}
# names(CSMF) <- causes.table2
CSMF <- CSMF[causes.table2]
# might need to include a way to transform from causes.table2 back to causes.table here
causes.train.out <- data.frame(ID = id.train)
causes.train.out$cause <- as.character(causes.train)
causes.test.out <- data.frame(ID = id.test)
causes.test.out$cause <- as.character(causes.test)
fit <- list(score = t(score),
causes.train = causes.train.out,
causes.test = causes.test.out,
csmf = CSMF,
causes.table = causes.table2,
use.rank = use.rank)
class(fit) <- "tariff"
return(fit)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.