Nothing
#' Obtain conditional probabilities from training data
#'
#' This is the function internally used in \code{insilico.train} function.
#'
#' @param train Training data, it should be in the same format as the testing data
#' and contains one additional column (see \code{cause} below) specifying known
#' cause of death. The first column is also assumed to be death ID.
#' @param gs the name of the column in \code{train} that contains cause of death.
#' @param gstable The list of causes of death used in training data.
#' @param thre a numerical value between 0 to 1. It specifies the maximum rate of
#' missing for any symptoms to be considered in the model. Default value is set to
#' 0.95, meaning if a symptom has more than 95\% missing in the training data, it
#' will be removed.
#' @param type Three types of learning conditional probabilities are provided: ``quantile''
#' or ``fixed''. Since InSilicoVA works with ranked conditional probabilities P(S|C), ``quantile''
#' means the rankings of the P(S|C) are obtained by matching the same quantile distributions
#' in the default InterVA P(S|C), and ``fixed'' means P(S|C) are matched to the closest values
#' in the default InterVA P(S|C) table. Empirically both types of rankings produce similar results. The third option ``empirical'' means no rankings are calculated, only the raw P(S|C) values are returned.
#' @param isNumeric Indicator if the input is already in numeric form. If the
#' input is coded numerically such that 1 for ``present'', 0 for ``absent'',
#' and -1 for ``missing'', this indicator could be set to True to avoid
#' conversion to standard InterVA format.
#' @param impute Indicator for whether to impute 1. P(S|C) with P(S) if symptom S does not exist more than the threshold of fractions within death due to C; and 2. values of exact 0 or 1.
#'
#' @return
#' \item{cond.prob}{raw P(S|C) matrix}
#' \item{cond.prob.alpha}{ranked P(S|C) matrix}
#' \item{table.alpha}{list of ranks used}
#' \item{table.num}{list of median numerical values for each rank}
#' \item{symps.train}{training data after removing symptoms with too high missing rate.}
#' @export extract.prob
#'
extract.prob <- function(train, gs, gstable, thre = 0.95, type = c("quantile", "fixed", "empirical")[1], isNumeric = FALSE, impute=TRUE){
## help functions to count vector contents
if(isNumeric){
yes <- 1
no <- 0
miss <- -1
}else{
yes <- "Y"
no <- c("", "N")
miss <- c(".", "-")
}
countyes <- function(x){length(which(toupper(x) %in% yes))}
countno <- function(x){length(which(x %in% no))}
countmiss <- function(x){length(which(x %in% miss))}
countna <- function(x){length(which(is.na(x)))}
##
## function to turn numeric matrix into level matrix
##
## cond.prob: S by C numeric matrix
## levels : length L character vector (from small to large)
## (e.g. E, D, C, B, A)
## cutoffs : length L numeric vector (percentile cutoff, last = 1)
## (e.g. .1, .2, .3, .7, 1) -> E=(0, 0.1), D=(.1, .2), ..
toLevel <- function(cond.prob, levels, cutoffs){
cond.prob.vec <- as.vector(cond.prob)
new <- rep(NA, length(cond.prob.vec))
cdf <- ecdf(cond.prob.vec)
table <- table.median <- rep(0, length(cutoffs))
for(i in length(cutoffs) : 1){
table[i] <- quantile(cdf, cutoffs[i])
if(i > 1){
table.median[i] <- quantile(cdf, (cutoffs[i] + cutoffs[i-1])/2)
}else{
table.median[i] <- quantile(cdf, (cutoffs[i])/2)
}
}
for(i in length(cutoffs) : 1){
new[which(cond.prob.vec <= table[i])] <- levels[i]
}
new <- matrix(new, dim(cond.prob)[1], dim(cond.prob)[2])
colnames(new) <- colnames(cond.prob)
rownames(new) <- rownames(cond.prob)
return(list(cond.prob = new, table.cut = table, table.median = table.median))
}
##
##
## step1: check missing items and update symptom matrix
##
##
## first screening out symptoms with highest missing
## remove missing rate > 0.95
train.id <- as.character(train[, 1])
train <- train[, -1]
train.gs <- as.character(train[, gs])
if(is.null(gstable)){
gstable <- unique(train.gs)
}
train <- train[, -which(colnames(train) == gs)]
miss.train <- apply(train, 2, countmiss) / dim(train)[1]
miss.too.many <- which(miss.train > thre)
if(length(miss.too.many) > 0){
train <- train[, -miss.too.many]
warning(paste(length(miss.too.many), "symptoms deleted for missing rate over the pre-specified threshold", thre),
immediate. = TRUE)
}
## check cause list
gs.missing <- which(gstable %in% unique(train.gs) == FALSE)
if(length(gs.missing) > 0){
warning(paste("Causes not found in training data and deleted:",
paste(gstable[gs.missing], collapse = ", ")),
immediate. = TRUE)
gstable <- gstable[-gs.missing]
}
##
##
## Get empirical prob base
##
##
# get cond.prob, a S x C matrix of P(S|C)
cond.prob <- NULL
for(i in 1:length(gstable)){
cause <- gstable[i]
list <- which(train.gs == cause)
cases <- train[list, ]
## handle only case scenario
if(length(list) == 1){
cond <- rep(0, length(cases))
cond[which(toupper(cases) == yes)] <- 1
cond[which(cases == miss)] <-- NA
}else{
# remove missing from calculation
count <- apply(cases, 2, countyes)
denom <- (length(list) - apply(cases, 2, countmiss))
denom[denom < length(list)*(1-thre)] <- NA
cond <- count / denom
}
cond.prob <- cbind(cond.prob, cond)
}
colnames(cond.prob) <- gstable
rownames(cond.prob) <- colnames(train)
S <- dim(cond.prob)[1]
C <- dim(cond.prob)[2]
## NA means for a particular cause and a particular symptom,
## all cases are missing,
## but it's unreasonable to delete either, since they are useful
## for other causes and symptoms
bysymps <- apply(cond.prob, 1, countna)
if(impute){
if(length(which(bysymps > 0)) > 0){
message(paste("\n", length(which(bysymps > 0)), "missing symptom-cause combinations in training data, imputed using average symptom prevalence\n"))
## impute by Pr(s | c) by Prob(s | any c)
## i.e. relatively no info provided from this s-c combination
overall <- apply(train, 2, countyes) / dim(train)[1]
for(i in 1:S){
cond.prob[i, which(is.na(cond.prob[i,]))] <- overall[i]
}
}
}
## to avoid 0 and 1
zeroes <- which(cond.prob == 0)
ones <- which(cond.prob == 1)
if(impute) cond.prob[zeroes] <- runif(length(zeroes), min = 1e-6, max = 2e-6)
if(impute) cond.prob[ones] <- 1 - runif(length(ones), min = 1e-6, max = 2e-6)
conditionals <- as.vector(cond.prob)
# grab the InterVA table
data("probbase", envir = environment())
probbase <- get("probbase", envir = environment())
intertable <- probbase[2:246, 17:76]
intertable[which(intertable == "B -")] <- "B-"
intertable[which(intertable == "")] <- "N"
levels <- c("I", "A+", "A", "A-", "B+", "B" , "B-", "C+", "C", "C-", "D+", "D", "D-", "E", "N")
counts <- rep(0, length(levels))
for(i in 1:length(levels)){
counts[i] <- length(which(as.vector(intertable) == levels[i]))
}
freqs <- cumsum(counts / sum(counts))
if(type == "quantile"){
# get empirical results
level.tmp <- toLevel(cond.prob, rev(levels), freqs)
cond.prob.alpha <- level.tmp$cond.prob
table.num <- rev(level.tmp$table.median)
}else if(type == "fixed"){
# get InterVA transformation results
table.num <- c(1, 0.8, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01, 0.005, 0.002, 0.001, 0.0005, 0.0001, 0.00001, 0)
# get the middle point cut-off (first = 1)
inter.cut <- c(1, table.num[-length(table.num)] + diff(table.num)/2 )
# transform into the nearest InterVA values
cond.prob.alpha <- matrix("N", dim(cond.prob)[1], dim(cond.prob)[2])
for(i in 1:length(inter.cut)){
cond.prob.alpha[which(cond.prob <= inter.cut[i])] <- levels[i]
}
colnames(cond.prob.alpha) <- colnames(cond.prob)
rownames(cond.prob.alpha) <- rownames(cond.prob)
}else if(type == "empirical"){
cond.prob.alpha <- NULL
levels <- NULL
table.num <- NULL
}
# the levels returned are from high to low
if(sum(is.na(cond.prob)) > 0){
cond.prob.alpha[is.na(cond.prob)] <- NA
}
return(list(cond.prob = cond.prob,
cond.prob.alpha = cond.prob.alpha,
table.alpha = levels,
table.num = table.num,
symps.train = train))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.