Nothing
## ----include = FALSE----------------------------------------------------------
is_check <- ("CheckExEnv" %in% search()) || any(c("_R_CHECK_TIMINGS_",
"_R_CHECK_LICENSE_") %in% names(Sys.getenv()))
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"#,eval = !is_check
)
## ----setup--------------------------------------------------------------------
library(ezECM)
## ----datagen------------------------------------------------------------------
data_gen <- function(p = NULL, K = 3, Ntest = NULL, Ntrain = NULL,
Ntrain_missing = NULL, tst_missing = NULL, trn_missing = NULL){
mu_use <- matrix(rnorm(n = p*K, sd = 0.5), ncol = K, nrow = p)
Y <- list()
S <- list()
## random number in each class
N <- LaplacesDemon::rcat(n = Ntest + Ntrain + Ntrain_missing,
p = rep(1/3, times = K))
N <- as.vector(table(N))
## random very important category
vic <- sample(1:K, size = 1)
for(k in 1:K){
## random number of blocks in kth category
nblocks <- sample(1:2, size = 1)
## random covariance matrix for kth category
S[[k]] <- LaplacesDemon::rinvwishart(nu = p + 4, S = diag(p))
## If relevant, delete entries to form block covariance matrices of random sizes
if(nblocks == 2){
nblock1 <- sample(1:(p-1), size = 1)
block1_members <- sample(1:p, size = nblock1, replace = FALSE)
block2_members <- (1:p)[-block1_members]
zero_elements <- expand.grid(block1_members, block2_members)
S[[k]][zero_elements$Var1, zero_elements$Var2] <- 0
S[[k]][zero_elements$Var2, zero_elements$Var1] <- 0
}
## data for kth class is drawn from a MVN, given the mean and covariance
Y[[k]] <- as.data.frame(LaplacesDemon::rmvn(n = N[k], mu = mu_use[,k], Sigma= S[[k]]))
## data is transformed to (0,1) using the logistic function to run a check
Ytemp<- 1/(1+ exp(-Y[[k]]))
## if machine precision causes the output of the logistic function to round to 1, the experiment is stopped
## this has never happened
if(max(apply(Ytemp,2,function(X){range(X)})) == 1){
stop()
}else{
Y[[k]]<- 1/(1+ exp(-Y[[k]]))
}
## a column for the event class is appended to the data.frame
Y[[k]] <- cbind(Y[[k]], as.character(k))
names(Y[[k]]) <- c(paste("p",as.character(1:p), sep = ""), "event")
}
Y <- do.call("rbind", Y)
## A random sample of the data is taken to be the testing set
test_index <- sample(1:nrow(Y), size = Ntest)
testing <- Y[test_index,]
## The remainder is slated for training
training <- Y[-test_index, ]
## A random sample of the training set is set aside to have missing entries,
## the remainder is set aside to be the fully observed training set
train_full_index <- sample(1:nrow(training), size = Ntrain)
train_missing <- training[-train_full_index, ]
train_full <- training[train_full_index, ]
## If all event categories are not represented in the training set of full observations,
## the sampling scheme repeats
if(any(table(train_full$event) <= 1) | length(table(train_full$event)) <= (K-1)){
while(any(table(train_full$event) <= 1 | length(table(train_full$event)) <= (K-1))){
test_index <- sample(1:nrow(Y), size = Ntest)
testing <- Y[test_index,]
training <- Y[-test_index, ]
train_full_index <- sample(1:nrow(training), size = Ntrain)
train_missing <- training[-train_full_index, ]
train_full <- training[train_full_index, ]
}
}
## the true event category for the testing set is saved as a seperate variable,
## and deleted from the data.frame containing the data
test_truth <- testing$event
testing$event <- NULL
## Entries are randomly selected for deletion from the testing set.
## This scheme ensures a single discriminant for each observation
## not deleted in order to reduce computation time.
abs_present <- sample(size = Ntest, 1:p, replace = TRUE)
missing_pool <- matrix(1:p, ncol = p, nrow = Ntest, byrow = TRUE)
missing_pool <- t(apply(cbind(missing_pool, abs_present), 1, function(X,pp){
X[-c(X[pp + 1], pp +1)]
}, pp = p))
missing_pool_save <- missing_pool
frac_missing <- (p*tst_missing)/(p-1)
# sample which of the remaining elements will be missing
missing_sample <- sample(1:(nrow(missing_pool)*ncol(missing_pool)),
size = floor(nrow(missing_pool)*ncol(missing_pool)*(frac_missing)),
replace = FALSE)
missing_pool_save[missing_sample] <- NA
saved_data <- apply(cbind(missing_pool_save, unname(abs_present)), 1, function(X){
X[-which(is.na(X))]
})
for(j in 1:nrow(testing)){
testing[j,-c(saved_data[[j]])] <- NA
}
## Entries are randomly selected for deletion from the training set.
## This scheme ensures a single discriminant for each observation
## not deleted in order to reduce computation time.
abs_present <- sample(size = Ntrain_missing, 1:p, replace = TRUE)
abs_missing <- matrix(1:p, ncol = p, nrow = Ntrain_missing, byrow = TRUE)
abs_missing <- t(apply(cbind(abs_missing, abs_present), 1, function(X,pp){
X[-c(X[pp + 1], pp +1)]
}, pp = p))
abs_missing <- apply(abs_missing, 1, function(X){
sample(X, size = 1)
})
missing_pool <- matrix(1:p, ncol = p, nrow = Ntrain_missing, byrow = TRUE)
missing_pool <- t(apply(cbind(missing_pool, abs_present, abs_missing), 1, function(X,pp){
X[-c(X[pp + 1], X[pp + 2], pp +1, pp + 2)]
}, pp = p))
missing_pool_save <- missing_pool
frac_missing <- (p*trn_missing - 1)/(p-2)
# sample which of the remaining elements will be missing
missing_sample <- sample(1:(nrow(missing_pool)*ncol(missing_pool)), size = floor(nrow(missing_pool)*ncol(missing_pool)*(frac_missing)), replace = FALSE)
missing_pool_save[missing_sample] <- NA
saved_data <- apply(cbind(missing_pool_save, unname(abs_present)), 1, function(X){
X[-which(is.na(X))]
})
for(j in 1:nrow(train_missing)){
train_missing[j,-c(saved_data[[j]], p+1)] <- NA
}
return(list(Y = list(train_full = train_full, train_missing = train_missing,
testing = testing, test_truth = test_truth),
params = list(mu = mu_use, Sig = S, vic = vic)))
}
## -----------------------------------------------------------------------------
## Data Parameters
tst_missing <- 0.5
trn_missing <- 0.5
Ntrain <- 25
Ntest <- 100
Ntrain_missing <- 5 * Ntrain
K <- 3
## -----------------------------------------------------------------------------
P <- c(4,6)
#P <- c(4,6,8,10)
## -----------------------------------------------------------------------------
alphatilde <- 0.05
## -----------------------------------------------------------------------------
mixture_weights <- "training"
## -----------------------------------------------------------------------------
C01 <- matrix(c(0,1,1, 0), ncol = 2, nrow = 2)
C01
## -----------------------------------------------------------------------------
Cfneg <- C01
Cfneg[1,2] <- 2
Cfneg
## -----------------------------------------------------------------------------
Ccat <- matrix(1, ncol = 3, nrow = 3) - diag(3)
Ccat
## -----------------------------------------------------------------------------
iters <- 3#250
## -----------------------------------------------------------------------------
## MCMC parameters
BT <- c(150, 2150)
#BT <- c(500, 50500)
## -----------------------------------------------------------------------------
thinning <- 5
## -----------------------------------------------------------------------------
## Experimental Parameters
verb <- FALSE
#verb <- TRUE
## -----------------------------------------------------------------------------
## Data structures for saving progress
cECM_recfp <- cECM_recfn <- bayes_rec <- cECM_rec <- matrix(NA, ncol = length(P), nrow = iters)
Nvic <- rep(0, times = length(P))
exp_out <- list()
method_template <- data.frame(matrix(NA, ncol = 3, nrow = iters))
names(method_template) <- c("accurate", "FN", "FP")
data_template <- data.frame(matrix(NA, ncol = 2, nrow = iters))
names(data_template) <- c("Ntest", "Nvic")
data_template$Ntest <- Ntest
p_template <- list(cECM = method_template, becm = method_template,
mbecm = method_template, mbecm_Cfn = method_template,
mbecm_cat = method_template, data = data_template)
bayes_rec <- cECM_rec <- matrix(NA, ncol = length(P), nrow = iters)
if(verb){
toc <- Sys.time()
}
## -----------------------------------------------------------------------------
## Iterates over the differing numbers of total discriminants set for the experiment.
for(p in P){
## Builds the list for saving the results using a template list
exp_out[[p]] <- p_template
## Runs each model for `iters` number of independent data sets.
for(i in 1:iters){
## The i^{th} run for p discriminants
if(verb){
## set the experimental parameter verb <- TRUE to print progress
print(paste0("i = ", i, ", p = ", p, ", ", round(Sys.time() - toc, digits = 2), " ", units(Sys.time() - toc), " elapsed"))
}
## Generate random data set
Ylist <- data_gen(p = p, K = K, Ntest = Ntest, Ntrain = Ntrain,
Ntrain_missing = Ntrain_missing, tst_missing = tst_missing,
trn_missing = trn_missing)
## Saves the random data information to the environment
train_full<- Ylist$Y$train_full
train_missing <- Ylist$Y$train_missing
testing <- Ylist$Y$testing
test_truth <- Ylist$Y$test_truth
## Which category is the important one this time?
vic <- Ylist$params$vic
## Save the true total number of `vic` events in the testing data to be used
## later for analyzing performance.
exp_out[[p]][["data"]]$Nvic[i] <- sum(test_truth == as.character(vic))
## Fit the classical ECM model, apply the decision framework with the
## `cECM_decision()` function, then save the results
cECM <- cECM(x = train_full, transform = TRUE, newdata = testing)
cECM_out <- apply(cECM_decision(pval = cECM, alphatilde = alphatilde,
vic = as.character(vic),
cat_truth = test_truth)$events[,1:3] ,2,
sum, na.rm = TRUE)
exp_out[[p]][["cECM"]][i,] <- unname(cECM_out)
## Fit the B-ECM model, using only full p observations
bayes_fit <- BayesECM(Y = train_full)
## Run the predict function on the testing set.
## If there were multiple testing sets, the same model fit could be used on
## each one without having to rerun the `BayesECM()` function. This
## functionality is more important when using training data with missing
## entries.
bayes_pred <- predict(bayes_fit, Ytilde = testing,
mixture_weights = mixture_weights)
## The "becm_desision()" function applies the decision theoretic framework
## to the training and testing data. For one training and one testing set,
## where the user wants to try different values of `alphatilde` and `C`, it is
## not necessary to rerun the `BayesECM()` function or the `predict()`
## function.
becm_out <- becm_decision(bayes_pred = bayes_pred, alphatilde = alphatilde,
vic = as.character(vic), cat_truth = test_truth,
pn = TRUE, C = C01)
## Summarize and save the data.
becm_out <- apply(becm_out$results,2, sum, na.rm = TRUE)
exp_out[[p]][["becm"]][i,] <- unname(becm_out)
## Fit and save the B-ECM model that includes missing data
bayes_fit_missing <- BayesECM(Y = rbind(train_full, train_missing), BT = BT,
verb = verb)
bayes_pred_missing <- predict(bayes_fit_missing, Ytilde = testing,
thinning = thinning,
mixture_weights = mixture_weights)
missing_out <- becm_decision(bayes_pred = bayes_pred_missing, alphatilde = alphatilde,
vic = as.character(vic), cat_truth = test_truth,
pn = TRUE, C = C01)
mbecm_out <- apply(missing_out$results,2, sum, na.rm = TRUE)
exp_out[[p]][["mbecm"]][i,] <- unname(mbecm_out)
## The rest of the B-ECM variants are different through decision theory,
## not the model fit. All use partial observations for training.
## Note that the rej argument is supplied to becm_decision to reduce computation time
## Record the decision when the loss matrix is adjusted to target
## false negatives.
Cfn_out <- becm_decision(bayes_pred = bayes_pred_missing, alphatilde = alphatilde,
vic = as.character(vic), cat_truth = test_truth,
pn = TRUE, C = Cfneg, rej = missing_out$rej)
becm_Cfn_out <- apply(Cfn_out$results,2, sum, na.rm = TRUE)
exp_out[[p]][["mbecm_Cfn"]][i,] <- unname(becm_Cfn_out)
## Record the decisions when full class (K = 3) categorization is used
## instead of binary categorization
cat_out <- becm_decision(bayes_pred = bayes_pred_missing, alphatilde = alphatilde,
vic = as.character(vic), cat_truth = test_truth,
pn = TRUE, C = Ccat, rej = missing_out$rej)
becm_cat_out <- apply(cat_out$results,2, sum, na.rm = TRUE)
exp_out[[p]][["mbecm_cat"]][i,] <- unname(becm_cat_out)
}
}
## -----------------------------------------------------------------------------
ECM_boxplot <- function(exp_out, P = P, models = c("cECM", "becm",
"mbecm",
"mbecm_Cfn",
"mbecm_cat"),
cols = NULL,
legend_text = c("C-ECM", "B-ECM", "M-B-ECM",
bquote("M-B-ECM, " * C["1,2"] == 2), "M-B-ECM Cat"),
metric = "accurate"){
if(metric == "accurate"){
divisor <- function(exp_out, p){
return(exp_out[[p]]$data$Ntest)
}
ylab <- "Model Accuracy"
}else if(metric == "FN"){
divisor <- function(exp_out,p){
return(exp_out[[p]]$data$Nvic)
}
ylab <- "False Negative Rate"
}else if(metric == "FP"){
divisor <- function(exp_out, p){
return(exp_out[[p]]$data$Ntest - exp_out[[p]]$data$Nvic)
}
ylab <- "False Positive Rate"
}else{
stop("Argument 'metric' must be one of the following case sensitive character strings: 'accurate', 'FN', or 'FP'.")
}
boxplotdf <- do.call("cbind", lapply(exp_out[[P[1]]][models], function(X, m = metric){
X[[m]]
}))/divisor(exp_out, p = P[1])
for(p in P[-1]){
boxplotdf <- cbind(boxplotdf,do.call("cbind", lapply(exp_out[[p]][models], function(X, m = metric){
X[[m]]
}))/divisor(exp_out, p = p))
}
boxplotdf <- boxplotdf
if(max(boxplotdf) > 65){
ylim <- c(min(boxplotdf), 1.1)
}else{
ylim <- range(boxplotdf) * c(0.9,1.2)
}
if(is.null(cols)){
if(length(models) == 5){
pltcols <- hcl.colors(44, palette = "viridis", rev = TRUE)[c(3, 10, 20, 30, 37)]
}else if(length(models) > 10){
warning("You should consider recoding this function with a different way to select the colors used for the plot.")
pltcols <- hcl.colors(44, palette = "viridis", rev = TRUE)[seq(from = 2, to = 38, length.out = length(models))]
}else{
pltcols <- hcl.colors(44, palette = "viridis", rev = TRUE)[seq(from = 2, to = 38, length.out = length(models))]
}
}else{
if(length(cols) != length(models)){
stop("If supplying colors, a vector the same length as the 'models' argument must be used.")
}
}
opar <- par(no.readonly = TRUE)
on.exit(expr = suppressWarnings(par(opar)))
par(mar = c(4.25,3.85,1,0.5))
lmodels <- length(models)
graphics::boxplot(boxplotdf,
at = (1:((lmodels + 1)*length(P)))[-seq(from = lmodels + 1,
to = length(P)*(lmodels + 1),
by = lmodels + 1)],
xaxt = "n", yaxt = "n", ylim = ylim,
col = pltcols, xlab = "Number of Discriminants", ylab = ylab)
py <- pretty(boxplotdf)
graphics::axis(2, py)
graphics::axis(1, at =seq(from = 1, to = length(P)*(lmodels + 1), by = lmodels + 1) + 1,
labels = paste0("p = ", P) )
graphics::legend("topleft", bty ="n",
legend = legend_text,
fill = pltcols, horiz = FALSE, ncol = 3, y.intersp = 1.25)
}
## -----------------------------------------------------------------------------
ECM_boxplot(exp_out = exp_out, P = P, metric = "accurate")
## -----------------------------------------------------------------------------
ECM_boxplot(exp_out = exp_out, P = P, metric = "FN")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.