syn-data-code.R
In ezECM: Event Categorization Matrix Classification for Nuclear Detonations

## ----include = FALSE----------------------------------------------------------
is_check <- ("CheckExEnv" %in% search()) || any(c("_R_CHECK_TIMINGS_",
             "_R_CHECK_LICENSE_") %in% names(Sys.getenv()))
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"#,eval = !is_check
)

## ----setup--------------------------------------------------------------------
library(ezECM)

## ----datagen------------------------------------------------------------------
data_gen <- function(p = NULL, K = 3, Ntest = NULL, Ntrain = NULL, 
                     Ntrain_missing = NULL, tst_missing = NULL, trn_missing = NULL){
  
    mu_use <- matrix(rnorm(n = p*K, sd = 0.5), ncol = K, nrow = p)
    Y <- list()
    S <- list()
    
    ## random number in each class
    
    N <- LaplacesDemon::rcat(n = Ntest + Ntrain + Ntrain_missing, 
                             p = rep(1/3, times = K))
    N <- as.vector(table(N))
    
    ## random very important category
    
    vic <- sample(1:K, size = 1)
    
    for(k in 1:K){
      
      ## random number of blocks in kth category
      
      nblocks <- sample(1:2, size = 1)
      
      ## random covariance matrix for kth category
      
      S[[k]] <- LaplacesDemon::rinvwishart(nu = p + 4, S = diag(p))
      
      
      ## If relevant, delete entries to form block covariance matrices of random sizes
      
      if(nblocks == 2){
        nblock1 <- sample(1:(p-1), size = 1)
        block1_members <- sample(1:p, size = nblock1, replace = FALSE)
        block2_members <- (1:p)[-block1_members]
        
        zero_elements <- expand.grid(block1_members, block2_members)
        
        S[[k]][zero_elements$Var1, zero_elements$Var2] <- 0
        S[[k]][zero_elements$Var2, zero_elements$Var1] <- 0
        
      }
      
      ## data for kth class is drawn from a MVN, given the mean and covariance
      Y[[k]] <- as.data.frame(LaplacesDemon::rmvn(n = N[k], mu = mu_use[,k], Sigma= S[[k]]))
      
      ## data is transformed to (0,1) using the logistic function to run a check
      Ytemp<- 1/(1+ exp(-Y[[k]]))
      
      ## if machine precision causes the output of the logistic function to round to 1, the experiment is stopped
      ## this has never happened
      if(max(apply(Ytemp,2,function(X){range(X)})) == 1){
        stop()
      }else{
        Y[[k]]<- 1/(1+ exp(-Y[[k]]))
      }
      
      ## a column for the event class is appended to the data.frame
      Y[[k]] <- cbind(Y[[k]], as.character(k))
      names(Y[[k]]) <- c(paste("p",as.character(1:p),  sep = ""), "event")
      
    }
    
    Y <- do.call("rbind", Y)
    
    ## A random sample of the data is taken to be the testing set
    test_index <- sample(1:nrow(Y), size = Ntest)
    testing <- Y[test_index,]
    
    ## The remainder is slated for training
    training <- Y[-test_index, ]
    
    ## A random sample of the training set is set aside to have missing entries,
    ## the remainder is set aside to be the fully observed training set
    train_full_index <- sample(1:nrow(training), size = Ntrain)
    train_missing <- training[-train_full_index, ]
    train_full <- training[train_full_index, ]
    
    ## If all event categories are not represented in the training set of full observations,
    ## the sampling scheme repeats
    if(any(table(train_full$event) <= 1) | length(table(train_full$event)) <= (K-1)){
      while(any(table(train_full$event) <= 1 | length(table(train_full$event)) <= (K-1))){
        test_index <- sample(1:nrow(Y), size = Ntest)
        
        testing <- Y[test_index,]
        training <- Y[-test_index, ]
        train_full_index <- sample(1:nrow(training), size = Ntrain)
        train_missing <- training[-train_full_index, ]
        train_full <- training[train_full_index, ]
      }
    }
    
    ## the true event category for the testing set is saved as a seperate variable,
    ## and deleted from the data.frame containing the data
    test_truth <- testing$event
    testing$event <- NULL
    
    ## Entries are randomly selected for deletion from the testing set.
    ## This scheme ensures a single discriminant for each observation
    ## not deleted in order to reduce computation time.
    
    abs_present <- sample(size = Ntest, 1:p, replace = TRUE)
    missing_pool <- matrix(1:p, ncol = p, nrow = Ntest, byrow = TRUE)
    
    missing_pool <- t(apply(cbind(missing_pool, abs_present), 1, function(X,pp){
      X[-c(X[pp + 1], pp +1)]
    }, pp = p))
    
    missing_pool_save <- missing_pool
    frac_missing <- (p*tst_missing)/(p-1)
    
    # sample which of the remaining elements will be missing
    missing_sample <- sample(1:(nrow(missing_pool)*ncol(missing_pool)), 
                             size = floor(nrow(missing_pool)*ncol(missing_pool)*(frac_missing)), 
                             replace = FALSE)
    
    missing_pool_save[missing_sample] <- NA
    
    saved_data <- apply(cbind(missing_pool_save, unname(abs_present)), 1, function(X){
      X[-which(is.na(X))]
    })
    
    for(j in 1:nrow(testing)){
      testing[j,-c(saved_data[[j]])] <- NA
    }
    
    ## Entries are randomly selected for deletion from the training set.
    ## This scheme ensures a single discriminant for each observation
    ## not deleted in order to reduce computation time.
    
    abs_present <- sample(size = Ntrain_missing, 1:p, replace = TRUE)
    abs_missing <- matrix(1:p, ncol = p, nrow = Ntrain_missing, byrow = TRUE)
    
    abs_missing <- t(apply(cbind(abs_missing, abs_present), 1, function(X,pp){
      X[-c(X[pp + 1], pp +1)]
    }, pp = p))
    
    abs_missing <- apply(abs_missing, 1, function(X){
      sample(X, size = 1)
    })
    
    missing_pool <- matrix(1:p, ncol = p, nrow = Ntrain_missing, byrow = TRUE)
    
    missing_pool <- t(apply(cbind(missing_pool, abs_present, abs_missing), 1, function(X,pp){
      X[-c(X[pp + 1], X[pp + 2], pp +1, pp + 2)]
    }, pp = p))
    
    missing_pool_save <- missing_pool
    frac_missing <- (p*trn_missing - 1)/(p-2)
    
    # sample which of the remaining elements will be missing
    missing_sample <- sample(1:(nrow(missing_pool)*ncol(missing_pool)), size = floor(nrow(missing_pool)*ncol(missing_pool)*(frac_missing)), replace = FALSE)
    
    
    missing_pool_save[missing_sample] <- NA
    
    saved_data <- apply(cbind(missing_pool_save, unname(abs_present)), 1, function(X){
      X[-which(is.na(X))]
    })
    
    for(j in 1:nrow(train_missing)){
      train_missing[j,-c(saved_data[[j]], p+1)] <- NA
    }
    
    return(list(Y = list(train_full = train_full, train_missing = train_missing, 
                         testing = testing, test_truth = test_truth), 
                params = list(mu = mu_use, Sig = S, vic = vic)))
    
}


## -----------------------------------------------------------------------------

## Data Parameters
tst_missing <- 0.5
trn_missing <- 0.5

Ntrain <- 25
Ntest <- 100
Ntrain_missing <- 5 * Ntrain

K <- 3

## -----------------------------------------------------------------------------
P <- c(4,6)
#P <- c(4,6,8,10)

## -----------------------------------------------------------------------------
alphatilde <- 0.05

## -----------------------------------------------------------------------------
mixture_weights <- "training"

## -----------------------------------------------------------------------------
C01 <- matrix(c(0,1,1, 0), ncol = 2, nrow = 2)
C01

## -----------------------------------------------------------------------------
Cfneg <- C01
Cfneg[1,2] <- 2
Cfneg

## -----------------------------------------------------------------------------
Ccat <- matrix(1, ncol = 3, nrow = 3) - diag(3)
Ccat

## -----------------------------------------------------------------------------
iters <- 3#250

## -----------------------------------------------------------------------------
## MCMC parameters
BT <- c(150, 2150)
#BT <- c(500, 50500)

## -----------------------------------------------------------------------------
thinning <- 5

## -----------------------------------------------------------------------------
## Experimental Parameters

verb <- FALSE
#verb <- TRUE

## -----------------------------------------------------------------------------

## Data structures for saving progress

cECM_recfp <- cECM_recfn <- bayes_rec <- cECM_rec <- matrix(NA, ncol = length(P), nrow = iters)
Nvic <- rep(0, times = length(P))

exp_out <- list()
method_template <- data.frame(matrix(NA, ncol = 3, nrow = iters))
names(method_template) <- c("accurate", "FN", "FP")
data_template <- data.frame(matrix(NA, ncol = 2, nrow = iters))
names(data_template) <- c("Ntest", "Nvic")
data_template$Ntest <- Ntest

p_template <- list(cECM = method_template, becm = method_template, 
                   mbecm = method_template, mbecm_Cfn = method_template, 
                   mbecm_cat = method_template, data = data_template)

bayes_rec <- cECM_rec <- matrix(NA, ncol = length(P), nrow = iters)

if(verb){
toc <- Sys.time()
}

## -----------------------------------------------------------------------------
## Iterates over the differing numbers of total discriminants set for the experiment.
for(p in P){
  
  ## Builds the list for saving the results using a template list
  exp_out[[p]] <- p_template
  
  ## Runs each model for `iters` number of independent data sets.
  for(i in 1:iters){
    ## The i^{th} run for p discriminants
    if(verb){    
      ## set the experimental parameter verb <- TRUE to print progress
      print(paste0("i = ", i, ", p = ", p, ", ", round(Sys.time() - toc, digits = 2), " ", units(Sys.time() - toc), " elapsed"))
    }
    
    ## Generate random data set
    
      Ylist <- data_gen(p = p, K = K, Ntest = Ntest, Ntrain = Ntrain, 
                     Ntrain_missing = Ntrain_missing, tst_missing = tst_missing, 
                    trn_missing = trn_missing)
      
      ## Saves the random data information to the environment
      
      train_full<- Ylist$Y$train_full
      train_missing <- Ylist$Y$train_missing
      testing <- Ylist$Y$testing
      test_truth <- Ylist$Y$test_truth
      ## Which category is the important one this time?
      vic <- Ylist$params$vic
    
    
    ## Save the true total number of `vic` events in the testing data to be used
    ## later for analyzing performance.
    
    exp_out[[p]][["data"]]$Nvic[i] <- sum(test_truth == as.character(vic))
    
    ## Fit the classical ECM model, apply the decision framework with the
    ## `cECM_decision()` function, then save the results
    
    cECM <- cECM(x = train_full, transform = TRUE, newdata = testing)

    cECM_out <-  apply(cECM_decision(pval = cECM, alphatilde = alphatilde,
                                 vic = as.character(vic), 
                                 cat_truth = test_truth)$events[,1:3] ,2, 
                       sum, na.rm = TRUE)

    exp_out[[p]][["cECM"]][i,] <- unname(cECM_out)
    
    ## Fit the B-ECM model, using only full p observations
    
    bayes_fit <- BayesECM(Y = train_full)
    
    ## Run the predict function on the testing set.
    ## If there were multiple testing sets, the same model fit could be used on
    ## each one without having to rerun the `BayesECM()` function.  This
    ## functionality is more important when using training data with missing
    ## entries.
    
    bayes_pred <- predict(bayes_fit, Ytilde = testing, 
                          mixture_weights = mixture_weights)
    
    ## The "becm_desision()" function applies the decision theoretic framework
    ## to the training and testing data.  For one training and one testing set,
    ## where the user wants to try different values of `alphatilde` and `C`, it is
    ## not necessary to rerun the `BayesECM()` function or the `predict()`
    ## function.
    
    becm_out <- becm_decision(bayes_pred = bayes_pred, alphatilde = alphatilde,
                                vic = as.character(vic), cat_truth = test_truth, 
                              pn = TRUE, C = C01)
    
    ## Summarize and save the data.
    
    becm_out <- apply(becm_out$results,2, sum, na.rm = TRUE)
    
    exp_out[[p]][["becm"]][i,] <- unname(becm_out)
    
    
    ## Fit and save the B-ECM model that includes missing data
    
    bayes_fit_missing <- BayesECM(Y = rbind(train_full, train_missing), BT = BT, 
                                  verb = verb)
    
    bayes_pred_missing <- predict(bayes_fit_missing, Ytilde = testing, 
                                  thinning = thinning, 
                                  mixture_weights = mixture_weights)
    
    missing_out <- becm_decision(bayes_pred = bayes_pred_missing, alphatilde = alphatilde,
                             vic = as.character(vic), cat_truth = test_truth, 
                             pn = TRUE, C = C01)
    mbecm_out <- apply(missing_out$results,2, sum, na.rm = TRUE)
    
    
    exp_out[[p]][["mbecm"]][i,] <- unname(mbecm_out)
    
    ## The rest of the B-ECM variants are different through decision theory,
    ## not the model fit.  All use partial observations for training.
    ## Note that the rej argument is supplied to becm_decision to reduce computation time
    
    ## Record the decision when the loss matrix is adjusted to target
    ## false negatives.
    
    Cfn_out <- becm_decision(bayes_pred = bayes_pred_missing, alphatilde = alphatilde,
                         vic = as.character(vic), cat_truth = test_truth, 
                         pn = TRUE, C = Cfneg, rej = missing_out$rej)
    becm_Cfn_out <- apply(Cfn_out$results,2, sum, na.rm = TRUE)
    
    
    exp_out[[p]][["mbecm_Cfn"]][i,] <- unname(becm_Cfn_out)
    
    ## Record the decisions when full class (K = 3) categorization is used
    ## instead of binary categorization
    
    cat_out <-  becm_decision(bayes_pred = bayes_pred_missing, alphatilde = alphatilde,
                          vic = as.character(vic), cat_truth = test_truth, 
                          pn = TRUE, C = Ccat, rej = missing_out$rej)
    becm_cat_out <- apply(cat_out$results,2, sum, na.rm = TRUE)
    
    
    exp_out[[p]][["mbecm_cat"]][i,] <- unname(becm_cat_out)

    
  }
  
}




## -----------------------------------------------------------------------------

ECM_boxplot <- function(exp_out, P = P, models = c("cECM", "becm", 
                                                   "mbecm", 
                                                   "mbecm_Cfn",
                                                   "mbecm_cat"),
                        cols = NULL,
                        legend_text =  c("C-ECM", "B-ECM", "M-B-ECM", 
                  bquote("M-B-ECM, " * C["1,2"] == 2), "M-B-ECM Cat"),
                  metric = "accurate"){
  
  if(metric == "accurate"){
    divisor <- function(exp_out, p){
      return(exp_out[[p]]$data$Ntest)
    }
    ylab <- "Model Accuracy"
  }else if(metric == "FN"){
    divisor <- function(exp_out,p){
      return(exp_out[[p]]$data$Nvic)
    }
    ylab <- "False Negative Rate"
  }else if(metric == "FP"){
    divisor <- function(exp_out, p){
      return(exp_out[[p]]$data$Ntest - exp_out[[p]]$data$Nvic)
    }
    ylab <- "False Positive Rate"
  }else{
    stop("Argument 'metric' must be one of the following case sensitive character strings: 'accurate', 'FN', or 'FP'.")
  }
   boxplotdf <- do.call("cbind", lapply(exp_out[[P[1]]][models], function(X, m = metric){
  X[[m]]
}))/divisor(exp_out, p = P[1])
  
   
  for(p in P[-1]){
  boxplotdf <- cbind(boxplotdf,do.call("cbind", lapply(exp_out[[p]][models], function(X, m = metric){
  X[[m]]
}))/divisor(exp_out, p = p))
  }
  
   boxplotdf <- boxplotdf
   
   if(max(boxplotdf) > 65){
     ylim <- c(min(boxplotdf), 1.1)
   }else{
    ylim <- range(boxplotdf) * c(0.9,1.2)
   }
   

if(is.null(cols)){
if(length(models) == 5){
pltcols <- hcl.colors(44, palette = "viridis", rev = TRUE)[c(3, 10, 20, 30, 37)]
}else if(length(models) > 10){
  warning("You should consider recoding this function with a different way to select the colors used for the plot.")
  pltcols <- hcl.colors(44, palette = "viridis", rev = TRUE)[seq(from = 2, to = 38, length.out = length(models))]
}else{
  pltcols <- hcl.colors(44, palette = "viridis", rev = TRUE)[seq(from = 2, to = 38, length.out = length(models))]
}
}else{
  if(length(cols) != length(models)){
    stop("If supplying colors, a vector the same length as the 'models' argument must be used.")
  }
}
   
opar <- par(no.readonly = TRUE)
on.exit(expr = suppressWarnings(par(opar)))
par(mar = c(4.25,3.85,1,0.5))
lmodels <- length(models)
graphics::boxplot(boxplotdf,  
                  at = (1:((lmodels + 1)*length(P)))[-seq(from = lmodels + 1, 
                                                          to = length(P)*(lmodels + 1), 
                                                          by = lmodels + 1)],
         xaxt = "n", yaxt = "n", ylim = ylim, 
        col = pltcols, xlab = "Number of Discriminants", ylab = ylab)
py <- pretty(boxplotdf)
graphics::axis(2, py)
graphics::axis(1, at =seq(from = 1, to = length(P)*(lmodels + 1), by = lmodels + 1) + 1, 
               labels =  paste0("p = ", P) )
graphics::legend("topleft", bty ="n", 
       legend = legend_text,  
       fill = pltcols, horiz = FALSE, ncol = 3, y.intersp = 1.25)
}



## -----------------------------------------------------------------------------
ECM_boxplot(exp_out = exp_out, P = P, metric = "accurate")

## -----------------------------------------------------------------------------
ECM_boxplot(exp_out = exp_out, P = P, metric = "FN")
Any scripts or data that you put into this service are public.
ezECM documentation built on Oct. 18, 2024, 1:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ezECM
Event Categorization Matrix Classification for Nuclear Detonations

inst/doc/syn-data-code.R
In ezECM: Event Categorization Matrix Classification for Nuclear Detonations

Try the ezECM package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ezECM Event Categorization Matrix Classification for Nuclear Detonations

inst/doc/syn-data-code.R In ezECM: Event Categorization Matrix Classification for Nuclear Detonations

Try the ezECM package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ezECM
Event Categorization Matrix Classification for Nuclear Detonations

inst/doc/syn-data-code.R
In ezECM: Event Categorization Matrix Classification for Nuclear Detonations