R/CBDA_Stopping_Criteria.pipeline.R
In CBDA: Compressive Big Data Analytics

Documented in CBDA_Stopping_Criteria.pipeline

#' @title
#' Stopping Criteria function for Compressive Big Data Analytics
#'
#' @description
#'  This CBDA function generates a stopping criteria for the *max_covs - min_covs* nested
#'  predictive models generated in the previous step. It also populates the CBDA object.

#' @param label This is the label appended to RData workspaces generated within the CBDA calls

#' @param Kcol_min Lower bound for the percentage of features-columns sampling (used for the Feature Sampling Range - FSR)

#' @param Kcol_max Upper bound for the percentage of features-columns sampling (used for the Feature Sampling Range - FSR)

#' @param Nrow_min 	Lower bound for the percentage of cases-rows sampling (used for the Case Sampling Range - CSR)

#' @param Nrow_max Upper bound for the percentage of cases-rows sampling (used for the Case Sampling Range - CSR)

#' @param misValperc Percentage of missing values to introduce in BigData (used just for testing, to mimic real cases).

#' @param M Number of the BigData subsets on which perform Knockoff Filtering and SuperLearner feature mining

#' @param workspace_directory Directory where the results and workspaces are saved

#' @param max_covs Top features to include in the Validation Step where nested models are tested

#' @param min_covs Minimum number of top features to include in the initial model for the Validation Step

#' @param lambda Fisher test threshold for MSE (=1.005 by default)

#' @return value

#' @export

CBDA_Stopping_Criteria.pipeline <- function(label = "CBDA_package_test" , Kcol_min = 5 , Kcol_max = 15,
                                            Nrow_min = 30 , Nrow_max = 50 , misValperc = 0, M = 3000 ,
                                            workspace_directory = tempdir(), max_covs = 100 , min_covs = 5,
                                            lambda = 1.005) {

  range_n <- range_k <- qa_ALL <- algorithm_list <- cmatrix_ALL_validation <- NULL

  message("STOPPING CRITERIA GENERATION STEP HAS STARTED !!")
  filename_specs <- file.path(workspace_directory,paste0(label,"_validation_info.RData"))
  #eval(parse(text=paste0("load(\"",workspace_directory,"/",label,"_info.RData\")")))
  load(filename_specs)

  filename <- file.path(workspace_directory,
                        paste0("CBDA_SL_M",M,"_miss",misValperc,"_n",range_n,
                               "_k",range_k,"_Light_",label,"_VALIDATION.RData"))
  load(filename)
  #eval(parse(text=paste0("load(\"",workspace_directory,"/CBDA_SL_M",M,"_miss",
  #                       misValperc,"_n",range_n,"_k",range_k,"_Light_",label,"_VALIDATION.RData\")")))

  qa_ALL_Validation <- NULL
  qa_ALL_Validation <- matrix(0,max_covs-min_covs+1,5)

  counter <- 1
  for(j_global in min_covs:max_covs)
  {
    eval(parse(text=paste0("qa_ALL_Validation[",counter,",1] <- ",j_global,"")))
    eval(parse(text=paste0("qa_ALL_Validation[",counter,",2] <- Accuracy_",j_global,"")))
    eval(parse(text=paste0("qa_ALL_Validation[",counter,",3] <- MSE_",j_global,"")))
    counter <- counter + 1
  }

  m_validation = matrix(0,max_covs,length(algorithm_list))
  for (i in min_covs:max_covs){
    eval(parse(text=paste0("m_validation[",i,",] <- m_validation_",i)))
    eval(parse(text=paste0("cmatrix_ALL_validation[[",i,"]] <- cmatrix_",i)))
  }
  print(qa_ALL_Validation)
  ## Stopping Criteria for Accuracy and MSE Performance Metrics
  ## Two more columns added with 0 (continue) and 1 (stop)
  StopAcc <- NULL
  StopMSE <- NULL
  for(i in 1:dim(qa_ALL_Validation)[1]-1)
  {
    # Simple improvement (1%,5%, 0.05% in Accuracy)
    ifelse((qa_ALL_Validation[i+1,2]/qa_ALL_Validation[i,2]) > lambda,
           StopAcc[i] <- "Keep Going", StopAcc[i] <- "Stop")
    # F of Fisher test
    ifelse((qa_ALL_Validation[i,3]/qa_ALL_Validation[i,3])/(qa_ALL_Validation[i+1,3]/qa_ALL_Validation[i+1,3])
           > stats::qf(.95, df1=qa_ALL_Validation[i,3], df2=qa_ALL_Validation[i+1,3]),
           StopMSE[i] <- "Keep Going", StopMSE[i] <- "Stop")
  }

  Stopping_Criteria <- data.frame(NumberOfTopFeatures=qa_ALL_Validation[,1],Inference_Acc=qa_ALL_Validation[,2],
                                  Inference_MSE = qa_ALL_Validation[,3] ,
                                  StopAcc=c(StopAcc,"NA"), StopMSE=c(StopMSE,"NA"))

  if(is.na(StopMSE[1]))
  {StopMSE <- StopAcc}
  CBDA_object <- NULL
  CBDA_object[[1]] <- qa_ALL
  CBDA_object[[2]] <- cmatrix_ALL_validation
  CBDA_object[[3]] <- algorithm_list
  CBDA_object[[4]] <- m_validation
  CBDA_object[[5]] <- Stopping_Criteria

  names(CBDA_object)[1] <- c("LearningTable")
  names(CBDA_object)[2] <- c("ConfusionMatrices")
  names(CBDA_object)[3] <- c("SuperLearnerLibrary")
  names(CBDA_object)[4] <- c("SuperLearnerCoefficients")
  names(CBDA_object)[5] <- c("ValidationTable")
  CBDA_object[[6]] <- as.numeric(as.character(CBDA_object$LearningTable[,1]))
  names(CBDA_object)[6] <- c("TopFeatures")

  for (j in min_covs:max_covs)
  {
    eval(parse(text=paste0("rm(cmatrix_",j,")")))
    eval(parse(text=paste0("rm(KO_result_",j,")")))
    eval(parse(text=paste0("rm(Accuracy_",j,")")))
    eval(parse(text=paste0("rm(Classify)")))
    eval(parse(text=paste0("rm(Classify_MSE)")))
    eval(parse(text=paste0("rm(counter)")))
    eval(parse(text=paste0("rm(j)")))
    eval(parse(text=paste0("rm(i)")))
    eval(parse(text=paste0("rm(j_global)")))
    eval(parse(text=paste0("rm(k)")))
    eval(parse(text=paste0("rm(k_Acc)")))
    eval(parse(text=paste0("rm(k_ALL)")))
  }

  filename <- file.path(workspace_directory,
                        paste0("CBDA_SL_M",M,"_miss",misValperc,"_n",range_n,
                               "_k",range_k,"_Light_",label,"_VALIDATION.RData"))
  save(list = ls(all.names = TRUE), file = filename)

  cat("Performance metrics for the nested Predictive models.\n")
  cat("VALIDATION TABLE\n")
  print(CBDA_object$ValidationTable)
  cat("\n\nStopping Criteria completed successfully !!\n\n")

  return(CBDA_object)
}