Privacy preserving Evaporative Cooling (PrivateEC), Consensus Nested Cross Validation (cnCV), and Regular Nested Cross Validation (rnCV) for feature selection and classification with Relief-F and Random Forests

PrivateEC methods are described in the following publication.

Trang T. Le, W. K. Simmons, M. Misaki, B.C. White, J. Savitz, J. Bodurka, and B. A. McKinney. "Differential privacy-based evaporative cooling feature selection and classification with Relief-F and Random Forests," Bioinformatics, Volume 33, Issue 18, 15 September 2017, Pages 2906–2913. free

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(privateEC)

Simulation example comparing private Evaporative Cooling (pEC) with standard nested cross-validation (nCV) and a new consensus-feature nested cross validation (cnCV)

Simulate Balanced Case-Control Data with Numeric Predictors with Main Effects

library(privateEC)
n.samples <- 300     # 100 samples in train/holdout/test
n.variables <- 100   # 100 features
label <- "class"
type <- "mainEffect" # main effect simulatios
bias <- 0.6          # moderate effect size
pct.signals <- 0.1   # pct functional features
verbose <- FALSE

data.sets <- createSimulation(num.samples = n.samples,
                              num.variables = n.variables,
                              pct.signals = pct.signals,
                              label = label,
                              bias = bias,
                              pct.train = 1/3,
                              pct.holdout = 1/3,
                              pct.validation = 1/3,
                              sim.type = type,
                              save.file = NULL,
                              verbose = verbose)

Run pEC on simulated data

update.freq <- 5   # attributes evaporated per iteration
num_tree <- 500
importance.algorithm <- "ReliefFequalK"

pec.result <- privateEC(train.ds = data.sets$train,                  # training split
                        holdout.ds = data.sets$holdout,               # holdout split
                        validation.ds = data.sets$validation,         # validation set
                        label = data.sets$label,                      # class label
                        importance.algorithm = importance.algorithm,  # ReliefF
                        relief.k.method = "k_half_sigma",             # ReliefF knn
                        rf.ntree = num_tree,                          # random forest
                        is.simulated = TRUE,                          # validation set
                        bias = bias,                                  # pEC privacy
                        update.freq = update.freq,                    # num attr evaporated
                        save.file = NULL,                             # save results
                        signal.names = data.sets$signal.names,        # functional attr names
                        use.nestedCV = T,
                        verbose = verbose)

pEC results

Table of iterations

knitr::kable(pec.result$algo.acc, caption = "Evaporation Iterations",
             row.names = FALSE, digits = 3)

Plot of results

plotRunResults(pec.result)

pEC selected features

Best pEC model selected based on highest holdout accuracy. Train, holdout and independent validation accuracies of selected model shown below. Features in model are listed: simvar prefix variables are functional. Others are background.

multiple.max.indices <- which(pec.result$algo.acc$holdout.acc==max(pec.result$algo.acc$holdout.acc))
last.max <- multiple.max.indices[length(multiple.max.indices)]
cat("\n Max Holdout Accuracy Step [",last.max,"]\n")
cat("\n Accuracies: ")
print(pec.result$algo.acc[last.max,])
cat("\n Selected Features \n [",pec.result$atts.remain[[last.max]],"]\n")

Standard nested CV

rncv_result <- regular_nestedCV(train.ds = rbind(data.sets$train,data.sets$holdout), 
                                validation.ds =  data.sets$validation,
                                label = data.sets$label,
                                method.model = "classification",
                                is.simulated = TRUE,
                                ncv_folds = c(10, 10),
                                param.tune = FALSE,
                                learning_method = "rf", 
                                importance.algorithm = importance.algorithm,
                                relief.k.method = "k_half_sigma",             # ReliefF knn
                                wrapper = "relief",
                                inner_selection_percent = 0.2,
                                inner_selection_positivescores = TRUE,
                                num_tree = num_tree,
                                verbose = verbose)

nested CV results

cat("\n Train Accuracy [",rncv_result$cv.acc,"]\n")
cat("\n Validation Accuracy [",rncv_result$Validation,"]\n")
cat("\n Selected Features \n [",rncv_result$Features,"]\n")
cat("\n Elapsed Time [",rncv_result$Elapsed,"]\n")

Consensus nested CV

cncv_result <- consensus_nestedCV(train.ds = rbind(data.sets$train,data.sets$holdout), 
                                  validation.ds =  data.sets$validation, 
                                  label = data.sets$label,
                                  method.model = "classification",
                                  is.simulated = TRUE,
                                  ncv_folds = c(10, 10),
                                  param.tune = FALSE,
                                  learning_method = "rf", 
                                  importance.algorithm = importance.algorithm,
                                  relief.k.method = "k_half_sigma",             # ReliefF knn
                                  wrapper = "relief",
                                  inner_selection_percent = 0.2,
                                  inner_selection_positivescores = TRUE,
                                  num_tree = num_tree,
                                  verbose = verbose)

cnCV results

cat("\n Nested Cross-Validation Accuracy [",cncv_result$cv.acc,"]\n")
cat("\n Validation Accuracy [",cncv_result$Validation,"]\n")
cat("\n Selected Features \n [",cncv_result$Features,"]\n")
cat("\n Elapsed Time [",cncv_result$Elapsed,"]\n")

Simulate Data with Quantitative Outcome as well as Numeric Predictors with Main Effects

library(privateEC)
n.samples <- 300     # 100 samples in train/holdout/test
n.variables <- 100   # 100 features
label <- "qtrait"
method.model <- "regression"
type <- "mainEffect" # main effect simulatios
bias <- 0.6          # moderate effect size
pct.signals <- 0.1   # pct functional features
verbose <- FALSE

data.sets <- createSimulation(num.samples = n.samples,
                              num.variables = n.variables,
                              pct.signals = pct.signals,
                              label = label,
                              bias = bias,
                              pct.train = 1/3,
                              pct.holdout = 1/3,
                              pct.validation = 1/3,
                              sim.type = type,
                              save.file = NULL,
                              verbose = verbose)

Run pEC on simulated data

update.freq <- 5   # attributes evaporated per iteration
num_tree <- 500
importance.algorithm <- "RReliefFequalK"

pec.result <- privateEC(train.ds = data.sets$train,                  # training split
                        holdout.ds = data.sets$holdout,               # holdout split
                        validation.ds = data.sets$validation,         # validation set
                        label = data.sets$label,                      # class label
                        method.model = method.model,                  # analysis goal
                        importance.algorithm = importance.algorithm,  # ReliefF
                        relief.k.method = "k_half_sigma",             # ReliefF knn
                        rf.ntree = num_tree,                          # random forest
                        is.simulated = TRUE,                          # validation set
                        bias = bias,                                  # pEC privacy
                        update.freq = update.freq,                    # num attr evaporated
                        save.file = NULL,                             # save results
                        signal.names = data.sets$signal.names,        # functional attr names
                        use.nestedCV = T,
                        verbose = verbose)

pEC results

Table of iterations

knitr::kable(pec.result$algo.acc, caption = "Evaporation Iterations",
             row.names = FALSE, digits = 3)

Plot of results

plotRunResults(pec.result)

pEC selected features

Best pEC model selected based on highest holdout R-Squared. Train, holdout and independent validation R-Squareds of selected model shown below. Features in model are listed: simvar prefix variables are functional. Others are background.

multiple.max.indices <- which(pec.result$algo.acc$holdout.acc==max(pec.result$algo.acc$holdout.acc))
last.max <- multiple.max.indices[length(multiple.max.indices)]
cat("\n Max Holdout R-Squared Step [",last.max,"]\n")
cat("\n R-Squared: ")
print(pec.result$algo.acc[last.max,])
cat("\n Selected Features \n [",pec.result$atts.remain[[last.max]],"]\n")

Standard nested CV

rncv_result <- regular_nestedCV(train.ds = rbind(data.sets$train,data.sets$holdout), 
                                validation.ds =  data.sets$validation,
                                label = data.sets$label,
                                method.model = method.model,
                                is.simulated = TRUE,
                                ncv_folds = c(10, 10),
                                param.tune = FALSE,
                                learning_method = "rf", 
                                importance.algorithm = importance.algorithm,
                                relief.k.method = "k_half_sigma",             # ReliefF knn
                                wrapper = "relief",
                                inner_selection_percent = 0.2,
                                inner_selection_positivescores = TRUE,
                                num_tree = num_tree,
                                verbose = verbose)

nested CV results

cat("\n Train R-Squared [",rncv_result$cv.acc,"]\n")
cat("\n Validation R-Squared [",rncv_result$Validation,"]\n")
cat("\n Selected Features \n [",rncv_result$Features,"]\n")
cat("\n Elapsed Time [",rncv_result$Elapsed,"]\n")

Consensus nested CV

cncv_result <- consensus_nestedCV(train.ds = rbind(data.sets$train,data.sets$holdout), 
                                  validation.ds =  data.sets$validation, 
                                  label = data.sets$label,
                                  method.model = method.model,
                                  is.simulated = TRUE,
                                  ncv_folds = c(10, 10),
                                  param.tune = FALSE,
                                  learning_method = "rf", 
                                  importance.algorithm = importance.algorithm,
                                  relief.k.method = "k_half_sigma",             # ReliefF knn
                                  wrapper = "relief",
                                  inner_selection_percent = 0.2,
                                  inner_selection_positivescores = TRUE,
                                  num_tree = num_tree,
                                  verbose = verbose)

cnCV results

cat("\n Train R-Squared [",cncv_result$cv.acc,"]\n")
cat("\n Validation R-Squared [",cncv_result$Validation,"]\n")
cat("\n Selected Features \n [",cncv_result$Features,"]\n")
cat("\n Elapsed Time [",cncv_result$Elapsed,"]\n")


insilico/privateEC documentation built on May 22, 2020, 5:12 p.m.