Privacy preserving Evaporative Cooling (PrivateEC), Consensus Nested Cross Validation (cnCV), and Regular Nested Cross Validation (rnCV) for feature selection and classification with Relief-F and Random Forests

PrivateEC methods are described in the following publication.

Trang T. Le, W. K. Simmons, M. Misaki, B.C. White, J. Savitz, J. Bodurka, and B. A. McKinney. "Differential privacy-based evaporative cooling feature selection and classification with Relief-F and Random Forests," Bioinformatics, Volume 33, Issue 18, 15 September 2017, Pages 2906–2913. free

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(privateEC)

Simulation example comparing private Evaporative Cooling (pEC) with standard nested cross-validation (nCV) and a new consensus-feature nested cross validation (cnCV)

Simulate Balanced Case-Control Data with Numeric Predictors with Main Effects

library(privateEC)
n.samples <- 300     # 100 samples in train/holdout/test
n.variables <- 100   # 100 features
label <- "class"
type <- "mainEffect" # main effect simulatios
bias <- 0.6          # moderate effect size
pct.signals <- 0.1   # pct functional features
verbose <- FALSE

data.sets <- createSimulation(num.samples = n.samples,
                              num.variables = n.variables,
                              pct.signals = pct.signals,
                              label = label,
                              bias = bias,
                              pct.train = 1/3,
                              pct.holdout = 1/3,
                              pct.validation = 1/3,
                              sim.type = type,
                              save.file = NULL,
                              verbose = verbose)

Run pEC on simulated data

update.freq <- 5   # attributes evaporated per iteration
num_tree <- 500
importance.algorithm <- "ReliefFequalK"

pec.result <- privateEC(train.ds = data.sets$train,                   # training split
                        holdout.ds = data.sets$holdout,               # holdout split
                        validation.ds = data.sets$validation,         # independent validation set
                        label = data.sets$label,                      # class label
                        importance.algorithm = importance.algorithm,  # feature selection method (ReliefF)
                        relief.k.method = "k_half_sigma",             # ReliefF nearest neighbor k estimate
                        rf.ntree = num_tree,                          # random forest classifier
                        is.simulated = TRUE,                          # indicates there will be a validation set
                        bias = bias,                                  # pEC privacy temperature noise
                        update.freq = update.freq,                    # number of attributes to evaporate per step
                        save.file = NULL,                             # save results
                        signal.names = data.sets$signal.names,        # functional variable names
                        use.nestedCV = F,
                        verbose = verbose)

pEC results

Table of iterations

knitr::kable(pec.result$algo.acc, caption = "Evaporation Iterations",
             row.names = FALSE, digits = 3)

Plot of results

plotRunResults(pec.result)

pEC selected features

cat("\n Max Holdout Accuracy Step [",which.max(pec.result$algo.acc$holdout.acc),"]\n")
cat("\n Accuracies: ")
print(pec.result$algo.acc[which.max(pec.result$algo.acc$holdout.acc),])
cat("\n Selected Features \n [",pec.result$atts.remain[[which.max(pec.result$algo.acc$holdout.acc)]],"]\n")

Run regular nested CV

rncv_result <- regular_nestedCV(train.ds = data.sets$train, 
                                validation.ds =  data.sets$holdout, 
                                label = data.sets$label,
                                is.simulated = TRUE,
                                ncv_folds = c(10, 10),
                                param.tune = FALSE,
                                learning_method = "rf", 
                                importance.algorithm = importance.algorithm,
                                num_tree = num_tree,
                                verbose = verbose)

rnCV results

cat("\n Train Accuracy [",rncv_result$cv.acc,"]\n")
cat("\n Validation Accuracy [",rncv_result$Validation,"]\n")
cat("\n Selected Features \n [",rncv_result$Features,"]\n")
cat("\n Elapsed Time [",rncv_result$Elapsed,"]\n")

Run consensus nested CV

cncv_result <- consensus_nestedCV(train.ds = data.sets$train, 
                                  validation.ds =  data.sets$holdout, 
                                  label = data.sets$label,
                                  is.simulated = TRUE,
                                  ncv_folds = c(10, 10),
                                  param.tune = FALSE,
                                  learning_method = "rf", 
                                  importance.algorithm = importance.algorithm,
                                  num_tree = num_tree,
                                  verbose = verbose)

cnCV results

cat("\n Train Accuracy [",cncv_result$cv.acc,"]\n")
cat("\n Validation Accuracy [",cncv_result$Validation,"]\n")
cat("\n Selected Features \n [",cncv_result$Features,"]\n")
cat("\n Elapsed Time [",cncv_result$Elapsed,"]\n")

Simulate Imbalanced Data (parameter in effect: pct.imbalance, where 0/1 means all ctrls/cases)

n.samples <- 100
n.variables <- 100
label <- "class"
pct.imbalance = 0.56
type <- "mainEffect"
importance.algorithm <- "ReliefFequalK"
bias <- 0.4
pct.signals <- 0.1
update.freq <- 5
num_tree <- 500
verbose <- FALSE

data.sets <- createSimulation(num.samples = n.samples,
                              num.variables = n.variables,
                              pct.signals = pct.signals,
                              label = label,
                              pct.imbalance = pct.imbalance,
                              bias = bias,
                              pct.train = 1/3,
                              pct.holdout = 1/3,
                              pct.validation = 1/3,
                              sim.type = type,
                              save.file = NULL,
                              verbose = verbose)

Run pEC on imbalanced simulated data

pec.result <- privateEC(train.ds = data.sets$train,
                        holdout.ds = data.sets$holdout,
                        validation.ds = data.sets$validation,
                        label = data.sets$label,
                        importance.algorithm = importance.algorithm,
                        rf.ntree = num_tree,
                        is.simulated = TRUE,
                        bias = bias,
                        update.freq = update.freq,
                        save.file = NULL,
                        signal.names = data.sets$signal.names,
                        verbose = verbose)

pEC results

Table of iterations

knitr::kable(pec.result$algo.acc, caption = "Algorithm Iterations",
             row.names = FALSE, digits = 3)

Plot of results

plotRunResults(pec.result)

pEC selected features

cat("\n Selected Features \n [",pec.result$atts.remain[[which.max(pec.result$algo.acc$holdout.acc)]],"]\n")

Run regular nested CV

rncv_result <- regular_nestedCV(train.ds = data.sets$train, 
                                validation.ds =  data.sets$holdout, 
                                label = data.sets$label,
                                is.simulated = TRUE,
                                ncv_folds = c(10, 10),
                                param.tune = FALSE,
                                learning_method = "rf", 
                                importance.algorithm = importance.algorithm,
                                num_tree = num_tree,
                                verbose = verbose)

rnCV results

cat("\n Train Accuracy [",rncv_result$cv.acc,"]\n")
cat("\n Validation Accuracy [",rncv_result$Validation,"]\n")
cat("\n Selected Features \n [",rncv_result$Features,"]\n")
cat("\n Elapsed Time [",rncv_result$Elapsed,"]\n")

Run consensus nested CV

cncv_result <- consensus_nestedCV(train.ds = data.sets$train, 
                                  validation.ds =  data.sets$holdout, 
                                  label = data.sets$label,
                                  is.simulated = TRUE,
                                  ncv_folds = c(10, 10),
                                  param.tune = FALSE,
                                  learning_method = "rf", 
                                  importance.algorithm = importance.algorithm,
                                  num_tree = num_tree,
                                  verbose = verbose)

cnCV results

cat("\n Train Accuracy [",cncv_result$cv.acc,"]\n")
cat("\n Validation Accuracy [",cncv_result$Validation,"]\n")
cat("\n Selected Features \n [",cncv_result$Features,"]\n")
cat("\n Elapsed Time [",cncv_result$Elapsed,"]\n")

Simulate Mix Data (parameter in effect: pct.imbalance, where 0/1 means all ctrls/cases, pct.mixed)

n.samples <- 100
n.variables <- 100
label <- "class"
pct.imbalance = 0.5
pct.mixed = 0.5
mixed.type <- c("mainEffect", "interactionScalefree")
importance.algorithm <- "ReliefFequalK"
bias <- 0.4
pct.signals <- 0.1
update.freq <- 5
num_tree <- 500
verbose <- FALSE

data.sets <- createMixedSimulation(num.samples = n.samples,
                                   num.variables = n.variables,
                                   pct.signals = pct.signals,
                                   label = label,
                                   pct.imbalance = pct.imbalance,
                                   pct.mixed = pct.mixed,
                                   mixed.type = mixed.type,
                                   bias = bias,
                                   pct.train = 1/3,
                                   pct.holdout = 1/3,
                                   pct.validation = 1/3,
                                   save.file = NULL,
                                   verbose = verbose)

Run pEC on mixed simulated data

pec.result <- privateEC(train.ds = data.sets$train,
                        holdout.ds = data.sets$holdout,
                        validation.ds = data.sets$validation,
                        label = data.sets$label,
                        importance.algorithm = importance.algorithm,
                        rf.ntree = num_tree,
                        is.simulated = TRUE,
                        bias = bias,
                        update.freq = update.freq,
                        save.file = NULL,
                        signal.names = data.sets$signal.names,
                        verbose = verbose)

pEC results

Table of iterations

knitr::kable(pec.result$algo.acc, caption = "Algorithm Iterations",
             row.names = FALSE, digits = 3)

Plot of results

plotRunResults(pec.result)

pEC selected features

cat("\n Selected Features \n [",pec.result$atts.remain[[which.max(pec.result$algo.acc$holdout.acc)]],"]\n")

Run regular nested CV

rncv_result <- regular_nestedCV(train.ds = data.sets$train, 
                                validation.ds =  data.sets$holdout, 
                                label = data.sets$label,
                                is.simulated = TRUE,
                                ncv_folds = c(10, 10),
                                param.tune = FALSE,
                                learning_method = "rf", 
                                importance.algorithm = importance.algorithm,
                                num_tree = num_tree,
                                verbose = verbose)

rnCV results

cat("\n Train Accuracy [",rncv_result$cv.acc,"]\n")
cat("\n Validation Accuracy [",rncv_result$Validation,"]\n")
cat("\n Selected Features \n [",rncv_result$Features,"]\n")
cat("\n Elapsed Time [",rncv_result$Elapsed,"]\n")

Run consensus nested CV

cncv_result <- consensus_nestedCV(train.ds = data.sets$train, 
                                  validation.ds =  data.sets$holdout, 
                                  label = data.sets$label,
                                  is.simulated = TRUE,
                                  ncv_folds = c(10, 10),
                                  param.tune = FALSE,
                                  learning_method = "rf", 
                                  importance.algorithm = importance.algorithm,
                                  num_tree = num_tree,
                                  verbose = verbose)

cnCV results

cat("\n Train Accuracy [",cncv_result$cv.acc,"]\n")
cat("\n Validation Accuracy [",cncv_result$Validation,"]\n")
cat("\n Selected Features \n [",cncv_result$Features,"]\n")
cat("\n Elapsed Time [",cncv_result$Elapsed,"]\n")

Simulate Quantitative Outcome Data (parameter in effect: label = "qtrait", type = "mainEffect", importance.algorithm = "RReliefFequalK")

n.samples <- 100
n.variables <- 100
label <- "qtrait"
type <- "mainEffect"
importance.algorithm <- "RReliefFequalK"
bias <- 0.4
pct.signals <- 0.1
update.freq <- 5
num_tree <- 500
verbose <- FALSE

data.sets <- createSimulation(num.samples = n.samples,
                              num.variables = n.variables,
                              pct.signals = pct.signals,
                              label = label,
                              bias = bias,
                              pct.train = 1/3,
                              pct.holdout = 1/3,
                              pct.validation = 1/3,
                              sim.type = type,
                              save.file = NULL,
                              verbose = verbose)

Run pEC on quantitative simulated data

pec.result <- privateEC(train.ds = data.sets$train,
                        holdout.ds = data.sets$holdout,
                        validation.ds = data.sets$validation,
                        label = data.sets$label,
                        importance.algorithm = importance.algorithm,
                        rf.ntree = num_tree,
                        is.simulated = TRUE,
                        bias = bias,
                        update.freq = update.freq,
                        save.file = NULL,
                        signal.names = data.sets$signal.names,
                        verbose = verbose)

pEC results

Table of iterations

knitr::kable(pec.result$algo.acc, caption = "Algorithm Iterations",
             row.names = FALSE, digits = 3)

Plot of results

plotRunResults(pec.result)

pEC selected features

cat("\n Selected Features \n [",pec.result$atts.remain[[which.max(pec.result$algo.acc$holdout.acc)]],"]\n")

Run regular nested CV

rncv_result <- regular_nestedCV(train.ds = data.sets$train, 
                                validation.ds =  data.sets$holdout, 
                                label = data.sets$label,
                                is.simulated = TRUE,
                                ncv_folds = c(10, 10),
                                param.tune = FALSE,
                                learning_method = "rf", 
                                importance.algorithm = importance.algorithm,
                                num_tree = num_tree,
                                verbose = verbose)

rnCV results

cat("\n Train R-squared [",rncv_result$cv.acc,"]\n")
cat("\n Validation R-squared [",rncv_result$Validation,"]\n")
cat("\n Selected Features \n [",rncv_result$Features,"]\n")
cat("\n Elapsed Time [",rncv_result$Elapsed,"]\n")

Run consensus nested CV

cncv_result <- consensus_nestedCV(train.ds =  data.sets$train, 
                                  validation.ds =  data.sets$holdou, 
                                  label = data.sets$label,
                                  is.simulated = TRUE,
                                  ncv_folds = c(10, 10),
                                  param.tune = FALSE,
                                  learning_method = "rf", 
                                  importance.algorithm = importance.algorithm,
                                  num_tree = num_tree,
                                  verbose = verbose)

cnCV results

cat("\n Train R-squared [",cncv_result$cv.acc,"]\n")
cat("\n Validation R-squared [",cncv_result$Validation,"]\n")
cat("\n Selected Features \n [",cncv_result$Features,"]\n")
cat("\n Elapsed Time [",cncv_result$Elapsed,"]\n")


insilico/privateEC documentation built on May 22, 2020, 5:12 p.m.