#
# nested cross-validation for feature-selection/parameter-tuning
# Saeid Parvandeh, April 2018
#
#############################################
##### Consensus nested cross validation #####
#--------------------------------------------
#' Consensus nested cross validation for feature selection and parameter tuning
#' @param train.ds A training data frame with last column as outcome
#' @param validation.ds A validation data frame with last column as outcome
#' @param label A character string giving the outcome variable column name.
#' @param method.model Analysis type (string): "classification" or "regression". If the analysis goal is classification, make the outcome column a factor type.
#' For regression, make the outcome column numeric.
#' @param is.simulated A TRUE or FALSE flag indicating whether the data are simulated
#' @param ncv_folds A numeric vector to indicate nested cv folds: c(k_outer, k_inner)
#' @param param.tune A TRUE or FALSE flag for tuning parameters
#' @param learning_method Name of the method: glmnet/xgbTree/rf
#' @param importance.algorithm A character vector containing a specific importance algorithm subtype
#' @param wrapper Feature selection algorithm: rf, glmnet, ttest, centrality methods (PageRank, Katz,
#' EpistasisRank, and EpistasisKatz from the Rinbix package), the ReliefF family, etc.
#' @param inner_selection_percent Percentage of features to be selected in each inner fold.
#' @param inner_selection_positivescores A TRUE or FALSE flag to select features with positive scores (if FALSE, the percentage method is used).
#' @param tune.inner_selection_percent A sequence vector of possible percentages for tuning
#' @param tune.k A sequence vector of k values to tune the number of nearest neighbors in the relief method; if TRUE the default grid is seq(1,kmax), where kmax=floor((m-1)/2)
#' and m is the number of samples. However, this kmax is for balanced data. If the data are imbalanced, where m_minority + m_majority = m, then kmax = floor(m_minority-1).
#' Default is FALSE.
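#' For example, with m = 100 balanced samples the default grid would be seq(1, 49),
#' since kmax = floor((100-1)/2) = 49 (illustrative arithmetic only).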
#' @param tuneGrid A data frame with possible tuning values. The columns are named the same as the tuning parameters.
#' This is a caret library parameter; for more information refer to http://topepo.github.io/caret/available-models.html.
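#' For example, a plausible grid for learning_method = "glmnet" is
#' expand.grid(alpha = c(0, 0.5, 1), lambda = 10^seq(-3, 0, length.out = 5)); these values are illustrative, not defaults.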
#' @param relief.k.method A character or numeric value indicating the number of nearest neighbors for the relief algorithm.
#' Possible characters are: k_half_sigma (floor((num.samp-1)*0.154)), m6 (floor(num.samp/6)),
#' myopic (floor((num.samp-1)/2)), and m4 (floor(num.samp/4))
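#' For example, with num.samp = 60: k_half_sigma gives floor(59*0.154) = 9, m6 gives 10,
#' myopic gives 29, and m4 gives 15 (illustrative arithmetic only).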
#' @param num_tree Number of trees in the random forest and xgboost methods
#' @param xgb.obj xgboost objective function, e.g., "binary:logistic"
#' @param covars_vec An optional vector of covariate values (one per sample); when supplied, inner-fold selections are adjusted with npdr
#' @param covars.pval.adj Adjusted p-value threshold for keeping features after npdr covariate adjustment (default 0.05)
#' @param verbose A flag indicating whether verbose output should be sent to stdout
#' @return A list with:
#' \describe{
#' \item{cv.acc}{Training data accuracy}
#' \item{Validation}{Validation data accuracy}
#' \item{Features}{Features selected by the nested cross-validation}
#' \item{Train_model}{Training model to use for validation}
#' \item{Elapsed}{total elapsed time}
#' }
#' @examples
#' num.samples <- 100
#' num.variables <- 100
#' pct.signals <- 0.1
#' label <- "class"
#' sim.data <- createSimulation(num.samples = num.samples,
#' num.variables = num.variables,
#' pct.signals = pct.signals,
#' sim.type = "mainEffect",
#' label = label,
#' verbose = FALSE)
#' cnCV.results <- consensus_nestedCV(train.ds = sim.data$train,
#' validation.ds = sim.data$holdout,
#' label = label,
#' is.simulated = TRUE,
#' ncv_folds = c(10, 10),
#' param.tune = FALSE,
#' learning_method = "rf",
#' importance.algorithm = "ReliefFbestK",
#' num_tree = 500,
#' verbose = FALSE)
#' @family nestedCV
#' @export
consensus_nestedCV <- function(train.ds = NULL,
validation.ds = NULL,
label = "class",
method.model = "classification",
is.simulated = TRUE,
ncv_folds = c(10, 10),
param.tune = FALSE,
learning_method = "rf",
xgb.obj = "binary:logistic",
importance.algorithm = "ReliefFequalK",
wrapper = "relief",
inner_selection_percent = NULL,
inner_selection_positivescores = TRUE,
tune.inner_selection_percent = NULL,
tune.k = FALSE,
tuneGrid = NULL,
relief.k.method = "k_half_sigma",
num_tree = 500,
covars_vec = NULL,
covars.pval.adj = 0.05,
verbose = FALSE){
# Define variables
tune_params <- NULL; accu_vec <- NULL
Train_accu <- NULL; Test_accu <- NULL
relief_atts <- list()
ptm <- proc.time()
expr_data <- train.ds[, -ncol(train.ds)]
num_vars <- ncol(expr_data)
var_names <- colnames(expr_data)
if(verbose){cat("\nRunning consensus nested cross-validation...\n")}
if(verbose){cat("\n Create [",ncv_folds[1],"] outer folds\n")}
outer_folds <- caret::createFolds(train.ds[, label], ncv_folds[1], list = FALSE)
for (i in 1:ncv_folds[1]){
atts <- list()
if(verbose){cat("\n Create [",ncv_folds[2],"] inner folds of outer fold[",i,"]\n")}
inner_folds <- caret::createFolds(train.ds[, label][outer_folds!=i], ncv_folds[2], list = TRUE)
if(verbose){cat("\n Feature Selection...\n")}
for (j in 1:length(inner_folds)){
inner_idx <- which(outer_folds!=i)[-inner_folds[[j]]]
if (isTRUE(tune.k)){
m <- table(train.ds[inner_idx, label])
if (m[1]==m[2]){
kmax <- floor((sum(m)-1)/2)
} else {
m_minority <- m[which.min(m)]
kmax <- floor(m_minority-1)
}
tuneK <- seq(1,kmax)
} else if (isFALSE(tune.k)){
tuneK <- NULL
} else {
tuneK <- tune.k # a user-supplied grid of k values
}
# If a numeric value is supplied for relief.k.method, use it for k,
# but make sure it is not larger than floor((num.samp-1)/2) for the
# current data split. If it exceeds that threshold, set k to the
# maximum and warn: "ReliefF k too large. Using maximum."
if (is.numeric(relief.k.method)) {
if (relief.k.method > floor((dim(train.ds[inner_idx, ])[1]-1)/2)){
warning("ReliefF k too large. Using maximum.")
k <- floor((dim(train.ds[inner_idx, ])[1]-1)/2)
} else {
k <- relief.k.method
}
} else if (relief.k.method == "myopic"){
k <- floor((dim(train.ds[inner_idx, ])[1]-1)/2)
# where k may change based on num.samp for train, holdout...
} else if (relief.k.method == "m6") { # default "m6" method
k <- floor(dim(train.ds[inner_idx, ])[1]/6)
# where k may change based on num.samp for train, holdout...
} else if (relief.k.method == "m4") {
k <- floor(dim(train.ds[inner_idx, ])[1]/4)
} else { # "k_half_sigma" (the default)
k <- floor((dim(train.ds[inner_idx, ])[1]-1)*0.154)
}
# sanity check on fold sizes: too many folds leaves too few samples per fold
if (sum(ncv_folds)>(dim(train.ds[inner_idx, ])[1])/3){
stop("There are fewer than three observations in each fold")
}
k_inner_accs <- NULL
if (wrapper == "relief" && !is.null(tuneK)){
for (tk in tuneK) {
ranked_vars <- CORElearn::attrEval(label, train.ds[inner_idx, ],
estimator = importance.algorithm,
costMatrix = NULL,
outputNumericSplits=FALSE,
kNearestEqual = tk)
top_vars <- names(which(sort(ranked_vars, decreasing = TRUE)>0))
# random forest
inner_trn.data <- as.matrix(train.ds[inner_idx, top_vars])
inner_tst.data <- as.matrix(train.ds[-inner_idx, top_vars])
if(method.model == "classification"){
inner_trn.pheno <- as.factor(train.ds[, label][inner_idx])
inner_tst.pheno <- as.factor(train.ds[, label][-inner_idx])
} else {
inner_trn.pheno <- train.ds[, label][inner_idx]
inner_tst.pheno <- train.ds[, label][-inner_idx]
}
tune_rf.model <- randomForest::randomForest(inner_trn.data,
y = if(method.model == "classification"){as.factor(inner_trn.pheno)}else{inner_trn.pheno},
# randomForest defaults: sqrt(p) for classification, p/3 for regression
mtry = if (method.model == "classification"){
floor(sqrt(ncol(inner_trn.data)))
} else {
max(floor(ncol(inner_trn.data)/3), 1)
},
ntree = num_tree)
inner_Train_accu <- ifelse(method.model == "classification", 1 - mean(tune_rf.model$confusion[, "class.error"]),
stats::cor(as.numeric(as.vector(tune_rf.model$predicted)), inner_trn.pheno)^2)
# test
inner_test.pred <- stats::predict(tune_rf.model, newdata = inner_tst.data)
inner_Test_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(inner_test.pred), as.factor(inner_tst.pheno))$byClass["Balanced Accuracy"],
stats::cor(as.numeric(as.vector(inner_test.pred)), inner_tst.pheno)^2)
k_inner_accs <- rbind(k_inner_accs, data.frame(train=inner_Train_accu, test=inner_Test_accu))
}
# optimal k: smallest train/test accuracy gap (least overfitting)
k_inner_accs$trade_off <- abs(k_inner_accs$train-k_inner_accs$test)
k_inner_accs <- k_inner_accs[order(k_inner_accs$trade_off), ]
ktuned <- tuneK[as.integer(rownames(k_inner_accs)[1])]
}
inner_accs <- NULL
if (wrapper == "relief" && !is.null(tune.inner_selection_percent)){
for (step in tune.inner_selection_percent){
ranked_vars <- CORElearn::attrEval(label, train.ds[inner_idx, ],
estimator = importance.algorithm,
costMatrix = NULL,
outputNumericSplits=FALSE,
kNearestEqual = ifelse(!is.null(tuneK), ktuned, k))
# evaluations
wrapper.topN <- round(step*length(ranked_vars))
top_vars <- names(sort(ranked_vars, decreasing = TRUE)[1:wrapper.topN])
# random forest
inner_trn.data <- as.matrix(train.ds[inner_idx, top_vars])
inner_tst.data <- as.matrix(train.ds[-inner_idx, top_vars])
if(method.model == "classification"){
inner_trn.pheno <- as.factor(train.ds[, label][inner_idx])
inner_tst.pheno <- as.factor(train.ds[, label][-inner_idx])
} else {
inner_trn.pheno <- train.ds[, label][inner_idx]
inner_tst.pheno <- train.ds[, label][-inner_idx]
}
tune_rf.model <- randomForest::randomForest(inner_trn.data,
y = if(method.model == "classification"){as.factor(inner_trn.pheno)}else{inner_trn.pheno},
# randomForest defaults: sqrt(p) for classification, p/3 for regression
mtry = if (method.model == "classification"){
floor(sqrt(ncol(inner_trn.data)))
} else {
max(floor(ncol(inner_trn.data)/3), 1)
},
ntree = num_tree)
inner_Train_accu <- ifelse(method.model == "classification", 1 - mean(tune_rf.model$confusion[, "class.error"]),
stats::cor(as.numeric(as.vector(tune_rf.model$predicted)), inner_trn.pheno)^2)
# test
inner_test.pred <- stats::predict(tune_rf.model, newdata = inner_tst.data)
inner_Test_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(inner_test.pred), as.factor(inner_tst.pheno))$byClass["Balanced Accuracy"],
stats::cor(as.numeric(as.vector(inner_test.pred)), inner_tst.pheno)^2)
inner_accs <- rbind(inner_accs, data.frame(train=inner_Train_accu, test=inner_Test_accu))
}
} else if (wrapper == "relief" && is.null(tune.inner_selection_percent)) {
ranked_vars <- CORElearn::attrEval(label, train.ds[inner_idx, ],
estimator = importance.algorithm,
costMatrix = NULL,
outputNumericSplits=FALSE,
kNearestEqual = ifelse(!is.null(tuneK), ktuned, k))
} else if (wrapper == "rf") {
ranfor.fit <- randomForest::randomForest(as.factor(label) ~ ., data = train.ds[inner_idx, ], ntree = num_tree)
rf.importance <- importance(ranfor.fit)
rf.sorted<-sort(rf.importance, decreasing=T, index.return=T)
rf.df <-data.frame(att=rownames(rf.importance)[rf.sorted$ix],rf.scores=rf.sorted$x)
ranked_vars <- as.character(rf.df[,"att"])
} else if (wrapper == "glmnet") {
if(method.model == "classification"){
Class <- as.factor(train.ds[inner_idx, ncol(train.ds)])
}
glm_data <- data.frame(train.ds[inner_idx, -ncol(train.ds)], Class)
ranked_glmnet <- Rinbix::rankGlmnet(glm_data)
ranked_vars <- ranked_glmnet$score
names(ranked_vars) <- ranked_glmnet$variable
} else if (wrapper == "ttest") {
t_test_pvals <- vector(mode = "numeric", length = num_vars)
names(t_test_pvals) <- var_names
for (var_idx in 1:num_vars) {
t_test_pvals[var_idx] <- stats::t.test(train.ds[inner_idx, var_idx] ~ train.ds[inner_idx, ncol(train.ds)])$p.value
}
ranked_vars <- t_test_pvals
} else if (wrapper == "PageRank") {
Adj_mat <- ifelse(cor(train.ds[inner_idx, -ncol(train.ds)]) > 0, 1, 0)
diag(Adj_mat) <- 0
ranked_vars <- Rinbix::PageRank(Adj_mat)[, 1]
} else if (wrapper == "Katz") {
Adj_mat <- ifelse(cor(train.ds[inner_idx, -ncol(train.ds)]) > 0, 1, 0)
a <- eigen(Adj_mat)
beta <- rep(1, nrow(Adj_mat))/nrow(Adj_mat)
alpha <- 1/max(a$values) - (1/max(a$values))/100
ranked_vars <- Rinbix::EpistasisKatz(Adj_mat, alpha, beta)
names(ranked_vars) <- colnames(Adj_mat)
} else if (wrapper == "EpistasisKatz") {
if(method.model == "classification"){
Class <- as.factor(train.ds[inner_idx, ncol(train.ds)])
}
regain_data <- data.frame(train.ds[inner_idx, -ncol(train.ds)], Class)
regain_mat <- Rinbix::regainParallel(regain_data, regressionFamily = ifelse(method.model == "classification",
"binomial", "gaussian"))
alpha <- 1/mean(colSums(regain_mat))
beta <- diag(regain_mat)
diag(regain_mat) <- 0
ranked_vars <- Rinbix::EpistasisKatz(regain_mat, alpha, beta)
names(ranked_vars) <- colnames(regain_mat)
} else if (wrapper == "EpistasisRank") {
if(method.model == "classification"){
Class <- as.factor(train.ds[inner_idx, ncol(train.ds)])
}
regain_data <- data.frame(train.ds[inner_idx, -ncol(train.ds)], Class)
regain_mat <- Rinbix::regainParallel(regain_data, regressionFamily = ifelse(method.model == "classification",
"binomial", "gaussian"))
er_rank <- Rinbix::EpistasisRank(regain_mat, Gamma_vec = .85)
ranked_vars <- er_rank$ER
names(ranked_vars) <- er_rank$gene
}
if (wrapper == "relief" && inner_selection_positivescores){
top_vars <- names(which(sort(ranked_vars, decreasing = TRUE)>0))
} else if (wrapper == "relief" && !is.null(inner_selection_percent)){
wrapper.topN <- inner_selection_percent*length(ranked_vars)
top_vars <- names(sort(ranked_vars, decreasing = TRUE)[1:wrapper.topN])
} else if (wrapper == "relief" && !is.null(tune.inner_selection_percent)){
# find optimal percentage
inner_accs$trade_off <- abs(inner_accs$train-inner_accs$test)
inner_accs <- inner_accs[order(inner_accs$trade_off), ]
inner_opt_idx <- as.integer(rownames(inner_accs)[1])
# selected top optimal percentage genes
wrapper.topN <- round(tune.inner_selection_percent[inner_opt_idx]*length(ranked_vars))
top_vars <- names(sort(ranked_vars, decreasing = TRUE)[1:wrapper.topN])
} else if (wrapper == "rf"){
if(inner_selection_positivescores==F){
wrapper.topN <- round(inner_selection_percent*length(ranked_vars))
top_vars <- ranked_vars[1:wrapper.topN]
}else{
top_vars <- names(ranked_vars.num[which(ranked_vars.num>0)])
}
} else {
# wrapper.topN is not set on this path, so derive it as in regular_nestedCV
wrapper.topN <- round(inner_selection_percent*length(ranked_vars))
num_ranked_vars <- length(ranked_vars)
if (num_ranked_vars < wrapper.topN) {
cat("WARNING wrapper selected fewer variables than the specified top N:", wrapper.topN)
cat(" setting top N to the length of the selection:", num_ranked_vars, "\n")
wrapper.topN <- num_ranked_vars
}
if (num_ranked_vars < 1) {
cat("No variables were selected:", num_ranked_vars, "\n")
} else {
if (wrapper == "ttest") {
top_vars <- names(sort(ranked_vars, decreasing = FALSE)[1:wrapper.topN])
} else {
top_vars <- names(sort(ranked_vars, decreasing = TRUE)[1:wrapper.topN])
}
}
}
if(!is.null(covars_vec)){
cat("Adjusting for covariates...\n")
train.data_fltr <- train.ds[inner_idx, c(top_vars, label)]
inner_covars_vec <- covars_vec[inner_idx]
npdr.results <- npdr::npdr(label, train.data_fltr,
regression.type='glm', attr.diff.type='numeric-abs',
nbd.method='multisurf', nbd.metric = 'manhattan',
covars=inner_covars_vec, # works with sex.covar.mat as well
covar.diff.type='match-mismatch', # for categorical covar like sex
msurf.sd.frac=0.5, padj.method='bonferroni')
top_vars <- npdr.results[npdr.results$pval.adj<covars.pval.adj, "att"]
}
atts[[j]] <- top_vars
}
relief_atts[[i]] <- Reduce(intersect, atts)
if(param.tune){
outer_idx <- which(outer_folds!=i)
trn.data <- as.matrix(train.ds[outer_idx, relief_atts[[i]]])
tst.data <- as.matrix(train.ds[-outer_idx, relief_atts[[i]]])
if(method.model == "classification"){
trn.pheno <- as.factor(train.ds[, label][outer_idx])
tst.pheno <- as.factor(train.ds[, label][-outer_idx])
} else {
trn.pheno <- train.ds[, label][outer_idx]
tst.pheno <- train.ds[, label][-outer_idx]
}
if(verbose){cat("\n Parameter tuning...\n")}
train_model <- caret::train(x = trn.data,
y = trn.pheno,
method = learning_method,
metric = ifelse(is.factor(trn.pheno), "Accuracy", "RMSE"),
trControl = caret::trainControl(method = "cv"),
tuneGrid = tuneGrid)
train_pred <- stats::predict(train_model, trn.data)
train_acc <- ifelse(method.model == "classification",
confusionMatrix(train_pred, trn.pheno)$byClass["Balanced Accuracy"],
stats::cor(trn.pheno, train_pred)^2)
test_pred <- stats::predict(train_model, tst.data)
test_acc <- ifelse(method.model == "classification",
confusionMatrix(test_pred, tst.pheno)$byClass["Balanced Accuracy"],
stats::cor(tst.pheno, test_pred)^2)
accu <- abs(train_acc-test_acc)
tune_params <- rbind(tune_params, data.frame(train_model$bestTune, accu))
}
}
nCV_atts <- Reduce(intersect, relief_atts)
if(identical(nCV_atts, character(0))){stop("There is no consensus feature!")}
if(param.tune) {
tuneParam <- tune_params[which.min(tune_params$accu), ]
}
if(verbose){cat("\n Validating...\n")}
train.data <- as.matrix(train.ds[, nCV_atts])
if(!is.null(validation.ds)) {
test.data <- as.matrix(validation.ds[, nCV_atts])
}
if(method.model == "classification"){
train.pheno <- as.integer(train.ds[, label])
if(!is.null(validation.ds)) {
test.pheno <- as.integer(validation.ds[, label])
}
} else if (method.model == "regression"){
train.pheno <- train.ds[, label]
if(!is.null(validation.ds)) {
test.pheno <- validation.ds[, label]
}
}
if(verbose){cat("\n Perform [",learning_method,"]\n")}
if(learning_method == "glmnet"){
# glmnet - train
if(param.tune){alpha = tuneParam$alpha; lambda = tuneParam$lambda
train.model <- glmnet::glmnet(train.data, train.pheno, family = ifelse(method.model == "classification", "binomial", "gaussian"),
alpha = alpha, lambda = lambda)
train.pred <- stats::predict(train.model, train.data, s = lambda, type = ifelse(method.model == "classification","class", "response"))
}else{
train.model <- glmnet::glmnet(train.data, train.pheno, family = ifelse(method.model == "classification", "binomial", "gaussian"))
train.pred <- stats::predict(train.model, train.data, s = min(train.model$lambda), type = ifelse(method.model == "classification","class", "response"))
}
Train_accu <- ifelse(method.model == "classification" , confusionMatrix(as.factor(train.pred),
as.factor(train.pheno))$byClass["Balanced Accuracy"],
stats::cor(as.numeric(train.pred), train.pheno)^2)
# glmnet - test
if(is.null(validation.ds)){
Test_accu <- NA
} else {
if(param.tune){
test.pred <- stats::predict(train.model, test.data, s = lambda, type = "class")
} else {
test.pred <- stats::predict(train.model, test.data, s = min(train.model$lambda), type = "class")
}
Test_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(test.pred),
as.factor(test.pheno))$byClass["Balanced Accuracy"],
stats::cor(test.pred, test.pheno)^2)
}
} else if(learning_method == "xgbTree"){
# xgboost - train
dtrain <- xgboost::xgb.DMatrix(data = train.data, label = ifelse(train.pheno == 1, 0, 1))
if(param.tune){
shrinkage = tuneParam$eta; max_depth = tuneParam$max_depth; gamma = tuneParam$gamma
subsample = tuneParam$subsample; colsample_bytree = tuneParam$colsample_bytree
min_child_weight = tuneParam$min_child_weight
} else {
shrinkage = 0.3; max_depth = 2; gamma = 0; subsample = 1; colsample_bytree = 1; min_child_weight = 1
}
train.model <- xgboost::xgboost(data = dtrain,
eta = shrinkage,
nrounds = 2,
max_depth = max_depth,
gamma = gamma,
subsample = subsample,
colsample_bytree = colsample_bytree,
min_child_weight = min_child_weight,
objective = ifelse(method.model == "classification", "binary:logistic", "reg:squarederror"))
train.pred.prob <- stats::predict(train.model, dtrain)
train.pred.bin <- as.numeric(train.pred.prob > 0.5)
Train_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(ifelse(train.pred.bin == 0, 1, 2)),
as.factor(train.pheno))$byClass["Balanced Accuracy"],
stats::cor(train.pred.prob, train.pheno)^2)
# xgboost - test
if(is.null(validation.ds)){
Test_accu <- NA
} else {
# build the test DMatrix only when a validation set exists
dtest <- xgboost::xgb.DMatrix(data = test.data, label = ifelse(test.pheno == 1, 0, 1))
test.pred.prob <- stats::predict(train.model, dtest)
test.pred.bin <- as.numeric(test.pred.prob > 0.5)
Test_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(ifelse(test.pred.bin == 0, 1, 2)),
as.factor(test.pheno))$byClass["Balanced Accuracy"],
stats::cor(test.pred.prob, test.pheno)^2)
}
} else if(learning_method == "rf") {
# random forest - train
train.model <- randomForest::randomForest(train.data,
y = if(method.model == "classification"){as.factor(train.pheno)}else{train.pheno},
# randomForest defaults: sqrt(p) for classification, p/3 for regression
mtry = if(param.tune && tuneParam$mtry > 1 && tuneParam$mtry < ncol(train.data))
{tuneParam$mtry} else if (method.model == "classification"){
floor(sqrt(ncol(train.data)))
} else {
max(floor(ncol(train.data)/3), 1)
},
ntree = num_tree)
Train_accu <- ifelse(method.model == "classification", 1 - mean(train.model$confusion[, "class.error"]),
stats::cor(as.numeric(as.vector(train.model$predicted)), train.pheno)^2)
# random forest - test
if(is.null(validation.ds)){
Test_accu <- NA
} else {
test.pred <- stats::predict(train.model, newdata = test.data)
Test_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(test.pred), as.factor(test.pheno))$byClass["Balanced Accuracy"],
stats::cor(as.numeric(as.vector(test.pred)), test.pheno)^2)
}
}
elapsed.time <- (proc.time() - ptm)[3]
if(verbose){cat("nestedCV elapsed time", elapsed.time, "\n")}
list(cv.acc = Train_accu, Validation = Test_accu, Features = nCV_atts, Train_model = train.model, Elapsed = elapsed.time)
}
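## A minimal sketch (not run) of consensus_nestedCV with parameter tuning.
## The tuneGrid below is a hypothetical caret grid for glmnet; its column
## names must match the tuning parameters of the chosen learning_method.
# tune.grid <- expand.grid(alpha = c(0, 0.5, 1),
#                          lambda = 10^seq(-3, 0, length.out = 5))
# cnCV.tuned <- consensus_nestedCV(train.ds = sim.data$train,
#                                  validation.ds = sim.data$holdout,
#                                  label = "class",
#                                  param.tune = TRUE,
#                                  learning_method = "glmnet",
#                                  tuneGrid = tune.grid)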
###########################################
##### Regular nested cross validation #####
#------------------------------------------
#' Regular nested cross validation for feature selection and parameter tuning
#' @param train.ds A training data frame with last column as outcome
#' @param validation.ds A validation data frame with last column as outcome
#' @param label A character vector of the outcome variable column name.
#' @param method.model Analysis type (string): "classification" or "regression". If the analysis goal is classification, make the outcome column a factor type.
#' For regression, make the outcome column numeric.
#' @param is.simulated A TRUE or FALSE flag indicating whether the data are simulated
#' @param ncv_folds A numeric vector to indicate nested cv folds: c(k_outer, k_inner)
#' @param param.tune A TRUE or FALSE flag for tuning parameters
#' @param learning_method Name of the method: glmnet/xgbTree/rf
#' @param xgb.obj xgboost objective function, e.g., "binary:logistic"
#' @param importance.algorithm A character vector containing a specific importance algorithm subtype
#' @param wrapper Feature selection algorithm: rf, glmnet, ttest, centrality methods (PageRank, Katz,
#' EpistasisRank, and EpistasisKatz from the Rinbix package), the ReliefF family, etc.
#' @param inner_selection_percent Percentage of features to be selected in each inner fold (default 0.2).
#' @param inner_selection_positivescores A TRUE or FALSE flag to select features with positive scores (if FALSE, the percentage method is used).
#' @param relief.k.method A character or numeric value indicating the number of nearest neighbors for the relief algorithm.
#' Possible characters are: k_half_sigma (floor((num.samp-1)*0.154)), m6 (floor(num.samp/6)),
#' myopic (floor((num.samp-1)/2)), and m4 (floor(num.samp/4))
#' @param tuneGrid A data frame with possible tuning values. The columns are named the same as the tuning parameters.
#' This is a caret library parameter; for more information refer to http://topepo.github.io/caret/available-models.html.
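#' For example, for learning_method = "rf" a plausible grid is expand.grid(mtry = c(2, 4, 8)); these values are illustrative, not defaults.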
#' @param num_tree Number of trees in random forest and xgboost methods
#' @param verbose A flag indicating whether verbose output should be sent to stdout
#' @return A list with:
#' \describe{
#' \item{cv.acc}{Training data accuracy}
#' \item{Validation}{Validation data accuracy}
#' \item{Features}{Features selected by the nested cross-validation}
#' \item{Train_model}{Training model to use for validation}
#' \item{Elapsed}{total elapsed time}
#' }
#' @examples
#' num.samples <- 100
#' num.variables <- 100
#' pct.signals <- 0.1
#' label <- "class"
#' sim.data <- createSimulation(num.samples = num.samples,
#' num.variables = num.variables,
#' pct.signals = pct.signals,
#' sim.type = "mainEffect",
#' label = label,
#' verbose = FALSE)
#' rnCV.results <- regular_nestedCV(train.ds = sim.data$train,
#' validation.ds = sim.data$holdout,
#' label = label,
#' is.simulated = TRUE,
#' ncv_folds = c(10, 10),
#' param.tune = FALSE,
#' learning_method = "rf",
#' importance.algorithm = "ReliefFbestK",
#' num_tree = 500,
#' verbose = FALSE)
#' @family nestedCV
#' @export
regular_nestedCV <- function(train.ds = NULL,
validation.ds = NULL,
label = "class",
method.model = "classification",
is.simulated = TRUE,
ncv_folds = c(10, 10),
param.tune = FALSE,
learning_method = "rf",
xgb.obj = "binary:logistic",
importance.algorithm = "ReliefFequalK",
wrapper = "relief",
inner_selection_percent = 0.2,
inner_selection_positivescores = TRUE,
relief.k.method = "k_half_sigma",
tuneGrid = NULL,
num_tree = 500,
verbose = FALSE){
# Define variables
relief_atts <- list(); tune_params <- NULL
Train_accu <- NULL; Test_accu <- NULL
ptm <- proc.time()
if(verbose){cat("\nRunning regular nested cross-validation...\n")}
outer_folds <- caret::createFolds(train.ds[, label], ncv_folds[1], list = FALSE)
for (i in 1:ncv_folds[1]){
inner_atts <- list()
inner_tune_params <- NULL
if(verbose){cat("\n Create [",ncv_folds[2],"] inner folds of outer fold[",i,"]\n")}
inner_folds <- caret::createFolds(train.ds[, label][outer_folds!=i], ncv_folds[2], list = TRUE)
if(verbose){cat("\n Feature Selection and Parameter Tuning...\n")}
for (j in 1:length(inner_folds)){
inner_idx <- which(outer_folds!=i)[-inner_folds[[j]]]
# If a numeric value is supplied for relief.k.method, use it for k,
# but make sure it is not larger than floor((num.samp-1)/2) for the
# current data split. If it exceeds that threshold, set k to the
# maximum and warn: "ReliefF k too large. Using maximum."
if (is.numeric(relief.k.method)) {
if (relief.k.method > floor((dim(train.ds[inner_idx, ])[1]-1)/2)){
warning("ReliefF k too large. Using maximum.")
k <- floor((dim(train.ds[inner_idx, ])[1]-1)/2)
} else {
k <- relief.k.method
}
} else if (relief.k.method == "myopic"){
k <- floor((dim(train.ds[inner_idx, ])[1]-1)/2)
# where k may change based on num.samp for train, holdout...
} else if (relief.k.method == "m6") { # default "m6" method
k <- floor(dim(train.ds[inner_idx, ])[1]/6)
# where k may change based on num.samp for train, holdout...
} else if (relief.k.method == "m4") {
k <- floor(dim(train.ds[inner_idx, ])[1]/4)
} else { # "k_half_sigma" (the default)
k <- floor((dim(train.ds[inner_idx, ])[1]-1)*0.154)
}
# sanity check on fold sizes: too many folds leaves too few samples per fold
if (sum(ncv_folds)>(dim(train.ds[inner_idx, ])[1])/3){
stop("There are fewer than three observations in each fold")
}
if (wrapper == "relief"){
ranked_vars <- CORElearn::attrEval(label, train.ds[inner_idx, ],
estimator = importance.algorithm,
costMatrix = NULL,
outputNumericSplits=FALSE,
kNearestEqual = k)
} else if (wrapper == "rf") {
rf_model <- CORElearn::CoreModel(label, train.ds[inner_idx, ], model = "rf")
ranked_vars <- CORElearn::rfAttrEval(rf_model)
} else if (wrapper == "glmnet") {
if(method.model == "classification"){
Class <- as.factor(train.ds[inner_idx, ncol(train.ds)])
}
glm_data <- data.frame(train.ds[inner_idx, -ncol(train.ds)], Class)
ranked_glmnet <- Rinbix::rankGlmnet(glm_data)
ranked_vars <- ranked_glmnet$score
names(ranked_vars) <- ranked_glmnet$variable
} else if (wrapper == "ttest") {
t_test_pvals <- vector(mode = "numeric", length = num_vars)
names(t_test_pvals) <- var_names
for (var_idx in 1:num_vars) {
t_test_pvals[var_idx] <- t.test(train.ds[inner_idx, var_idx] ~ train.ds[inner_idx, ncol(train.ds)])$p.value
}
ranked_vars <- t_test_pvals
} else if (wrapper == "PageRank") {
Adj_mat <- ifelse(cor(train.ds[inner_idx, -ncol(train.ds)]) > 0, 1, 0)
diag(Adj_mat) <- 0
ranked_vars <- Rinbix::PageRank(Adj_mat)[, 1]
} else if (wrapper == "Katz") {
Adj_mat <- ifelse(cor(train.ds[inner_idx, -ncol(train.ds)]) > 0, 1, 0)
a <- eigen(Adj_mat)
beta <- rep(1, nrow(Adj_mat))/nrow(Adj_mat)
alpha <- 1/max(a$values) - (1/max(a$values))/100
ranked_vars <- Rinbix::EpistasisKatz(Adj_mat, alpha, beta)
names(ranked_vars) <- colnames(Adj_mat)
} else if (wrapper == "EpistasisKatz") {
if(method.model == "classification"){
Class <- as.factor(train.ds[inner_idx, ncol(train.ds)])
}
regain_data <- data.frame(train.ds[inner_idx, -ncol(train.ds)], Class)
regain_mat <- Rinbix::regainParallel(regain_data, regressionFamily = ifelse(method.model == "classification",
"binomial", "gaussian"))
alpha <- 1/mean(colSums(regain_mat))
beta <- diag(regain_mat)
diag(regain_mat) <- 0
ranked_vars <- Rinbix::EpistasisKatz(regain_mat, alpha, beta)
names(ranked_vars) <- colnames(regain_mat)
} else if (wrapper == "EpistasisRank") {
if(method.model == "classification"){
Class <- as.factor(train.ds[inner_idx, ncol(train.ds)])
}
regain_data <- data.frame(train.ds[inner_idx, -ncol(train.ds)], Class)
regain_mat <- Rinbix::regainParallel(regain_data, regressionFamily = ifelse(method.model == "classification",
"binomial", "gaussian"))
er_rank <- Rinbix::EpistasisRank(regain_mat, Gamma_vec = .85)
ranked_vars <- er_rank$ER
names(ranked_vars) <- er_rank$gene
}
wrapper.topN <- round(inner_selection_percent*length(ranked_vars))
if (wrapper == "relief" && inner_selection_positivescores){
top_vars <- names(which(sort(ranked_vars, decreasing = TRUE)>0))
} else if (wrapper == "relief" && !inner_selection_positivescores){
top_vars <- names(sort(ranked_vars, decreasing = TRUE)[1:wrapper.topN])
} else {
num_ranked_vars <- length(ranked_vars)
if (num_ranked_vars < wrapper.topN) {
cat("WARNING wrapper selected fewer variables than the specified top N:", wrapper.topN)
cat(" setting top N to the length of the selection:", num_ranked_vars, "\n")
wrapper.topN <- num_ranked_vars
}
if (num_ranked_vars < 1) {
cat("No variables were selected:", num_ranked_vars, "\n")
} else {
if (wrapper == "ttest") {
top_vars <- names(sort(ranked_vars, decreasing = FALSE)[1:wrapper.topN])
} else {
top_vars <- names(sort(ranked_vars, decreasing = TRUE)[1:wrapper.topN])
}
}
}
inner_trn.data <- as.matrix(train.ds[inner_idx, top_vars])
inner_tst.data <- as.matrix(train.ds[-inner_idx, top_vars])
if(method.model == "classification"){
inner_trn.pheno <- as.factor(train.ds[, label][inner_idx])
inner_tst.pheno <- as.factor(train.ds[, label][-inner_idx])
} else {
inner_trn.pheno <- train.ds[, label][inner_idx]
inner_tst.pheno <- train.ds[, label][-inner_idx]
}
inner_train_model <- caret::train(x = inner_trn.data,
y = inner_trn.pheno,
method = learning_method,
metric = ifelse(is.factor(inner_trn.pheno), "Accuracy", "RMSE"),
trControl = caret::trainControl(method = "cv"),
tuneGrid = tuneGrid)
inner_train_pred <- stats::predict(inner_train_model, inner_trn.data)
inner_train_acc <- ifelse(method.model == "classification",confusionMatrix(inner_train_pred, inner_trn.pheno)$byClass["Balanced Accuracy"],
stats::cor(inner_trn.pheno, inner_train_pred)^2)
inner_test_pred <- stats::predict(inner_train_model, inner_tst.data)
inner_test_acc <- ifelse(method.model == "classification",confusionMatrix(inner_test_pred, inner_tst.pheno)$byClass["Balanced Accuracy"],
stats::cor(inner_tst.pheno, inner_test_pred)^2)
# store tuned parameters
inner_accu <- abs(inner_train_acc-inner_test_acc)
inner_tune_params <- rbind(inner_tune_params, data.frame(inner_train_model$bestTune, inner_accu))
#store features
inner_atts[[j]] <- top_vars
}
relief_atts[[i]] <- inner_atts[[which.min(inner_tune_params$inner_accu)]]
outer_tuneParam <- inner_tune_params[which.min(inner_tune_params$inner_accu), ]
outer_idx <- which(outer_folds!=i)
outer_train.data <- as.matrix(train.ds[outer_idx, relief_atts[[i]]])
outer_test.data <- as.matrix(train.ds[-outer_idx, relief_atts[[i]]])
if(method.model == "classification"){
outer_train.pheno <- as.integer(train.ds[outer_idx, label])
outer_test.pheno <- as.integer(train.ds[-outer_idx, label])
} else if (method.model == "regression"){
outer_train.pheno <- train.ds[outer_idx, label]
outer_test.pheno <- train.ds[-outer_idx, label]
}
if(verbose){cat("\n Perform [",learning_method,"]\n")}
if(learning_method == "glmnet"){
# glmnet - train
alpha = outer_tuneParam$alpha; lambda = outer_tuneParam$lambda
outer_glmnet.model <- glmnet::glmnet(outer_train.data, outer_train.pheno, family = ifelse(method.model == "classification", "binomial", "gaussian"),
alpha = alpha, lambda = lambda)
outer_train.pred <- stats::predict(outer_glmnet.model, outer_train.data, s = lambda, type = ifelse(method.model == "classification","class", "response"))
outer_Train_accu <- ifelse(method.model == "classification" , confusionMatrix(as.factor(outer_train.pred),
as.factor(outer_train.pheno))$byClass["Balanced Accuracy"],
stats::cor(as.numeric(outer_train.pred), outer_train.pheno)^2)
# test
outer_test.pred <- stats::predict(outer_glmnet.model, outer_test.data, s = lambda, type = "class")
outer_Test_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(outer_test.pred),
as.factor(outer_test.pheno))$byClass["Balanced Accuracy"],
stats::cor(outer_test.pred, outer_test.pheno)^2)
} else if(learning_method == "xgbTree"){
# xgboost - train
outer_dtrain <- xgboost::xgb.DMatrix(data = outer_train.data, label = ifelse(outer_train.pheno == 1, 0, 1))
outer_dtest <- xgboost::xgb.DMatrix(data = outer_test.data, label = ifelse(outer_test.pheno == 1, 0, 1))
# tuned parameters
shrinkage = outer_tuneParam$eta; max_depth = outer_tuneParam$max_depth; gamma = outer_tuneParam$gamma
subsample = outer_tuneParam$subsample; colsample_bytree = outer_tuneParam$colsample_bytree
min_child_weight = outer_tuneParam$min_child_weight
outer_xgb.model <- xgboost::xgboost(data = outer_dtrain,
eta = shrinkage,
nrounds = 2,
max_depth = max_depth,
gamma = gamma,
subsample = subsample,
colsample_bytree = colsample_bytree,
min_child_weight = min_child_weight,
objective = xgb.obj)
outer_train.pred.prob <- stats::predict(outer_xgb.model, outer_dtrain)
outer_train.pred.bin <- as.numeric(outer_train.pred.prob > 0.5)
outer_Train_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(ifelse(outer_train.pred.bin == 0, 1, 2)),
as.factor(outer_train.pheno))$byClass["Balanced Accuracy"],
stats::cor(outer_train.pred.prob, outer_train.pheno)^2)
# test
outer_test.pred.prob <- stats::predict(outer_xgb.model, outer_dtest)
outer_test.pred.bin <- as.numeric(outer_test.pred.prob > 0.5)
outer_Test_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(ifelse(outer_test.pred.bin == 0, 1, 2)),
as.factor(outer_test.pheno))$byClass["Balanced Accuracy"],
stats::cor(outer_test.pred.prob, outer_test.pheno)^2)
} else if(learning_method == "rf") {
# random forest - train
outer_rf.model <- randomForest::randomForest(outer_train.data,
y = if(method.model == "classification"){as.factor(outer_train.pheno)}else{outer_train.pheno},
# randomForest defaults: sqrt(p) for classification, p/3 for regression
mtry = if(outer_tuneParam$mtry > 1 && outer_tuneParam$mtry < ncol(outer_train.data))
{outer_tuneParam$mtry} else if (method.model == "classification"){
floor(sqrt(ncol(outer_train.data)))
} else {
max(floor(ncol(outer_train.data)/3), 1)
},
ntree = num_tree)
outer_Train_accu <- ifelse(method.model == "classification", 1 - mean(outer_rf.model$confusion[, "class.error"]),
stats::cor(as.numeric(as.vector(outer_rf.model$predicted)), outer_train.pheno)^2)
# test
outer_test.pred <- stats::predict(outer_rf.model, newdata = outer_test.data)
outer_Test_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(outer_test.pred), as.factor(outer_test.pheno))$byClass["Balanced Accuracy"],
stats::cor(as.numeric(as.vector(outer_test.pred)), outer_test.pheno)^2)
}
# store tuned parameters
outer_accu <- abs(outer_Train_accu-outer_Test_accu)
tune_params <- rbind(tune_params, data.frame(outer_tuneParam, outer_accu))
}
nCV_atts <- relief_atts[[which.min(tune_params$outer_accu)]]
if(identical(nCV_atts, character(0))){stop("No feature selected in nested CV loop!")}
tuneParam <- tune_params[which.min(tune_params$outer_accu), ]
if(verbose){cat("\n Validating...\n")}
train.data <- as.matrix(train.ds[, nCV_atts])
if(!is.null(validation.ds)) {
test.data <- as.matrix(validation.ds[, nCV_atts])
}
if(method.model == "classification"){
train.pheno <- as.integer(train.ds[, label])
if(!is.null(validation.ds)) {
test.pheno <- as.integer(validation.ds[, label])
}
} else if (method.model == "regression"){
train.pheno <- train.ds[, label]
if(!is.null(validation.ds)) {
test.pheno <- validation.ds[, label]
}
}
if(verbose){cat("\n Perform [",learning_method,"]\n")}
if(learning_method == "glmnet"){
# glmnet - train
if(param.tune){alpha = tuneParam$alpha; lambda = tuneParam$lambda
train.model <- glmnet::glmnet(train.data, train.pheno, family = ifelse(method.model == "classification", "binomial", "gaussian"),
alpha = alpha, lambda = lambda)
train.pred <- stats::predict(train.model, train.data, s = lambda, type = ifelse(method.model == "classification","class", "response"))
}else{
train.model <- glmnet::glmnet(train.data, train.pheno, family = ifelse(method.model == "classification", "binomial", "gaussian"))
train.pred <- stats::predict(train.model, train.data, s = min(train.model$lambda), type = ifelse(method.model == "classification","class", "response"))
}
Train_accu <- ifelse(method.model == "classification" , confusionMatrix(as.factor(train.pred),
as.factor(train.pheno))$byClass["Balanced Accuracy"],
stats::cor(as.numeric(train.pred), train.pheno)^2)
# glmnet - test
if(is.null(validation.ds)){
Test_accu <- NA
} else {
if(param.tune){
test.pred <- stats::predict(train.model, test.data, s = lambda, type = "class")
} else {
test.pred <- stats::predict(train.model, test.data, s = min(train.model$lambda), type = "class")
}
Test_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(test.pred),
as.factor(test.pheno))$byClass["Balanced Accuracy"],
stats::cor(test.pred, test.pheno)^2)
}
} else if(learning_method == "xgbTree"){
# xgboost - train
dtrain <- xgboost::xgb.DMatrix(data = train.data, label = ifelse(train.pheno == 1, 0, 1))
if(param.tune){
shrinkage = tuneParam$eta; max_depth = tuneParam$max_depth; gamma = tuneParam$gamma
subsample = tuneParam$subsample; colsample_bytree = tuneParam$colsample_bytree
min_child_weight = tuneParam$min_child_weight
} else {
shrinkage = 0.3; max_depth = 2; gamma = 0; subsample = 1; colsample_bytree = 1; min_child_weight = 1
}
train.model <- xgboost::xgboost(data = dtrain,
eta = shrinkage,
nrounds = 2,
max_depth = max_depth,
gamma = gamma,
subsample = subsample,
colsample_bytree = colsample_bytree,
min_child_weight = min_child_weight,
objective = xgb.obj)
train.pred.prob <- stats::predict(train.model, dtrain)
train.pred.bin <- as.numeric(train.pred.prob > 0.5)
Train_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(ifelse(train.pred.bin == 0, 1, 2)),
as.factor(train.pheno))$byClass["Balanced Accuracy"],
stats::cor(train.pred.prob, train.pheno)^2)
# xgboost - test
if(is.null(validation.ds)){
Test_accu <- NA
} else {
# build the test DMatrix only when a validation set exists
dtest <- xgboost::xgb.DMatrix(data = test.data, label = ifelse(test.pheno == 1, 0, 1))
test.pred.prob <- stats::predict(train.model, dtest)
test.pred.bin <- as.numeric(test.pred.prob > 0.5)
Test_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(ifelse(test.pred.bin == 0, 1, 2)),
as.factor(test.pheno))$byClass["Balanced Accuracy"],
stats::cor(test.pred.prob, test.pheno)^2)
}
} else if(learning_method == "rf") {
# random forest - train
train.model <- randomForest::randomForest(train.data,
y = if(method.model == "classification"){as.factor(train.pheno)}else{train.pheno},
# randomForest defaults: sqrt(p) for classification, p/3 for regression
mtry = if(param.tune && tuneParam$mtry > 1 && tuneParam$mtry < ncol(train.data))
{tuneParam$mtry} else if (method.model == "classification"){
floor(sqrt(ncol(train.data)))
} else {
max(floor(ncol(train.data)/3), 1)
},
ntree = num_tree)
Train_accu <- ifelse(method.model == "classification", 1 - mean(train.model$confusion[, "class.error"]),
stats::cor(as.numeric(as.vector(train.model$predicted)), train.pheno)^2)
# random forest - test
if(is.null(validation.ds)){
Test_accu <- NA
} else {
test.pred <- stats::predict(train.model, newdata = test.data)
Test_accu <- ifelse(method.model == "classification", confusionMatrix(as.factor(test.pred), as.factor(test.pheno))$byClass["Balanced Accuracy"],
stats::cor(as.numeric(as.vector(test.pred)), test.pheno)^2)
}
}
elapsed.time <- (proc.time() - ptm)[3]
if(verbose){cat("nestedCV elapsed time", elapsed.time, "\n")}
list(cv.acc = Train_accu, Validation = Test_accu, Features = nCV_atts, Train_model = train.model, Elapsed = elapsed.time)
}
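## A minimal sketch (not run) of regular_nestedCV with an xgbTree learner.
## The grid values are hypothetical; the columns are the tuning parameters
## caret expects for method = "xgbTree".
# xgb.grid <- expand.grid(nrounds = 2, eta = c(0.1, 0.3), max_depth = c(2, 4),
#                         gamma = 0, colsample_bytree = 1, min_child_weight = 1,
#                         subsample = 1)
# rnCV.xgb <- regular_nestedCV(train.ds = sim.data$train,
#                              validation.ds = sim.data$holdout,
#                              label = "class",
#                              learning_method = "xgbTree",
#                              xgb.obj = "binary:logistic",
#                              tuneGrid = xgb.grid)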
#################################################
##### Functional attributes status detector #####
#------------------------------------------------
#' Given a vector of functional (true) attributes and a vector of positive (selected) associations,
#' returns detection statistics such as true positive rate, precision, and recall.
#' @param functional A vector of functional (true) attributes
#' @param positives A vector of positive (selected) associations
#' @return A list with:
#' \describe{
#' \item{TP}{True Positive}
#' \item{FP}{False Positive}
#' \item{FN}{False Negative}
#' \item{TPR}{True Positive Rate}
#' \item{FPR}{False Positive Rate}
#' \item{precision}{Fraction of relevant features among the retrieved features}
#' \item{recall}{Fraction of total relevant features that were actually retrieved}
#' }
#' @examples
#' num.samples <- 100
#' num.variables <- 100
#' pct.signals <- 0.1
#' label <- "class"
#' sim.data <- createSimulation(num.samples = num.samples,
#' num.variables = num.variables,
#' pct.signals = pct.signals,
#' sim.type = "mainEffect",
#' label = label,
#' verbose = FALSE)
#' rnCV.results <- regular_nestedCV(train.ds = sim.data$train,
#' validation.ds = sim.data$holdout,
#' label = label,
#' is.simulated = TRUE,
#' ncv_folds = c(10, 10),
#' param.tune = FALSE,
#' learning_method = "rf",
#' importance.algorithm = "ReliefFbestK",
#' num_tree = 500,
#' verbose = FALSE)
#' functional <- sim.data$signal.names
#' rncv.positives <- rnCV.results$Features
#' rncv.detect <- functionalDetectionStats(functional, rncv.positives)
#' @family nestedCV
#' @export
functionalDetectionStats <- function(functional, positives){
TP <- sum(positives %in% functional)
FP <- sum(!(positives %in% functional))
FN <- length(functional) - TP
precision <- TP/(TP+FP)
recall <- TP/(TP+FN)
num.positives <- length(positives)
TPR <- TP/num.positives # fraction of selected features that are functional
FPR <- FP/num.positives # fraction of selected features that are not functional
return(list(TP=TP, FP=FP, FN=FN, TPR=TPR, FPR=FPR, precision=precision, recall=recall))
}
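## A minimal sketch (not run) with toy vectors to illustrate the counts:
# functional <- c("var1", "var2", "var3")  # hypothetical true signals
# positives  <- c("var2", "var3", "var9")  # hypothetical selected features
# functionalDetectionStats(functional, positives)
# # TP = 2, FP = 1, FN = 1, precision = 2/3, recall = 2/3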