#' Model validation by cross-validation with optional random-feature and permutation tests.
#'
#' @param X n_samples x n_features data matrix
#' @param y vector of labels, one entry per row of X
#' @param method list with functions train, predict, score, and optionally select
#' @param options list of options: n_folds (default 5), rf_trials, pt_trials,
#'   compare_pred ("y" or "y_perm"), paired, X_label
#'
#' @return list with cross-validation predictions (cv_y) and scores (cv_score),
#'   plus random-feature scores (rf_scores) and permutation-test scores
#'   (pt_scores) when the corresponding trials are requested
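#'
#' @examples
#' \dontrun{
#' # a minimal sketch, not taken from the package docs: the method
#' # functions below are illustrative stand-ins for a user's own
#' # train/predict/score triple (here a nearest-centroid classifier)
#' method <- list(
#'   train = function(X, y) list(
#'     mu0 = colMeans(X[y == 0, , drop = FALSE]),
#'     mu1 = colMeans(X[y == 1, , drop = FALSE])
#'   ),
#'   predict = function(model, X) {
#'     d0 <- rowSums(sweep(X, 2, model$mu0)^2)
#'     d1 <- rowSums(sweep(X, 2, model$mu1)^2)
#'     as.numeric(d1 < d0) # assign the class of the nearer centroid
#'   },
#'   score = function(y, y_pred) mean(y == y_pred) # accuracy
#' )
#' X <- matrix(rnorm(100 * 10), 100, 10,
#'             dimnames = list(NULL, paste0("f", 1:10)))
#' y <- rbinom(100, 1, 0.5)
#' res <- validate(X, y, method,
#'                 options = list(n_folds = 5, pt_trials = 100))
#' res$cv_score # cross-validated accuracy
#' }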
#' @export
#'
validate <- function(X, y, method, options) {
# ----------------- INITIAL PROCESSING ----------------- #
# see if a feature selector was passed; otherwise default to
# selecting all features (this requires X to have column names).
# in the latter case random features make no sense, so force
# rf_trials = 0
if ("select" %in% names(method)) {
select <- method$select
} else {
select <- function(X, y) colnames(X)
if (("rf_trials" %in% names(options)) && options$rf_trials != 0) {
message("Warning in validate():")
message("  no feature selector given but rf_trials != 0")
message("  forcing rf_trials = 0")
}
options$rf_trials <- 0
}
# default to zero random-feature and permutation trials,
# so the corresponding tests are skipped unless requested
if (!("rf_trials" %in% names(options))) {
options$rf_trials <- 0
}
if (!("pt_trials" %in% names(options))) {
options$pt_trials <- 0
}
# default to five-fold cross-validation
if (!("n_folds" %in% names(options))) {
options$n_folds <- 5
}
# compare_pred selects which labels the predictions are scored
# against during permutation testing: the true labels ("y") or
# the permuted labels ("y_perm")
if (!("compare_pred" %in% names(options))) {
options$compare_pred <- "y"
} else {
if (!(options$compare_pred %in% c("y", "y_perm"))) {
stop("options$compare_pred needs to be \"y\" or \"y_perm\"")
}
}
# for paired data, take structure into account
if (!("paired" %in% names(options))) {
options$paired <- FALSE
}
if (options$paired) {
if (!("X_label" %in% names(options))) {
stop("X_label needs to be provided to take into account the paired structure in cross-validation and permutation testing.")
}
}
# bind shorter names for the method functions
train <- method$train
predict <- method$predict
score <- method$score
# add return values to this list as we go along
return_values <- list()
# stores the number of features selected for each fold
# during cross-validation. we need this later for the
# random features test
feats_per_fold <- list()
# if score is a single function instead of a list of
# functions, wrap it in a list. is.list() is used rather
# than comparing class(score), which can have length > 1
if (!is.list(score)) {
score <- list(score)
}
# ----------------- END INITIAL PROCESSING ----------------- #
# ----------------- BEGIN CROSS-VALIDATION ----------------- #
# split data into folds
if (options$paired) {
folds <- caret::createFolds(seq(1, nrow(X)/2), options$n_folds)
fold_names <- names(folds)
for (fname in fold_names) {
folds[[fname]] <- which(options$X_label %in% options$X_label[folds[[fname]]])
}
} else {
folds <- caret::createFolds(y, options$n_folds)
fold_names <- names(folds)
}
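# note: caret::createFolds() returns the held-out (test) indices of
# each fold; in the paired case each fold is expanded to every row
# whose options$X_label matches a selected pair, which assumes each
# label in options$X_label appears exactly twice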
# vector of cross-validation predictions, initialized from y
# so it has the correct type and length
y_pred <- y
for (fname in fold_names) {
indices <- folds[[fname]]
X_train <- X[-indices, , drop = FALSE]
y_train <- y[-indices]
X_pred <- X[indices, , drop = FALSE]
features <- select(X_train, y_train)
# minimal sanity check; ideally we would also verify that
# select() returned valid column names or indices of X
if (length(features) == 0) {
stop("method$select() did not return any features")
}
# store number of features selected in fold for later
feats_per_fold[[fname]] <- length(features)
model <- train(as.matrix(X_train[, features, drop = FALSE]), y_train)
y_pred[indices] <- predict(model, as.matrix(X_pred[, features, drop = FALSE]))
}
return_values$cv_y <- y_pred
# apply each score function to (y, y_pred)
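# note: f_star resolves y and y_pred in this function's environment
# at call time, so the same closure also scores the random-feature
# and permutation predictions once y_pred is overwritten below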
f_star <- function(f) {f(y, y_pred)}
return_values$cv_score <- lapply(score, f_star)
# ----------------- END CROSS-VALIDATION ----------------- #
# ----------------- BEGIN RANDOM FEATURES ----------------- #
n_trials <- options$rf_trials
if (n_trials > 0) {
n_scores <- length(score)
# one numeric vector of per-trial scores per score function
rf_scores <- rep(list(numeric(n_trials)), n_scores)
for (trial in 1:n_trials) {
for (fname in fold_names) {
indices <- folds[[fname]]
X_train <- X[-indices, , drop = FALSE]
y_train <- y[-indices]
X_pred <- X[indices, , drop = FALSE]
# select as many random features as select() picked in this
# fold during cross-validation. sample.int() sidesteps the
# sample() pathology where a length-one first argument is
# treated as the upper bound of a range
total_features <- ncol(X_train)
features <- sample.int(total_features, feats_per_fold[[fname]])
model <- train(as.matrix(X_train[, features, drop = FALSE]), y_train)
y_pred[indices] <- predict(model, as.matrix(X_pred[, features, drop = FALSE]))
}
# compute list of scores
score_list <- lapply(score, f_star)
# assign them to vectors in the list
rf_scores <- lv_assign(rf_scores, score_list, trial)
}
return_values$rf_scores <- rf_scores
}
# ----------------- END RANDOM FEATURES ----------------- #
# ----------------- BEGIN PERMUTATION TESTING ----------------- #
n_trials <- options$pt_trials
if (n_trials > 0) {
n_scores <- length(score)
pt_scores <- rep(list(numeric(n_trials)), n_scores)
y_perm <- y
for (trial in 1:n_trials) {
if (options$paired) { # create permuted y: flip each pair with 50% probability
for (fname in fold_names) {
indices <- folds[[fname]]
flip <- runif(length(indices) / 2) > 0.5
flip_pair_labels <- unique(options$X_label[indices])[flip]
y_perm[indices] <- y[indices]
for (pair_label in flip_pair_labels) {
pair_rows <- which(options$X_label == pair_label)
y_perm[pair_rows] <- y[rev(pair_rows)]
}
}
} else { # create permuted y, permuting only within each fold
for (fname in fold_names) {
indices <- folds[[fname]]
perm <- sample(seq_along(indices))
y_perm[indices] <- y[indices[perm]]
}
}
for (fname in fold_names) {
indices <- folds[[fname]]
X_train <- X[-indices, , drop = FALSE]
y_train <- y_perm[-indices]
X_pred <- X[indices, , drop = FALSE]
features <- select(X_train, y_train)
model <- train(as.matrix(X_train[, features, drop = FALSE]), y_train)
y_pred[indices] <- predict(model, as.matrix(X_pred[, features, drop = FALSE]))
}
if (options$compare_pred == "y") {
# compute list of scores
score_list <- lapply(score, f_star)
} else if (options$compare_pred == "y_perm") {
f_star_perm <- function(f) {f(y_perm, y_pred)}
score_list <- lapply(score, f_star_perm)
}
pt_scores <- lv_assign(pt_scores, score_list, trial)
}
return_values$pt_scores <- pt_scores
}
# ----------------- END PERMUTATION TESTING ----------------- #
# ----------------- BEGIN FINAL PROCESSING ----------------- #
# unpack the list if its length is 1 (score was just a function)
if (length(score) == 1) {
return_values$cv_score <- return_values$cv_score[[1]]
if ("rf_scores" %in% names(return_values)) {
return_values$rf_scores <- return_values$rf_scores[[1]]
}
if ("pt_scores" %in% names(return_values)) {
return_values$pt_scores <- return_values$pt_scores[[1]]
}
}
# ----------------- END FINAL PROCESSING ----------------- #
return(return_values)
}
# for each index ind along vec_list, sets
# vec_list[[ind]][v_index] <- val_list[[ind]]
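# e.g. lv_assign(list(numeric(3), numeric(3)), list(0.8, 0.6), 2)
# returns list(c(0, 0.8, 0), c(0, 0.6, 0))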
lv_assign <- function(vec_list, val_list, v_index) {
for (ind in seq_along(vec_list)) {
vec <- vec_list[[ind]]
vec[v_index] <- val_list[[ind]]
vec_list[[ind]] <- vec
}
return(vec_list)
}