#' Validation of binary classification rule
#'
#' @param model Model formula
#' @param data Data to fit and assess model performance
#' @param type Type of statistical model
#' @param scope Scope in stepwise variable selection
#' @param validation_method "cv" (cross-validation) or "bootstrap"
#' @param B Number of bootstrap samples, or number of folds in B-fold cross-validation
#' @param B_mat Optional n x B matrix defining the resamples (column i gives how often each observation enters resample i; 0 means excluded)
#' @param test_set_method How the test set is defined: "all" uses the original dataset as test data (default for bootstrap); "excluded" uses the observations excluded from the cv/bootstrap sample (default for cv)
#' @param show_selected_coef If TRUE, show the selected coefficients (only for glm's with model selection)
#' @param criteria List of accuracy criteria, each a function of the predicted and observed responses returning a named performance index (default: Brier score, c-index and calibration)
#' @param roc If TRUE, true positive rates at the false positive rates given in fpr are also evaluated and cv/bootstrap corrected
#' @param fpr Vector of false positive rates at which true positive rates are evaluated (only used if roc = TRUE)
#' @param ... Other parameters passed to fit.method and fit.response
#'
#' @return List of validation output
#' @export
classValidation <- function(model, data, type, scope=NULL,
validation_method = "bootstrap", B = NULL, B_mat = NULL,
test_set_method = ifelse(validation_method == "bootstrap", "all", "excluded"),
show_selected_coef = FALSE, criteria = c(Brier, c_index, calib),
roc = FALSE, fpr = seq(0, 1, length = 101), ...){
n <- nrow(data)
#-- checks
if (is.null(B) && is.null(B_mat)) stop("Either argument B or B_mat must be provided")
if ((!is.null(B)) && (!is.null(B_mat)) && (ncol(B_mat) != B)) stop("Arguments B and B_mat incompatible")
#-- Create B and B_mat (containing selected observations in each bootstrap/cv sample)
if (is.null(B)) B <- ncol(B_mat)
if (is.null(B_mat)){
if (validation_method == "bootstrap") B_mat <- rmultinom(B, size = n, prob = rep(1,n)/n)
if (validation_method == "cv"){
which.group <- sample(rep(1:B, ceiling(n/B))[1:n], n, replace = FALSE)
B_mat <- matrix(NA, nrow = n, ncol = B)
for (i in 1:B) B_mat[,i] <- as.numeric(which.group != i)
}
}
#-- function to calculate true positive rate at points fpr (corresponding false positive rate)
true_positive_rate <- function(predicted, y, fpr){
perf <- ROCR::performance(ROCR::prediction(predicted,y), "tpr", "fpr")
tpr <- rep(NA, length(fpr))
for (i in 1:length(tpr)) tpr[i] <- (perf@y.values[[1]])[sum(perf@x.values[[1]] <= fpr[i])]
tpr
}
#-- set up dummy data structures to store accuracy indices, selected coefficients, etc.
# selected variables on resampled data
if (show_selected_coef){
names.allcov <- colnames(model.matrix(model, data = data)) # includes intercept (removed at the end)
num.allcov <- length(names.allcov)
selectedVars <- matrix(0, ncol = num.allcov, nrow = B)
colnames(selectedVars) <- names.allcov
}
# true positive rates on resampled data
tpr.test <- tpr.train <- matrix(NA, nrow = B, ncol = length(fpr))
#-- Indices and tpr on original data
fitted.orig <- fit.response(fit.method(model, data, type, ...), data, type, ...)
y.orig <- as.numeric(model.response(model.frame(update(model,.~1), data)))
# (note: "update(model, . ~ 1)" instead of "model" is necessary because model.frame otherwise errors for GAMs with smooth terms s(.))
perf.orig <- myaccuracy(fitted.orig, y.orig, criteria)
# aggregated indices
indices <- matrix(nrow = length(perf.orig), ncol = 5)
rownames(indices) <- names(perf.orig)
colnames(indices) <- c("index.orig", "training", "test", "optimism", "index.corrected")
indices[,1] <- perf.orig
if (roc) tpr.orig <- true_positive_rate(as.numeric(fitted.orig), y.orig, fpr)
# indices for resampled data
indices.resample <- matrix(ncol = length(perf.orig)*2, nrow = B)
colnames(indices.resample) <- c(paste(names(perf.orig), "train"),
paste(names(perf.orig), "test"))
#-- Start the actual validation
cat("Start: \n")
for (i in 1:B){
cat("\r", i)
# datasets and response
trainData <- data[rep(1:n, B_mat[,i]), ]
if (test_set_method == "excluded") testData <- data[B_mat[,i] == 0, ]
if (test_set_method == "all") testData <- data
y.train <- as.numeric(model.response(model.frame(update(model, .~1), trainData)))
y.test <- as.numeric(model.response(model.frame(update(model, .~1), testData)))
# Predicted responses
fitModel <- fit.method(model, data= trainData, type, ...)
fitted.risk.train <- fit.response(fitModel, data = trainData, type, ...)
fitted.risk.test <- fit.response(fitModel, data = testData, type, ...)
# accuracy measures
indices.resample[i, 1:length(perf.orig)] <- myaccuracy(fitted.risk.train, y.train, criteria)
indices.resample[i, (length(perf.orig)+1):(2*length(perf.orig))] <- myaccuracy(fitted.risk.test, y.test, criteria)
if (show_selected_coef){
selectedVars[i, colnames(model.matrix(fitModel$formula, data = data))] <- 1
}
if (roc){
tpr.train[i,] <- true_positive_rate(as.numeric(fitted.risk.train), y.train, fpr)
tpr.test[i,] <- true_positive_rate(as.numeric(fitted.risk.test), y.test, fpr)
}
}
#-- aggregate bootstrap/cv indices
summary.result <- apply(indices.resample, 2, function(x) mean(x, na.rm = TRUE))
indices[, "training"] <- summary.result[1:length(perf.orig)]
indices[, "test"] <- summary.result[(length(perf.orig)+1):(2*length(perf.orig))]
indices[, "optimism"] <- indices[, "training"] - indices[, "test"]
if (validation_method == "cv") indices[, "optimism"] <- indices[, "index.orig"] - indices[, "test"]
indices[, "index.corrected"] <- indices[, "index.orig"] - indices[, "optimism"]
#-- get estimate of "corrected" tpr
if (roc){
tpr.corrected <- tpr.orig - (colMeans(tpr.train) - colMeans(tpr.test))
# "correct" tpr.corrected to be between 0 and 1 and monotonically increasing
tpr.corrected <- pmin(pmax(tpr.corrected, 0), 1)
for (i in (2:length(tpr.corrected))){
if (tpr.corrected[i] < tpr.corrected[i-1]) tpr.corrected[i] <- tpr.corrected[i-1]
}
}
#-- final result
result <- list(indices = indices, indices.resample = indices.resample)
#add selected coefficients
if (show_selected_coef) {
if (colnames(selectedVars)[1] == "(Intercept)") selectedVars <- selectedVars[, -1, drop = FALSE]
numvar <- rowSums(selectedVars)
selectedVars_propChosen <- c(colMeans(selectedVars), "mean.numberOfVariables" = mean(numvar), "sd.numberOfVariables" = sd(numvar))
selectedVars <- cbind(selectedVars, "numberOfVariables" = numvar)
result$selectedVars_propChosen <- selectedVars_propChosen
result$selectedVars.resample <- selectedVars
}
# add ROC information
if (roc){
result$ROC <- list(fpr = fpr, tpr.orig = tpr.orig, tpr.corrected = tpr.corrected, tpr.train = tpr.train, tpr.test = tpr.test)
}
# return result
result
}
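#-- Usage sketch (illustration only, kept as comments): bootstrap validation of a
#-- logistic regression. The data and the call below are hypothetical; they assume
#-- that fit.method()/fit.response() dispatch on type = "glm" and, as suggested by
#-- the calls to myaccuracy() above, that each element of `criteria` is a
#-- function(predicted, y) returning a named performance index.
# set.seed(1)
# d <- data.frame(x1 = rnorm(200), x2 = rnorm(200))
# d$y <- rbinom(200, 1, plogis(-0.5 + d$x1))
# brier <- function(predicted, y) c(Brier = mean((predicted - y)^2)) # minimal Brier score
# val <- classValidation(y ~ x1 + x2, data = d, type = "glm",
#                        validation_method = "bootstrap", B = 100, criteria = c(brier))
# val$indices # apparent, training, test, optimism and optimism-corrected indices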
#' Validation of binary classification rule (variant with optional parallelisation)
#'
#' @param model Model formula
#' @param data Data to fit and assess model performance
#' @param type Type of statistical model
#' @param scope Scope in stepwise variable selection
#' @param validation_method "cv" (cross-validation) or "bootstrap"; defaults to "cv" if unspecified
#' @param B Number of bootstrap samples, or number of folds in B-fold cross-validation
#' @param B_mat Optional n x B matrix defining the resamples (column i gives how often each observation enters resample i; 0 means excluded)
#' @param doParallel If TRUE, run the resampling loop in parallel via the doParallel/foreach framework
#' @param test_set_method How the test set is defined: "all" uses the original dataset as test data (default for bootstrap); "excluded" uses the observations excluded from the cv/bootstrap sample (default for cv)
#' @param show_selected_coef If TRUE, show the selected coefficients (for glm's with model selection and for glmnet fits)
#' @param criteria List of accuracy criteria, each a function of the predicted and observed responses returning a named performance index (default: Brier score, c-index and calibration)
#' @param ... Other parameters passed to fit.method and fit.response
#'
#' @return List of validation output
#' @export
classValidation.new <- function(model, data, type, scope=NULL,
validation_method = c("bootstrap", "cv"), B = NULL, B_mat = NULL, doParallel = FALSE,
test_set_method = ifelse(validation_method == "bootstrap", "all", "excluded"),
show_selected_coef = FALSE, criteria = c(Brier, c_index, calib), ...){
if (length(validation_method) > 1) {
validation_method <- "cv"
cat("No validation method specified; defaulting to cross-validation.\n")
}
n <- nrow(data)
#-- checks
if (is.null(B) && is.null(B_mat)) stop("Either argument B or B_mat must be provided")
if ((!is.null(B)) && (!is.null(B_mat)) && (ncol(B_mat) != B)) stop("Arguments B and B_mat incompatible")
#-- Create B and B_mat (containing selected observations in each bootstrap/cv sample)
if (is.null(B)) B <- ncol(B_mat)
if (is.null(B_mat)){
if (validation_method == "bootstrap") B_mat <- rmultinom(B, size = n, prob = rep(1,n)/n)
if (validation_method == "cv"){
which.group <- sample(rep(1:B, ceiling(n/B))[1:n], n, replace = FALSE)
B_mat <- matrix(NA, nrow = n, ncol = B)
for (i in 1:B) B_mat[,i] <- as.numeric(which.group != i)
}
}
if (show_selected_coef){
names.allcov <- colnames(model.matrix(model, data = data)) # includes intercept (removed at the end)
num.allcov <- length(names.allcov) # needed in both the parallel and the serial branch below
}
fitted.orig <- fit.response(fit.method(model, data, type, ...), data, type, ...)
y.orig <- as.numeric(model.response(model.frame(update(model,.~1), data)))
# (note: "update(model, . ~ 1)" instead of "model" is necessary because model.frame otherwise errors for GAMs with smooth terms s(.))
perf.orig <- myaccuracy(fitted.orig, y.orig, criteria)
# aggregated indices
indices <- matrix(nrow = length(perf.orig), ncol = 5)
rownames(indices) <- names(perf.orig)
colnames(indices) <- c("index.orig", "training", "test", "optimism", "index.corrected")
indices[,1] <- perf.orig
#-- Start the actual validation
if (doParallel) {
require(doParallel) # also attaches foreach and parallel
cl <- makeCluster(detectCores())
registerDoParallel(cl)
# .packages must name the package that provides fit.method(), fit.response() and the criteria
res <- foreach(i = 1:B, .packages = "R306") %dopar% {
# datasets and response
trainData <- data[rep(1:n, B_mat[,i]), ]
if (test_set_method == "excluded") testData <- data[B_mat[,i] == 0, ]
if (test_set_method == "all") testData <- data
y.train <- as.numeric(model.response(model.frame(update(model, .~1), trainData)))
y.test <- as.numeric(model.response(model.frame(update(model, .~1), testData)))
# Predicted responses
fitModel <- fit.method(model, data= trainData, type, ...)
fitted.risk.train <- fit.response(fitModel, data = trainData, type, ...)
fitted.risk.test <- fit.response(fitModel, data = testData, type, ...)
# accuracy measures
indices.resample.i <- c(myaccuracy(fitted.risk.train, y.train, criteria),
myaccuracy(fitted.risk.test, y.test, criteria))
if (show_selected_coef){
selectedVars.i <- matrix(0, nrow = 1, ncol = num.allcov, dimnames = list(NULL, names.allcov))
if ("glm" %in% class(fitModel)) {
selectedVars.i[, colnames(model.matrix(fitModel$formula, data = data))] <- 1
} else if ("glmnet" %in% class(fitModel)) {
coefs <- coef(fitModel, s = fitModel$s)
selectedVars.i[, coefs@Dimnames[[1]][coefs@i + 1]] <- 1
} else {
selectedVars.i[, ] <- NA # keep dimensions so that rbind below works
}
out <- list(indices.resample.i, selectedVars.i)
} else {
out <- indices.resample.i
}
## return
return(out)
}
if (show_selected_coef) {
indices.resample <- do.call(rbind, lapply(res, function(x){x[[1]]}))
selectedVars <- do.call(rbind, lapply(res, function(x){x[[2]]}))
} else {
indices.resample <- do.call(rbind, res)
}
colnames(indices.resample) <- c(paste(names(perf.orig), "train"),
paste(names(perf.orig), "test"))
stopCluster(cl)
} else {
# indices for resampled data
indices.resample <- matrix(ncol = length(perf.orig)*2, nrow = B)
colnames(indices.resample) <- c(paste(names(perf.orig), "train"),
paste(names(perf.orig), "test"))
if (show_selected_coef) {
selectedVars <- matrix(0, ncol = num.allcov, nrow = B) # num.allcov / names.allcov were computed above
colnames(selectedVars) <- names.allcov
}
cat("Start: \n")
for (i in 1:B){
cat("\r", i)
# datasets and response
trainData <- data[rep(1:n, B_mat[,i]), ]
if (test_set_method == "excluded") testData <- data[B_mat[,i] == 0, ]
if (test_set_method == "all") testData <- data
y.train <- as.numeric(model.response(model.frame(update(model, .~1), trainData)))
y.test <- as.numeric(model.response(model.frame(update(model, .~1), testData)))
# Predicted responses
fitModel <- fit.method(model, data= trainData, type, ...)
fitted.risk.train <- fit.response(fitModel, data = trainData, type, ...)
fitted.risk.test <- fit.response(fitModel, data = testData, type, ...)
# accuracy measures
indices.resample[i, 1:length(perf.orig)] <- myaccuracy(fitted.risk.train, y.train, criteria)
indices.resample[i, (length(perf.orig)+1):(2*length(perf.orig))] <- myaccuracy(fitted.risk.test, y.test, criteria)
if (show_selected_coef){
if ("glm" %in% class(fitModel)) {
selectedVars[i, colnames(model.matrix(fitModel$formula, data = data))] <- 1
} else if ("glmnet" %in% class(fitModel)) {
coefs <- coef(fitModel, s = fitModel$s)
selectedVars[i, coefs@Dimnames[[1]][coefs@i + 1]] <- 1
} else {
selectedVars[i, ] <- NA
}
}
}
}
#-- aggregate bootstrap/cv indices
summary.result <- apply(indices.resample, 2, function(x) mean(x, na.rm = TRUE))
indices[, "training"] <- summary.result[1:length(perf.orig)]
indices[, "test"] <- summary.result[(length(perf.orig)+1):(2*length(perf.orig))]
indices[, "optimism"] <- indices[, "training"] - indices[, "test"]
if (validation_method == "cv") indices[, "optimism"] <- indices[, "index.orig"] - indices[, "test"]
indices[, "index.corrected"] <- indices[, "index.orig"] - indices[, "optimism"]
#-- final result
result <- list(indices = indices, indices.resample = indices.resample)
#add selected coefficients
if (show_selected_coef) {
if (colnames(selectedVars)[1] == "(Intercept)") selectedVars <- selectedVars[, -1, drop = FALSE]
numvar <- rowSums(selectedVars)
selectedVars_propChosen <- c(colMeans(selectedVars), "mean.numberOfVariables" = mean(numvar), "sd.numberOfVariables" = sd(numvar))
selectedVars <- cbind(selectedVars, "numberOfVariables" = numvar)
result$selectedVars_propChosen <- selectedVars_propChosen
result$selectedVars.resample <- selectedVars
}
# return result
result
}
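#-- Usage sketch (illustration only, kept as comments): parallel 10-fold cross-validation
#-- with classValidation.new. Hypothetical call; it assumes the doParallel/foreach packages
#-- are installed and that the hard-coded .packages = "R306" in the parallel branch names
#-- the package that exports fit.method(), fit.response() and the default criteria.
# val.cv <- classValidation.new(y ~ x1 + x2, data = d, type = "glm",
#                               validation_method = "cv", B = 10, doParallel = TRUE,
#                               show_selected_coef = FALSE)
# val.cv$indices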