# The 'kfold_cv' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-06
# Table of contents
# 1) kfold_cv |
# 1.1) is.kfold_cv | tested
# 1.2) levels.kfold_cv |
# 1.3) dimnames.kfold_cv |
# 1.4) max.kfold_cv |
# 1.5) features.kfold_cv |
# 1.6) coef.kfold_cv |
# 1.7) print.kfold_cv |
# 1.8) summary.kfold_cv |
###
### 1) kfold_cv
###
#' K-fold Cross Validation for Binary Classification
#'
#' A wrapper function that implements K-fold cross
#' validation for a chosen algorithm with binary data.
#'
#' @param y A factor with two levels.
#' @param X A matrix of predictors with the number of rows equal
#' to the length of \code{y}.
#' @param type The type of fitting algorithm to use. Options
#' include \code{glm} and \code{glmnet}.
#' @param K The number of folds to use.
#' @param control An optional list of estimation parameters
#' for the chosen algorithm.
#' @param progress Logical; If \code{TRUE} prints a progress
#' bar tracking which folds have finished estimation.
#'
#' @details The method \code{max} can be used to extract the
#' label for the fold that produced the best value for
#' a specified metric with the test sample. The method
#' \code{features} extracts the labels for the independent
#' variables associated with the best-performing fold.
#' The method \code{summary} reports the coefficients
#' and denotes which were statistically significant
#' across all folds.
#'
#' @return An R object of class 'kfold_cv'.
#'
#' @export
#' @examples
#' # Simulate data
#' sim = bc_simulate( 500, 8, 4 )
#' # Conduct 10-fold CV
#' cv_glm = kfold_cv( sim$y, sim$X, type = 'glm' )
#' cv_glmnet = kfold_cv( sim$y, sim$X, type = 'glmnet' )
kfold_cv = function( y, X,
type,
K = 10,
control = list(),
progress = T ) {
# Estimate run time
start = ert()
# Create index
index = cv_index( K, length( y ) )
# Carry out K-fold cross-validation
# Initialize output
out = list()
for ( k in 1:K ) {
out = c( out, list(NULL) )
}
names( out ) = paste( 'Fold', 1:K, sep = '_' )
# Initialize progress bar
if ( progress ) pb = txtProgressBar( min = 0, max = K, style = 3 )
# Loop over folds
for ( k in 1:K ) {
# Split data into training/test sets
dtbf = train_test( k, index, y, X )
# Fit desired model
out[[k]] = bc_estimate ( type, dtbf, control = control )
# Update progress bar
if (progress) setTxtProgressBar(pb,k)
}
if ( progress ) close( pb )
# Incorporate cross-validation index
out$index = index
# Incorporate total run time
run_time = ert( start )
out$run_time = run_time
# Set class for output
class( out ) = 'kfold_cv'
# Return list with fitting results
# for each fold
return( out )
}
# 1.1)
#' @rdname kfold_cv
#' @export
is.kfold_cv = function( x ) {
# Purpose:
# Checks if an object is of class 'kfold_cv'
# Arguments:
# x - An R object
# Returns:
# TRUE if the object is of class 'kfold_cv',
# FALSE otherwise.
return( inherits( x, 'kfold_cv' ) )
}
# 1.2)
#' @rdname kfold_cv
#' @export
levels.kfold_cv = function( x ) {
# Purpose:
# Extracts the binary levels for the
# factor representing the dependent variable.
# Arguments:
# x - An R object of class 'kfold_cv'
# Returns:
# The binary levels for the factor.
# Extract levels of dependent variable
out = x$Fold_1$levels
return( out )
}
# 1.3)
#' @rdname kfold_cv
#' @export
dimnames.kfold_cv = function( x ) {
# Purpose:
# Extracts the column labels for the matrix
# of predictors.
# Arguments:
# x - An R object of class 'kfold_cv'
# Returns:
# A vector with the column labels for the
# predictors.
# Extract labels for predictors
out = dimnames( x$Fold_1 )
return( out )
}
# 1.4)
#' @rdname kfold_cv
#' @export
max.kfold_cv = function( x, train = F, metric = 'AUC', na.rm = FALSE ) {
# Purpose:
# Extracts the fold with the best fit metrics based on
# either the test or training subset.
# Arguments:
# x - An R object of class 'kfold_cv'
# train - Logical; if TRUE, finds best result
# based on the training data
# metric - The metric to use
# na.rm - For compatibility with generic method
# Returns:
# A character string with the fold name.
# Extract fit metric of choice
sel = names(x)[ grep( 'Fold', names( x ) ) ]
fm = sapply( x[sel], subset, train = train, metric = metric )
single_metrics = c(
'TPR', 'FPR', 'AUC', 'd_prime',
'criterion', 'CE', 'R', 'Accuracy' )
if ( !( metric %in% single_metrics ) ) {
warning( 'Need metric comprising of a single value - defaulting to AUC',
call. = F )
metric = 'AUC'
}
metrics_to_max = c(
'AUC',
'd_prime',
'Accuracy',
'TPR'
)
if ( metric %in% metrics_to_max ) {
ind = which.max( fm )[1]
} else {
ind = which.min( fm )[1]
}
out = names( x[sel] )[ind]
return( out )
}
# 1.5)
#' @rdname kfold_cv
#' @export
features.kfold_cv = function( x, metric = 'AUC' ) {
# Purpose:
# Extracts the column labels for the
# predictors selected by the model
# as significant contributors to binary
# classification.
# Arguments:
# x - An R object of class 'kfold_cv'
# metric - The metric to use
# Returns:
# A vector with the column labels for the
# selected predictors.
# Find fold with best fit
sel = max( x, metric = metric )
# Extract labels for selected predictors
out = features( x[[sel]] )
return( out )
}
# 1.6)
#' @rdname kfold_cv
#' @export
coef.kfold_cv = function( x, int = F, sig = T, metric = 'AUC' ) {
# Purpose:
# Extracts the coefficients from the model fit based on the
# fold with the best fit metric.
# Arguments:
# x - An R object of class 'kfold_cv'
# int - Logical; if TRUE, includes
# the intercept
# sig - Logical; if TRUE, includes only
# significant/non-zero values
# metric - The metric to use
# Returns:
# The coefficients from the fitted model.
# Find fold with best fit
sel = max( x, metric = metric )
# Extract coefficients
out = coef( x[[sel]], int = int, sig = sig )
return( out )
}
# 1.7)
#' @rdname kfold_cv
#' @export
print.kfold_cv = function( x, digits = 2, metric = 'AUC' ) {
# Purpose:
# Displays basic details for an object of
# class 'kfold_cv'.
# Arguments:
# x - An R object of class 'kfold_cv'
# digits - The max number of digits when rounding
# metric - The type of fit metric to report
# Find fold with best fit
sel = max( x, metric = metric )
# Extract fit metric
fm = subset( x[[sel]], train = F, metric = metric )
string = paste( 'Estimation type:', x[[1]]$type )
cat( string, '\n' )
print( round( x$run_time, digits ) )
cat( '\n' )
cat( 'Coefficients:', '\n' )
val = coef( x[[sel]] )
print( round( val, digits ) )
cat( '\n' )
cat( 'Fit:', '\n' )
val = fm
string = paste( metric, '=', round( val, 2 ) )
cat( string, '\n' )
}
# 1.8)
#' @rdname kfold_cv
#' @export
summary.kfold_cv = function( x, metric = 'AUC' ) {
# Purpose:
# Extracts the coefficients, indicators for significance,
# the singular metric of choice, and a logical vector
# indicating the fold with the best predictive fit.
# Arguments:
# x - An R object of class 'kfold_cv'
# metric - The type of fit metric to report
# Returns:
# A data frame.
# Find fold with best fit
sel = max( x, metric = metric )
# Extract predictor labels
cn = x$Fold_1$predictors
# Extract number of folds
folds = names(x)[ grep( 'Fold', names( x ) ) ]
K = length( folds )
# Create data frame with results
out = data.frame(
Fold = 1:K,
Best = FALSE,
Metric = NA,
Intercept = NA,
stringsAsFactors = F
)
out$Coefficients =
matrix( 0, nrow( out ), length(cn) )
out$Significant =
matrix( 0, nrow( out ), length(cn) )
colnames( out$Coefficients ) = cn
colnames( out$Significant ) = cn
out$Best[ folds %in% sel ] = TRUE
out$Metric =
sapply( x[folds], subset, train = F, metric = metric )
out$Intercept =
sapply( x[folds], function(x) x$intercept )
# Loop over folds
inc = 1
for ( i in folds ) {
cf = coef( x[[i]] )
pos = cn %in% names( cf )
out$Coefficients[inc,pos] = coef( x[[i]] )
out$Significant[inc,pos] = 1
inc = inc + 1
}
if ( x$Fold_1$type == 'glmnet' ) {
out$lambda =
sapply( x[folds], function(x) x$best_lambda )
out$alpha =
sapply( x[folds], function(x) x$best_alpha )
}
return( out )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.