# The 'nested_cv' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-10
# Table of contents
# 1) nested_cv |
# 4.2) is.nested_cv | tested
# 1.2) levels.nested_cv | tested
# 1.3) dimnames.nested_cv | tested
# 1.4) coef.nested_cv |
# 1.5) print.nested_cv | tested
# 1.6) summary.nested_cv |
# 1.7) features.nested_cv |
###
### 1) nested_cv
###
#' Nested K-fold Cross Validation for Binary Classification
#'
#' A wrapper function that implements nested K-fold cross
#' validation for a chosen algorithm with binary data.
#'
#' @param y A factor with two levels.
#' @param X A matrix of predictors with the number of rows equal
#' to the length of \code{y}.
#' @param type The type of fitting algorithm to use. Options
#' include \code{glm} and \code{glmnet}.
#' @param K A vector indicating the number of folds to use for the
#' outer and inner iterations.
#' @param set_control A function to specify how to set the
#' \code{control} option for the outer folds based on the
#' results of the inner folds.
#' @param control List of additional settings for the
#' estimation algorithm for the inner folds.
#' @param progress Logical; if \code{TRUE}, tracks the progress of the
#' algoritm through both the inner and outer folds.
#'
#' @details The method \code{summary} returns a list with
#' the coefficients and their significance for the outer
#' folds and the inner folds per each outer fold, respectively.
#'
#' @return An R object of class 'nested_cv'.
#'
#' @export
#' @examples
#' # Simulate data
#' sim = bc_simulate( 500, 8, 4 )
#' # Conduct nested 10-fold CV
#' cv_glm = nested_cv( sim$y, sim$X, type = 'glm' )
#' # Can be slow
#' cv_glmnet = nested_cv( sim$y, sim$X, type = 'glmnet' )
nested_cv = function( y, X,
type,
K = c( 10, 10 ),
control = list(),
set_control = list(),
progress = T ) {
# Start computing run time
start = ert()
# Create index
index = cv_index( K[1], length( y ) )
# Initialize output for outer folds
out = list()
for ( k in 1:K[1] ) {
out = c( out, list(NULL) )
}
names( out ) = paste( 'Fold', 1:K[1], sep = '_' )
# Loop over outer folds
for ( k in 1:K[1] ) {
if ( progress ) print( paste( 'Outer fold:', k ) )
# Create list of data for inner folds
inner_dat = train_test( k, index, y, X )
# Cross-validation for inner selection
inner_res = kfold_cv( subset( inner_dat, T, T ),
subset( inner_dat, F, T ),
type,
K = K[2],
control = control,
progress = progress )
# Set control parameters to govern model selection
cntrl = bc_set_control( type, inner_res, set_control )
# Check predictive accuracy
out[[k]] = list(
inner = inner_res,
outer = bc_estimate( type, inner_dat, control = cntrl )
)
}
# Compute run time
run_time = ert( start )
out$run_time = run_time
# Save index for outer folds
out$outer_index = index
# Set class of output
class( out ) = 'nested_cv'
return( out )
}
# 1.1)
#' @rdname nested_cv
#' @export
is.nested_cv = function( x ) {
# Purpose:
# Checks if an object has
# class 'nested_cv'.
# Arguments:
# x - An R object
# Returns:
# Logical; if the class of
# the object is 'nested_cv',
# returns TRUE.
return( inherits( x, 'nested_cv' ) )
}
# 1.2)
#' @rdname nested_cv
#' @export
levels.nested_cv = function( x ) {
# Purpose:
# Extracts the binary levels for the
# factor representing the dependent variable.
# Arguments:
# x - An R object of class 'nested_cv'
# Returns:
# The binary levels for the factor.
# Extract levels of dependent variable
out = x$Fold_1$outer$levels
return( out )
}
# 1.3)
#' @rdname nested_cv
#' @export
dimnames.nested_cv = function( x ) {
# Purpose:
# Extracts the column labels for the matrix
# of predictors.
# Arguments:
# x - An R object of class 'nested_cv'
# Returns:
# A vector with the column labels for the
# predictors.
# Extract labels for predictors
out = dimnames( x$Fold_1$outer )
return( out )
}
# 1.4)
#' @rdname nested_cv
#' @export
coef.nested_cv = function( x, int = T ) {
# Purpose:
# Extracts coefficients from
# each outer fold of an object
# of class 'nested_cv'.
# Arguments:
# x - An R object of class 'nested_cv'
# Returns:
# The average and range for the coefficients
# selected for each outer fold during
# nested K-fold cross-validation.
# Extract variable names
nms = x$Fold_1$outer$predictors
# Extract number of folds
folds = sort( grep( 'Fold', names( x ) ) )
mat = matrix( 0, length( folds ), length( nms ) )
intercept = numeric( length( folds ) )
# Extract coefficient values
for ( nf in 1:length( folds ) ) {
cur_fold = folds[nf]
sel = nms %in% x[[ cur_fold ]]$outer$selected_vars
mat[nf,sel] = coef( x[[ cur_fold ]]$outer )
if ( int ) intercept[nf] = x[[ cur_fold ]]$outer$intercept
}
colnames( mat ) = nms
if ( int ) mat = cbind( Intercept = intercept, mat )
out = cbind( Fold = 1:length( folds ), mat )
return( out )
}
# 1.5)
#' @rdname nested_cv
#' @export
print.nested_cv = function( x, digits = 2, metric = 'AUC' ) {
# Purpose:
# Displays basic details for an object of
# class 'nested_cv'.
# Arguments:
# x - An R object of class 'nested_cv'
# digits - The max number of digits when rounding
# metric - The type of fit metric to report
# Extract indices for outer folds
folds = grep( 'Fold', names( x ) )
string = paste( 'Estimation type:', x[[1]]$outer$type )
cat( string, '\n' )
print( round( x$run_time, digits ) )
cat( '\n' )
cat( 'Coefficients:', '\n' )
val = coef( x )
string
for ( i in 2:ncol(val) ) {
if ( mean( val[,i] ) != 0 ) {
string = paste(
colnames( val )[i], ': ',
round( mean(val[,i]), digits ),
' (',
round( min(val[,i]), digits ),
' - ',
round( max(val[,i]), digits ),
')', sep = '' )
cat( string, '\n' )
}
}
cat( '\n' )
cat( 'Fit:', '\n' )
val = sapply( x[folds],
function(x) subset( x$outer, train = F, metric = metric ) )
string = paste(
metric, ': ',
round( mean( val ), digits ),
' (',
round( min(val), digits ),
' - ',
round( max(val), digits ),
')', sep = '' )
cat( string, '\n' )
}
# 1.6)
#' @rdname nested_cv
#' @export
summary.nested_cv = function( x, metric = 'AUC' ) {
# Purpose:
# ...
# Arguments:
# x - An R object of class 'nested_cv'
# metric -
# Returns:
# ...
nf = function( x ) {
grep( 'Fold', names(x) )
}
fo = nf( x )
# Number of folds for outer section
Ko = length( nf( x ) )
# Number of folds for inner section
Ki = length( nf( x$Fold_1$inner ) )
inner = data.frame(
Outer_fold = rep( 1:Ko, each = Ki ),
Inner_fold = rep( 1:Ki, Ko )
)
# Labels for predictors
cn = x[[ fo[1] ]]$outer$predictors
# Loop over outer folds
for ( o in 1:Ko ) {
# Extract summary of results for inner folds
val = summary( x[[ fo[o] ]]$inner )
# Initialize output for inner results
if ( o == 1 ) {
# Base summary for outer folds on inner folds
outer = data.frame(
Fold = 1:Ko,
Best = FALSE,
Metric = NA
)
outer$Coefficients =
matrix( 0, Ko, length(cn) )
outer$Significant =
matrix( 0, Ko, length(cn) )
colnames( outer$Coefficients ) = cn
colnames( outer$Significant ) = cn
# Add matrix of coefficients
tmp = matrix( NA, nrow( inner ), ncol(val$Coefficients) )
colnames( tmp ) = colnames( val$Coefficients )
inner$Coefficients = tmp
# Add matrix indicating when variables were significant
tmp = matrix( NA, nrow( inner ), ncol(val$Significant) )
colnames( tmp ) = colnames( val$Significant )
inner$Significant = tmp
rm( tmp )
# Add columns tracking the best fit and the
# fit metrics
inner$Best = NA
inner$Metric = NA
# Add additional parameters for glmnet results
if ( x$Fold_1$outer$type == 'glmnet' ) {
inner$lambda = NA
inner$alpha = NA
outer$lambda = NA
outer$alpha = NA
}
}
# Fill in rows (Inner)
sel = 1:Ki + Ki*(o-1)
inner$Coefficients[ sel, ] = val$Coefficients
inner$Significant[ sel, ] = val$Significant
inner$Best[sel] = val$Best
inner$Metric = val$Metric
# Add additional parameters for glmnet results (Inner)
if ( x$Fold_1$outer$type == 'glmnet' ) {
inner$lambda[sel] = val$lambda
inner$alpha[sel] = val$alpha
outer$lambda = val$lambda[ val$Best ]
outer$alpha = val$alpha[ val$Best ]
}
# Fill in rows (Outer)
cf = coef( x[[ fo[o] ]]$outer )
pos = cn %in% names( cf )
outer$Coefficients[o,pos] = cf
outer$Significant[o,pos] = 1
outer$Metric[o] = subset( x[[ fo[o] ]]$outer, train = F, metric = metric )
}
out = list(
outer = outer,
inner = inner
)
return( out )
}
# 1.7)
#' @rdname nested_cv
#' @export
features.nested_cv = function( x, cutoff = .95 ) {
# Purpose:
# Extracts the labels for the predictors determined
# to be non-zero.
# Arguments:
# x - An R object of class 'nested_cv'
# cutoff - The proportion of times a predictor
# needed to be non-zero to be selected
# Returns:
# A vector of labels for predictors.
# Extract matrix of coefficients
cf = coef( x )
# Extract column labels for predictors
cn = dimnames( x )
# Number of outer folds
K = nrow( cf )
# Determine estimates fixed to 0
zero_est = cf[,cn] == 0
# Determine frequency of non-zero estimates per variables
freq = K - colSums( zero_est )
out = cn[ freq > cutoff*K ]
return( out )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.