R/nested_cv.R

# The 'nested_cv' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-10

# Table of contents
# 1) nested_cv                    |
#   4.2) is.nested_cv             | tested
#   1.2) levels.nested_cv         | tested
#   1.3) dimnames.nested_cv       | tested
#   1.4) coef.nested_cv           |
#   1.5) print.nested_cv          | tested
#   1.6) summary.nested_cv        |
#   1.7) features.nested_cv       |

###
### 1) nested_cv
###

#' Nested K-fold Cross Validation for Binary Classification
#'
#' A wrapper function that implements nested K-fold cross
#' validation for a chosen algorithm with binary data.
#'
#' @param y A factor with two levels.
#' @param X A matrix of predictors with the number of rows equal
#'   to the length of \code{y}.
#' @param type The type of fitting algorithm to use. Options
#'   include \code{glm} and \code{glmnet}.
#' @param K A vector indicating the number of folds to use for the
#'   outer and inner iterations.
#' @param set_control A function to specify how to set the
#'   \code{control} option for the outer folds based on the
#'   results of the inner folds.
#' @param control List of additional settings for the
#'   estimation algorithm for the inner folds.
#' @param progress Logical; if \code{TRUE}, tracks the progress of the
#'   algoritm through both the inner and outer folds.
#'
#' @details The method \code{summary} returns a list with
#' the coefficients and their significance for the outer
#' folds and the inner folds per each outer fold, respectively.
#'
#' @return An R object of class 'nested_cv'.
#'
#' @export
#' @examples
#' # Simulate data
#' sim = bc_simulate( 500, 8, 4 )
#' # Conduct nested 10-fold CV
#' cv_glm = nested_cv( sim$y, sim$X, type = 'glm' )
#' # Can be slow
#' cv_glmnet = nested_cv( sim$y, sim$X, type = 'glmnet' )

nested_cv = function( y, X,
                      type,
                      K = c( 10, 10 ),
                      control = list(),
                      set_control = list(),
                      progress = T ) {

  # Start computing run time
  start = ert()

  # Create index
  index = cv_index( K[1], length( y ) )

  # Initialize output for outer folds
  out = list()
  for ( k in 1:K[1] ) {
    out = c( out, list(NULL) )
  }
  names( out ) = paste( 'Fold', 1:K[1], sep = '_' )

  # Loop over outer folds
  for ( k in 1:K[1] ) {

    if ( progress ) print( paste( 'Outer fold:', k ) )

    # Create list of data for inner folds
    inner_dat = train_test( k, index, y, X )

    # Cross-validation for inner selection
    inner_res = kfold_cv( subset( inner_dat, T, T ),
                          subset( inner_dat, F, T ),
                          type,
                          K = K[2],
                          control = control,
                          progress = progress )

    # Set control parameters to govern model selection
    cntrl = bc_set_control( type, inner_res, set_control )

    # Check predictive accuracy
    out[[k]] = list(
      inner = inner_res,
      outer = bc_estimate( type, inner_dat, control = cntrl )
    )
  }

  # Compute run time
  run_time = ert( start )
  out$run_time = run_time

  # Save index for outer folds
  out$outer_index = index

  # Set class of output
  class( out ) = 'nested_cv'

  return( out )
}

# 1.1)
#' @rdname nested_cv
#' @export

is.nested_cv = function( x ) {
  # Purpose:
  # Checks if an object has
  # class 'nested_cv'.
  # Arguments:
  # x - An R object
  # Returns:
  # Logical; if the class of
  # the object is 'nested_cv',
  # returns TRUE.

  return( inherits( x, 'nested_cv' ) )
}

# 1.2)
#' @rdname nested_cv
#' @export

levels.nested_cv = function( x ) {
  # Purpose:
  # Extracts the binary levels for the
  # factor representing the dependent variable.
  # Arguments:
  # x - An R object of class 'nested_cv'
  # Returns:
  # The binary levels for the factor.

  # Extract levels of dependent variable
  out = x$Fold_1$outer$levels

  return( out )
}

# 1.3)
#' @rdname nested_cv
#' @export

dimnames.nested_cv = function( x ) {
  # Purpose:
  # Extracts the column labels for the matrix
  # of predictors.
  # Arguments:
  # x - An R object of class 'nested_cv'
  # Returns:
  # A vector with the column labels for the
  # predictors.

  # Extract labels for predictors
  out = dimnames( x$Fold_1$outer )

  return( out )
}

# 1.4)
#' @rdname nested_cv
#' @export

coef.nested_cv = function( x, int = T ) {
  # Purpose:
  # Extracts coefficients from
  # each outer fold of an object
  # of class 'nested_cv'.
  # Arguments:
  # x - An R object of class 'nested_cv'
  # Returns:
  # The average and range for the coefficients
  # selected for each outer fold during
  # nested K-fold cross-validation.

  # Extract variable names
  nms = x$Fold_1$outer$predictors
  # Extract number of folds
  folds = sort( grep( 'Fold', names( x ) ) )

  mat = matrix( 0, length( folds ), length( nms ) )
  intercept = numeric( length( folds ) )
  # Extract coefficient values
  for ( nf in 1:length( folds ) ) {
    cur_fold = folds[nf]
    sel = nms %in% x[[ cur_fold ]]$outer$selected_vars
    mat[nf,sel] = coef( x[[ cur_fold ]]$outer )

    if ( int ) intercept[nf] = x[[ cur_fold ]]$outer$intercept

  }
  colnames( mat ) = nms
  if ( int ) mat = cbind( Intercept = intercept, mat )

  out = cbind( Fold = 1:length( folds ), mat )

  return( out )
}

# 1.5)
#' @rdname nested_cv
#' @export

print.nested_cv = function( x, digits = 2, metric = 'AUC' ) {
  # Purpose:
  # Displays basic details for an object of
  # class 'nested_cv'.
  # Arguments:
  # x      - An R object of class 'nested_cv'
  # digits - The max number of digits when rounding
  # metric - The type of fit metric to report

  # Extract indices for outer folds
  folds = grep( 'Fold', names( x ) )

  string = paste( 'Estimation type:', x[[1]]$outer$type )
  cat( string, '\n' )
  print( round( x$run_time, digits ) )
  cat( '\n' )

  cat( 'Coefficients:', '\n' )
  val = coef( x )
  string
  for ( i in 2:ncol(val) ) {
    if ( mean( val[,i] ) != 0 ) {
      string = paste(
        colnames( val )[i], ': ',
        round( mean(val[,i]), digits ),
        ' (',
        round( min(val[,i]), digits ),
        ' - ',
        round( max(val[,i]), digits ),
        ')', sep = '' )
      cat( string, '\n' )
    }
  }
  cat( '\n' )

  cat( 'Fit:', '\n' )
  val = sapply( x[folds],
                function(x) subset( x$outer, train = F, metric = metric ) )
  string = paste(
    metric, ': ',
    round( mean( val ), digits ),
    ' (',
    round( min(val), digits ),
    ' - ',
    round( max(val), digits ),
    ')', sep = '' )
  cat( string, '\n' )

}

# 1.6)
#' @rdname nested_cv
#' @export

summary.nested_cv = function( x, metric = 'AUC' ) {
  # Purpose:
  # ...
  # Arguments:
  # x      - An R object of class 'nested_cv'
  # metric -
  # Returns:
  # ...

  nf = function( x ) {
    grep( 'Fold', names(x) )
  }

  fo = nf( x )

  # Number of folds for outer section
  Ko = length( nf( x ) )
  # Number of folds for inner section
  Ki = length( nf( x$Fold_1$inner ) )

  inner = data.frame(
    Outer_fold = rep( 1:Ko, each = Ki ),
    Inner_fold = rep( 1:Ki, Ko )
  )

  # Labels for predictors
  cn = x[[ fo[1] ]]$outer$predictors

  # Loop over outer folds
  for ( o in 1:Ko ) {

    # Extract summary of results for inner folds
    val = summary( x[[ fo[o] ]]$inner )

    # Initialize output for inner results
    if ( o == 1 ) {

      # Base summary for outer folds on inner folds
      outer = data.frame(
        Fold = 1:Ko,
        Best = FALSE,
        Metric = NA
      )
      outer$Coefficients =
        matrix( 0, Ko, length(cn) )
      outer$Significant =
        matrix( 0, Ko, length(cn) )
      colnames( outer$Coefficients ) = cn
      colnames( outer$Significant ) = cn

      # Add matrix of coefficients
      tmp = matrix( NA, nrow( inner ), ncol(val$Coefficients) )
      colnames( tmp ) = colnames( val$Coefficients )
      inner$Coefficients = tmp
      # Add matrix indicating when variables were significant
      tmp = matrix( NA, nrow( inner ), ncol(val$Significant) )
      colnames( tmp ) = colnames( val$Significant )
      inner$Significant = tmp
      rm( tmp )
      # Add columns tracking the best fit and the
      # fit metrics
      inner$Best = NA
      inner$Metric = NA
      # Add additional parameters for glmnet results
      if ( x$Fold_1$outer$type == 'glmnet' ) {
        inner$lambda = NA
        inner$alpha = NA
        outer$lambda = NA
        outer$alpha = NA
      }
    }

    # Fill in rows (Inner)
    sel =  1:Ki + Ki*(o-1)
    inner$Coefficients[ sel, ] = val$Coefficients
    inner$Significant[ sel, ] = val$Significant
    inner$Best[sel] = val$Best
    inner$Metric = val$Metric

    # Add additional parameters for glmnet results (Inner)
    if ( x$Fold_1$outer$type == 'glmnet' ) {
      inner$lambda[sel] = val$lambda
      inner$alpha[sel] = val$alpha
      outer$lambda = val$lambda[ val$Best ]
      outer$alpha = val$alpha[ val$Best ]
    }

    # Fill in rows (Outer)
    cf = coef( x[[ fo[o] ]]$outer )
    pos = cn %in% names( cf )
    outer$Coefficients[o,pos] = cf
    outer$Significant[o,pos] = 1
    outer$Metric[o] = subset( x[[ fo[o] ]]$outer, train = F, metric = metric )
  }

  out = list(
    outer = outer,
    inner = inner
  )

  return( out )
}

# 1.7)
#' @rdname nested_cv
#' @export

features.nested_cv = function( x, cutoff = .95 ) {
  # Purpose:
  # Extracts the labels for the predictors determined
  # to be non-zero.
  # Arguments:
  # x      - An R object of class 'nested_cv'
  # cutoff - The proportion of times a predictor
  #          needed to be non-zero to be selected
  # Returns:
  # A vector of labels for predictors.

  # Extract matrix of coefficients
  cf = coef( x )
  # Extract column labels for predictors
  cn = dimnames( x )

  # Number of outer folds
  K = nrow( cf )

  # Determine estimates fixed to 0
  zero_est = cf[,cn] == 0
  # Determine frequency of non-zero estimates per variables
  freq = K - colSums( zero_est )

  out = cn[ freq > cutoff*K ]

  return( out )
}
rettopnivek/binclass documentation built on May 13, 2019, 4:46 p.m.