R/kfold_cv.R

# The 'kfold_cv' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-06

# Table of contents
# 1) kfold_cv                     |
#   1.1) is.kfold_cv              | tested
#   1.2) levels.kfold_cv          |
#   1.3) dimnames.kfold_cv        |
#   1.4) max.kfold_cv             |
#   1.5) features.kfold_cv        |
#   1.6) coef.kfold_cv            |
#   1.7) print.kfold_cv           |
#   1.8) summary.kfold_cv         |

###
### 1) kfold_cv
###

#' K-fold Cross Validation for Binary Classification
#'
#' A wrapper function that implements K-fold cross
#' validation for a chosen algorithm with binary data.
#'
#' @param y A factor with two levels.
#' @param X A matrix of predictors with the number of rows equal
#'   to the length of \code{y}.
#' @param type The type of fitting algorithm to use. Options
#'   include \code{glm} and \code{glmnet}.
#' @param K The number of folds to use.
#' @param control An optional list of estimation parameters
#'   for the chosen algorithm.
#' @param progress Logical; if \code{TRUE}, prints a progress
#'   bar tracking which folds have finished estimation.
#'
#' @details The method \code{max} can be used to extract the
#'   label of the fold that produced the best value of
#'   a specified metric on the test sample. The method
#'   \code{features} extracts the labels of the independent
#'   variables associated with the best-performing fold.
#'   The method \code{summary} reports, for each fold, the
#'   coefficients along with indicators for which of them
#'   were statistically significant (or non-zero in the
#'   case of \code{glmnet}).
#'
#' @return An R object of class 'kfold_cv': a list with the
#'   fitting results for each fold, the cross-validation index,
#'   and the total run time.
#'
#' @export
#' @examples
#' # Simulate data
#' sim = bc_simulate( 500, 8, 4 )
#' # Conduct 10-fold CV
#' cv_glm = kfold_cv( sim$y, sim$X, type = 'glm' )
#' cv_glmnet = kfold_cv( sim$y, sim$X, type = 'glmnet' )
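#' # Methods for the returned object (illustrative; assumes the
#' # fits above completed without error)
#' max( cv_glm )           # Label of the fold with the best test-set AUC
#' features( cv_glmnet )   # Predictors retained by the best-performing fold
#' coef( cv_glmnet )       # Coefficients for the best-performing fold
#' summary( cv_glm )       # Per-fold coefficients and significance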

kfold_cv = function( y, X,
                     type,
                     K = 10,
                     control = list(),
                     progress = TRUE ) {

  # Start tracking run time
  start = ert()

  # Create index
  index = cv_index( K, length( y ) )

  # Carry out K-fold cross-validation

  # Initialize output (one empty slot per fold)
  out = vector( 'list', K )
  names( out ) = paste( 'Fold', 1:K, sep = '_' )

  # Initialize progress bar
  if ( progress ) pb = txtProgressBar( min = 0, max = K, style = 3 )

  # Loop over folds
  for ( k in 1:K ) {

    # Split data into training/test sets
    dtbf = train_test( k, index, y, X )

    # Fit desired model
    out[[k]] = bc_estimate( type, dtbf, control = control )

    # Update progress bar
    if ( progress ) setTxtProgressBar( pb, k )
  }
  if ( progress ) close( pb )

  # Incorporate cross-validation index
  out$index = index

  # Incorporate total run time
  run_time = ert( start )
  out$run_time = run_time

  # Set class for output
  class( out ) = 'kfold_cv'

  # Return list with fitting results
  # for each fold
  return( out )
}
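
# Note: 'cv_index', 'train_test', 'bc_estimate', and 'ert' are internal
# helpers defined elsewhere in the package. As a rough sketch of what
# 'cv_index' presumably does (an assumption, not its actual
# implementation), it shuffles observations into K roughly equal folds:
#   index = sample( rep( 1:K, length.out = length( y ) ) )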


# 1.1)
#' @rdname kfold_cv
#' @export

is.kfold_cv = function( x ) {
  # Purpose:
  # Checks if an object is of class 'kfold_cv'
  # Arguments:
  # x - An R object
  # Returns:
  # TRUE if the object is of class 'kfold_cv',
  # FALSE otherwise.

  return( inherits( x, 'kfold_cv' ) )
}
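
# Example usage (assuming 'cv_glm' from the examples above):
#   is.kfold_cv( cv_glm )  # TRUE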

# 1.2)
#' @rdname kfold_cv
#' @export

levels.kfold_cv = function( x ) {
  # Purpose:
  # Extracts the binary levels for the
  # factor representing the dependent variable.
  # Arguments:
  # x - An R object of class 'kfold_cv'
  # Returns:
  # The binary levels for the factor.

  # Extract levels of dependent variable
  out = x$Fold_1$levels

  return( out )
}

# 1.3)
#' @rdname kfold_cv
#' @export

dimnames.kfold_cv = function( x ) {
  # Purpose:
  # Extracts the column labels for the matrix
  # of predictors.
  # Arguments:
  # x - An R object of class 'kfold_cv'
  # Returns:
  # A vector with the column labels for the
  # predictors.

  # Extract labels for predictors
  out = dimnames( x$Fold_1 )

  return( out )
}

# 1.4)
#' @rdname kfold_cv
#' @export

max.kfold_cv = function( x, train = FALSE, metric = 'AUC', na.rm = FALSE ) {
  # Purpose:
  # Extracts the fold with the best fit metrics based on
  # either the test or training subset.
  # Arguments:
  # x      - An R object of class 'kfold_cv'
  # train  - Logical; if TRUE, finds best result
  #          based on the training data
  # metric - The metric to use
  # na.rm  - For compatibility with generic method
  # Returns:
  # A character string with the fold name.

  # Only metrics that yield a single value per fold are supported;
  # validate before extracting anything
  single_metrics = c(
    'TPR', 'FPR', 'AUC', 'd_prime',
    'criterion', 'CE', 'R', 'Accuracy' )

  if ( !( metric %in% single_metrics ) ) {
    warning( 'Metric must be a single value - defaulting to AUC',
             call. = FALSE )
    metric = 'AUC'
  }

  # Extract the fit metric of choice for each fold
  sel = names(x)[ grep( 'Fold', names( x ) ) ]
  fm = sapply( x[sel], subset, train = train, metric = metric )

  # Metrics for which larger values indicate better performance;
  # all remaining metrics (e.g., FPR, CE) are minimized
  metrics_to_max = c(
    'AUC',
    'd_prime',
    'Accuracy',
    'TPR'
  )

  if ( metric %in% metrics_to_max ) {
    ind = which.max( fm )[1]
  } else {
    ind = which.min( fm )[1]
  }
  out = names( x[sel] )[ind]

  return( out )
}
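
# Example usage (assuming 'cv_glm' from the examples above); AUC-type
# metrics are maximized, while error-type metrics such as CE are minimized:
#   max( cv_glm )                 # Fold with the highest test-set AUC
#   max( cv_glm, metric = 'CE' )  # Fold with the lowest cross-entropy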

# 1.5)
#' @rdname kfold_cv
#' @export

features.kfold_cv = function( x, metric = 'AUC' ) {
  # Purpose:
  # Extracts the column labels for the
  # predictors selected by the model
  # as significant contributors to binary
  # classification.
  # Arguments:
  # x      - An R object of class 'kfold_cv'
  # metric - The metric to use
  # Returns:
  # A vector with the column labels for the
  # selected predictors.

  # Find fold with best fit
  sel = max( x, metric = metric )

  # Extract labels for selected predictors
  out = features( x[[sel]] )

  return( out )
}

# 1.6)
#' @rdname kfold_cv
#' @export

coef.kfold_cv = function( x, int = FALSE, sig = TRUE, metric = 'AUC' ) {
  # Purpose:
  # Extracts the coefficients from the model fit based on the
  # fold with the best fit metric.
  # Arguments:
  # x      - An R object of class 'kfold_cv'
  # int    - Logical; if TRUE, includes
  #          the intercept
  # sig    - Logical; if TRUE, includes only
  #          significant/non-zero values
  # metric - The metric to use
  # Returns:
  # The coefficients from the fitted model.

  # Find fold with best fit
  sel = max( x, metric = metric )
  # Extract coefficients
  out = coef( x[[sel]], int = int, sig = sig )

  return( out )
}

# 1.7)
#' @rdname kfold_cv
#' @export

print.kfold_cv = function( x, digits = 2, metric = 'AUC' ) {
  # Purpose:
  # Displays basic details for an object of
  # class 'kfold_cv'.
  # Arguments:
  # x      - An R object of class 'kfold_cv'
  # digits - The max number of digits when rounding
  # metric - The type of fit metric to report

  # Find fold with best fit
  sel = max( x, metric = metric )
  # Extract fit metric
  fm = subset( x[[sel]], train = FALSE, metric = metric )

  string = paste( 'Estimation type:', x[[1]]$type )
  cat( string, '\n' )
  print( round( x$run_time, digits ) )
  cat( '\n' )

  cat( 'Coefficients:', '\n' )
  val = coef( x[[sel]] )
  print( round( val, digits ) )
  cat( '\n' )

  cat( 'Fit:', '\n' )
  val = fm
  string = paste( metric, '=', round( val, digits ) )
  cat( string, '\n' )

}

# 1.8)
#' @rdname kfold_cv
#' @export

summary.kfold_cv = function( x, metric = 'AUC' ) {
  # Purpose:
  # Extracts the coefficients, indicators for significance,
  # the singular metric of choice, and a logical vector
  # indicating the fold with the best predictive fit.
  # Arguments:
  # x      - An R object of class 'kfold_cv'
  # metric - The type of fit metric to report
  # Returns:
  # A data frame.

  # Find fold with best fit
  sel = max( x, metric = metric )
  # Extract predictor labels
  cn = x$Fold_1$predictors

  # Extract number of folds
  folds = names(x)[ grep( 'Fold', names( x ) ) ]
  K = length( folds )

  # Create data frame with results
  out = data.frame(
    Fold = 1:K,
    Best = FALSE,
    Metric = NA,
    Intercept = NA,
    stringsAsFactors = F
  )
  out$Coefficients =
    matrix( 0, nrow( out ), length(cn) )
  out$Significant =
    matrix( 0, nrow( out ), length(cn) )
  colnames( out$Coefficients ) = cn
  colnames( out$Significant ) = cn

  out$Best[ folds %in% sel ] = TRUE

  out$Metric =
    sapply( x[folds], subset, train = FALSE, metric = metric )
  out$Intercept =
    sapply( x[folds], function(x) x$intercept )

  # Loop over folds
  inc = 1
  for ( i in folds ) {

    cf = coef( x[[i]] )
    pos = cn %in% names( cf )
    # Match coefficients by name in case their order differs from 'cn'
    out$Coefficients[inc, pos] = cf[ cn[pos] ]
    out$Significant[inc, pos] = 1
    inc = inc + 1
  }

  if ( x$Fold_1$type == 'glmnet' ) {
    out$lambda =
      sapply( x[folds], function(x) x$best_lambda )
    out$alpha =
      sapply( x[folds], function(x) x$best_alpha )
  }

  return( out )
}
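
# Example usage (assuming 'cv_glmnet' from the examples above):
#   s = summary( cv_glmnet )
#   colMeans( s$Significant )  # How often each predictor was selected
#   s$Metric[ s$Best ]         # Test-set metric for the best-performing fold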