R/estimate.en_lr.R

# The 'estimate.en_lr' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-26

# Table of contents
# 1) estimate.en_lr        | *
# 2) bc_set_control.glmnet |

###
### 1) estimate.en_lr
###

estimate.en_lr = function( dat,
                           control = list() ) {
  # Purpose:
  # Applies elastic net logistic regression, using k-fold
  # cross-validation to estimate the optimal parameters for
  # governing the mixing value (for ridge versus lasso
  # regression) and the penalty term (the degree to which
  # parameters are regularized).
  # Arguments:
  # dat     - An R object of class 'train_test'
  # control - An optional list for control
  #           parameters, where...
  #             'alpha' = The mixing proportion between
  #               ridge (0) and
  #               lasso (1) regression.
  #             'lambda' = An optional two-element list:
  #               a sequence of penalty terms to fit, and
  #               the specific penalty term to select
  #               (skips cross-validation).
  #             'nfolds' = The number of folds for the
  #               cross-validation step used to select
  #               the best penalty term for 'glmnet'.
  #             'costf' = The type of cost function to
  #               use for the 'cv.glmnet' function, where
  #               options include 'auc' (area under the curve),
  #               'mae' (mean absolute error), 'class'
  #               (misclassification error), and 'deviance'.
  #             'one_se' = Logical; if TRUE, selects the
  #               penalty term (and, if needed, the mixing
  #               parameter) based on the simplest model
  #               still within one standard error of the model
  #               minimizing the chosen cost function.
  #             'col_sel' = An index for the
  #               subset of predictors
  #               to use.
  #             'prev_fit' = An optional
  #               variable with the 'fit'
  #               element from a previous
  #               call to this function.
  #             'error_rate' = An optional
  #               proportion for selecting
  #               significant variables
  #               (currently unused).
  # Returns:
  # An object of class 'bc_estimate'.
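  #
  # Example usage (a sketch; assumes 'dat' is a 'train_test'
  # object created elsewhere in the package):
  #   est = estimate.en_lr( dat,
  #                         control = list( alpha = .5,
  #                                         nfolds = 10 ) )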

  # Extract control parameters
  alpha = control$alpha
  lambda = control$lambda
  nfolds = control$nfolds
  costf = control$costf
  one_se = control$one_se
  prev_fit = control$prev_fit
  col_sel = control$col_sel
  error_rate = control$error_rate

  # Set default options if necessary
  if ( is.null( alpha ) )
    alpha = .5
  if ( is.null( nfolds ) )
    nfolds = 10
  if ( is.null( costf ) )
    costf = 'deviance'
  if ( is.null( one_se ) )
    one_se = TRUE

  if ( is.null( col_sel ) ) {
    col_sel = seq_along( dimnames( dat ) )
  }

  if ( is.null( error_rate ) )
    error_rate = .05

  # Estimate run time
  start = ert()

  # Summary of process
  # Step 1)
  #   Check whether results from a previous fit should be used
  # Step 2)
  #   Extract data to fit
  # Step 3)
  #   Check whether sufficient predictors are available
  # Step 4)
  #   Check whether a fixed penalty term was provided
  # Step 5)
  #   Select the best penalty term (and, when a grid of
  #   mixing values is given, the best alpha) via
  #   cross-validation
  # Step 1)
  if ( is.null( prev_fit ) ) {
    # Check if results from a previous fit should be used
    # If not...

    # Step 2)
    # Extract data to fit

    # Note that glmnet uses the
    # last level of the factor
    # as the target class
    yf = subset( dat, TRUE, TRUE )
    # Flip levels of factor
    yf = factor( yf,
                 levels = rev( levels( dat ) ) )
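    # (e.g. hypothetical levels c( 'control', 'case' ) become
    # c( 'case', 'control' ), so the original first level is
    # now the last level, i.e. the target class for glmnet)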
    # Matrix of predictors
    cur_X = subset( dat, FALSE, TRUE )

    # Step 3)
    if ( length( col_sel ) > 1 ) {
      # Check if sufficient predictors are to be used

      # Extract predictors to use
      cur_X = cur_X[,col_sel]

      # Step 4)
      # Check whether a fixed penalty term was provided;
      # if so, use it directly and skip cross-validation
      if ( !is.null( lambda ) ) {

        # Fit data using 'glmnet'
        fit = glmnet::glmnet( x = cur_X,
                              y = yf,
                              family = 'binomial',
                              alpha = alpha,
                              lambda = lambda[[1]] )

        # Set best value for mixing parameter
        best_alpha = alpha
        # Set best value for penalty term
        best_pt = lambda[[2]]

        # Add to 'cv.glmnet' object
        fit$best_pt = best_pt
        fit$best_alpha = best_alpha

      } else {

        # Create index indicating which observation
        # belongs to which fold
        i_index = cv_index( nfolds, length( yf ) )
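
        # ('cv_index' is assumed to return an integer vector
        # assigning each of the observations to one of the
        # 'nfolds' folds, e.g. something akin to
        # sample( rep( 1:nfolds, length.out = n ) ))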

        # Step 5)
        #   Determine if best mixing parameter should be selected
        if ( length( alpha ) == 1 ) {
          # If single value for mixing parameter is provided

          # Step 5.1)
          # Determine best value for penalty term

          # Fit data using 'cv.glmnet'
          fit = glmnet::cv.glmnet( x = cur_X,
                                   y = yf,
                                   foldid = i_index,
                                   family = 'binomial',
                                   alpha = alpha,
                                   type.measure = costf )

          # Determine best penalty term
          if ( one_se ) {
            best_pt = fit$lambda.1se
          } else {
            best_pt = fit$lambda.min
          }
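
          # ('lambda.1se' is the largest penalty whose
          # cross-validated cost is within one standard error
          # of the minimum, i.e. the sparsest model that is
          # effectively indistinguishable from the best;
          # 'lambda.min' directly minimizes the cost)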

          # Set best value for mixing parameter
          best_alpha = alpha

          # Add to 'cv.glmnet' object
          fit$best_pt = best_pt
          fit$best_alpha = best_alpha

        } else {

          # Step 5.2)
          # Determine best value for both mixing parameter and penalty term

          # Initialize list to store the fit for
          # each candidate value of alpha
          fit_per_alpha = vector( 'list', length( alpha ) )
          # Matrix for results of cross-validation
          CV_res = matrix( NA, length( alpha ), 4 )
          colnames( CV_res ) = c(
            'lambda',
            'Mean_GOF',
            'GOF_minus_SE',
            'GOF_plus_SE'
          )
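
          # (per value of alpha, the columns hold the selected
          # penalty term, the mean cross-validated cost, and
          # the lower/upper one-standard-error bounds taken
          # from 'cvm', 'cvlo', and 'cvup')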

          # Loop over each value of alpha
          for ( a in 1:length( alpha ) ) {

            # Fit data using 'cv.glmnet'
            fit = glmnet::cv.glmnet( x = cur_X,
                                     y = yf,
                                     foldid = i_index,
                                     family = 'binomial',
                                     alpha = alpha[a],
                                     type.measure = costf )
            # Store results
            fit_per_alpha[[a]] = fit

            # Extract best value for penalty term
            if ( one_se ) {
              CV_res[a,1] = fit$lambda.1se
              sel_l = which( fit$lambda == fit$lambda.1se )
            } else {
              CV_res[a,1] = fit$lambda.min
              sel_l = which( fit$lambda == fit$lambda.min )
            }
            # Store results for goodness of fit
            CV_res[a,2] = fit$cvm[sel_l]
            CV_res[a,3] = fit$cvlo[sel_l]
            CV_res[a,4] = fit$cvup[sel_l]
          }

          # Find alpha that produced best goodness of fit
          if ( costf == 'auc' ) {
            sel = which.max( CV_res[,2] )
          } else {
            sel = which.min( CV_res[,2] )
          }

          # Extract fitting results
          fit = fit_per_alpha[[sel]]
          # Extract best-fitting penalty term
          best_pt = CV_res[sel,1]
          # Extract best-fitting mixing proportion
          best_alpha = alpha[sel]

          # Add to 'cv.glmnet' object
          fit$best_pt = best_pt
          fit$best_alpha = best_alpha

        }

      }

    } else {
      # If insufficient predictors are provided, a standard
      # logistic regression approach is intended; that fallback
      # was left unimplemented, so fail informatively rather
      # than let 'fit' be undefined downstream
      stop( 'Fallback to standard logistic regression not implemented' )

    }

  } else {

    # Set model fit to previous result
    fit = prev_fit
    # Extract values for penalty term and alpha
    best_pt = fit$best_pt
    best_alpha = fit$best_alpha

  }

  # Coefficients
  if ( length( col_sel ) > 1 ) {

    cf = coef( fit, s = best_pt )
    b0 = cf[1,1]

    # Exclude intercept
    rn = rownames( cf )
    cf = cf[ rn %in% dimnames( dat ), 1]

    # Isolate significant coefficients
    # nz = which( sm$CT$p_value_UB < error_rate )
    # Isolate non-zero coefficients
    nz = cf != 0
    cf = cf[nz]
    selected_vars = names( cf )
  }

  # Fit metrics
  gof = fit_metrics( fit, dat, 'glmnet',
                     lambda_val = best_pt )

  # Run time duration
  run_time = ert( start )

  # Create output object of class 'bc_estimate'
  out = list(
    fit = fit,
    run_time = run_time,
    fit_metrics = gof,
    coefficients = cf,
    intercept = b0,
    predictors = dimnames( dat ),
    selected_vars = selected_vars,
    type = 'glmnet',
    levels = levels( dat ),
    # Include additional elements
    # noting best parameters for glmnet
    best_alpha = best_alpha,
    best_lambda = best_pt
  )

  class( out ) = 'bc_estimate'

  return( out )
}

###
### 2) bc_set_control.glmnet
###

bc_set_control.glmnet = function( cv, set_control ) {
  # Purpose:
  # Processes the results of K-fold cross-validation to
  # select features and set control parameters for the
  # 'estimate.en_lr' function.
  # Arguments:
  # cv          - An R object of class 'kfold_cv'
  # set_control - A list, where...
  #                 'cutoff' = The minimum proportion of
  #                   folds in which a predictor must be
  #                   flagged as significant for it to be
  #                   retained (defaults to .05)
  # Returns:
  # A list of control parameters ('col_sel', 'alpha', and
  # 'lambda') to pass to the 'estimate.en_lr' function.

  # Extract summary of results over folds
  sm = summary( cv )

  # Determine which list components are folds
  folds = names( cv )[ grep( 'Fold', names( cv ) ) ]
  # Extract number of folds
  K = length( folds )

  # Determine best value of alpha to use
  # (the value selected most often across folds)
  all_alpha = sapply( cv[folds], function(x) x$best_alpha )
  counts = table( all_alpha )
  alpha = as.numeric( names( counts )[ which.max( counts ) ] )

  # Use the sequence of lambdas from the best-performing fold
  lambda = cv[[ paste( 'Fold', sm$Fold[ sm$Best ], sep = '_' ) ]]$fit$lambda

  # Set default options
  cutoff = set_control$cutoff
  if ( is.null( cutoff ) ) cutoff = .05

  # Determine which predictors are consistently
  # flagged as significant across folds
  sig = colSums( sm$Significant )
  co = floor( K * cutoff )
  sel = names( sig )[ sig > co ]
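  # (e.g. with K = 10 folds and cutoff = .05, the threshold is
  # floor( 10 * .05 ) = 0, so a predictor is kept if it was
  # flagged in at least one fold)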

  control = list(
    col_sel = which( dimnames( cv$Fold_1 ) %in% sel ),
    alpha = alpha,
    lambda = list(
      lambda,
      sm$lambda[sm$Best]
    )
  )

  return( control )
}
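
# Example usage (a sketch; assumes 'cv' is a 'kfold_cv' object
# produced by the package's cross-validation routine, and 'dat'
# a matching 'train_test' object):
#   control = bc_set_control.glmnet( cv,
#                                    set_control = list( cutoff = .05 ) )
#   est = estimate.en_lr( dat, control = control )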