# The 'estimate.en_lr' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-26
# Table of contents
# 1) estimate.en_lr | *
# 2) bc_set_control.glmnet |
###
### 1) estimate.en_lr
###
estimate.en_lr = function( dat,
control = list() ) {
# Purpose:
# Applies elastic net logistic regression, using k-fold
# cross-validation to estimate the optimal values for
# the mixing parameter (governing the balance of ridge
# versus lasso regression) and the penalty term (the
# degree to which coefficients are regularized).
# Arguments:
# dat - An R object of class 'train_test'
# control - An optional list for control
# parameters, where...
# 'alpha' = The mixing proportion
# ridge (0) versus
# lasso (1) regression.
# 'lambda' = An optional list with two elements:
# a sequence of penalty terms to
# fit, and the single penalty term
# to use (when provided, the
# cross-validation step is skipped).
# 'nfolds' = The number of folds for the
# cross-validation step to select
# the best penalty term for 'glmnet'.
# 'costf' = The type of cost function to
# use for the 'cv.glmnet' function, where
# options include 'auc' (area under the curve),
# 'mae' (mean absolute error), 'class'
# (misclassification error), and 'deviance'
# (binomial deviance; the default).
# 'one_se' = Logical; if TRUE, selects the
# penalty term (and if needed, the mixing
# parameter) based on the simplest model
# still within one standard error of the model
# minimizing the chosen cost function
# 'col_sel' = An index for the
# subset of predictors
# to use
# 'prev_fit' = An optional
# variable with the fit
# output from a previous
# call to this function
# 'error_rate' = An optional
# proportion for selecting
# significant variables
# (currently unused)
# Returns:
# An object of class 'bc_estimate'.
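# Example control list (a sketch with illustrative
# values only; all elements are optional):
# control = list(
#   alpha = c( .25, .5, .75 ),
#   nfolds = 10,
#   costf = 'auc',
#   one_se = TRUE
# )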
# Extract control parameters
alpha = control$alpha
lambda = control$lambda
nfolds = control$nfolds
costf = control$costf
one_se = control$one_se
prev_fit = control$prev_fit
col_sel = control$col_sel
error_rate = control$error_rate
# Set default options if necessary
if ( is.null( alpha ) )
alpha = .5
if ( is.null( nfolds ) )
nfolds = 10
if ( is.null( costf ) )
costf = 'deviance'
if ( is.null( one_se ) )
one_se = TRUE
if ( is.null( col_sel ) ) {
col_sel = 1:length( dimnames( dat ) )
}
if ( is.null( error_rate ) )
error_rate = .05
# Record the start time to estimate run time
start = ert()
# Summary of process
# Step 1)
# Check if results from a previous fit should be used
# Step 2)
# Extract data to fit
# Step 3)
# Check if sufficient predictors are provided
# Step 4)
# Fit data using 'glmnet' if penalty terms are
# provided; otherwise...
# Step 5)
# Use cross-validation to select the best penalty
# term (and, if multiple values are given, the best
# mixing parameter)
# Step 1)
if ( is.null( prev_fit ) ) {
# Check if results from a previous fit should be used
# If not...
# Step 2)
# Extract data to fit
# Note that 'glmnet' uses the
# last level of the factor
# as the target class
# Outcome variable
yf = subset( dat, TRUE, TRUE )
# Flip levels of the factor so the original
# first level becomes the target class
yf = factor( yf,
levels = rev( levels( dat ) ) )
# Matrix of predictors
cur_X = subset( dat, FALSE, TRUE )
# Step 3)
if ( length( col_sel ) > 1 ) {
# Check if sufficient predictors are to be used
# Extract predictors to use
cur_X = cur_X[,col_sel]
# Step 4)
# Check whether penalty terms were already provided;
# if so, the cross-validation step can be skipped
if ( !is.null( lambda ) ) {
# Fit data using 'glmnet'
fit = glmnet::glmnet( x = cur_X,
y = yf,
family = 'binomial',
alpha = alpha,
lambda = lambda[[1]] )
# Set best value for mixing parameter
best_alpha = alpha
# Set best value for penalty term
best_pt = lambda[[2]]
# Add to 'glmnet' object
fit$best_pt = best_pt
fit$best_alpha = best_alpha
} else {
# Create index indicating which observation
# belongs to which fold
i_index = cv_index( nfolds, length( yf ) )
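# ('cv_index' is assumed to be a package helper that
# returns an integer vector of length( yf ) assigning
# each observation to one of the 'nfolds' folds, as
# required by the 'foldid' argument of 'cv.glmnet')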
# Step 5)
# Determine if best mixing parameter should be selected
if ( length( alpha ) == 1 ) {
# If single value for mixing parameter is provided
# Step 5.1)
# Determine best value for penalty term
# Fit data using 'cv.glmnet'
fit = glmnet::cv.glmnet( x = cur_X,
y = yf,
foldid = i_index,
family = 'binomial',
alpha = alpha,
type.measure = costf )
# Determine best penalty term
if ( one_se ) {
best_pt = fit$lambda.1se
} else {
best_pt = fit$lambda.min
}
# Set best value for mixing parameter
best_alpha = alpha
# Add to 'cv.glmnet' object
fit$best_pt = best_pt
fit$best_alpha = best_alpha
} else {
# Step 5.2)
# Determine best value for both mixing parameter and penalty term
# Initialize list to store the fit for each mixing parameter
fit_per_alpha = vector( 'list', length( alpha ) )
# Matrix for results of cross-validation
CV_res = matrix( NA, length( alpha ), 4 )
colnames( CV_res ) = c(
'lambda',
'Mean_GOF',
'GOF_minus_SE',
'GOF_plus_SE'
)
# Loop over each value of alpha
for ( a in 1:length( alpha ) ) {
# Fit data using 'cv.glmnet'
fit = glmnet::cv.glmnet( x = cur_X,
y = yf,
foldid = i_index,
family = 'binomial',
alpha = alpha[a],
type.measure = costf )
# Store results
fit_per_alpha[[a]] = fit
# Extract best value for penalty term
if ( one_se ) {
CV_res[a,1] = fit$lambda.1se
sel_l = which( fit$lambda == fit$lambda.1se )
} else {
CV_res[a,1] = fit$lambda.min
sel_l = which( fit$lambda == fit$lambda.min )
}
# Store results for goodness of fit
CV_res[a,2] = fit$cvm[sel_l]
CV_res[a,3] = fit$cvlo[sel_l]
CV_res[a,4] = fit$cvup[sel_l]
}
# Find alpha that produced best goodness of fit
if ( costf == 'auc' ) {
sel = which.max( CV_res[,2] )
} else {
sel = which.min( CV_res[,2] )
}
# Extract fitting results
fit = fit_per_alpha[[sel]]
# Extract best-fitting penalty term
best_pt = CV_res[sel,1]
# Extract best-fitting mixing proportion
best_alpha = alpha[sel]
# Add to 'cv.glmnet' object
fit$best_pt = best_pt
fit$best_alpha = best_alpha
}
}
} else {
# If too few predictors are provided, fall back to a
# standard logistic regression approach (left
# unimplemented in this branch)
}
} else {
# Set model fit to previous result
fit = prev_fit
# Extract values for penalty term and alpha
best_pt = fit$best_pt
best_alpha = fit$best_alpha
}
# Coefficients
if ( length( col_sel ) > 1 ) {
cf = coef( fit, s = best_pt )
# Intercept
b0 = cf[1,1]
# Exclude intercept
rn = rownames( cf )
cf = cf[ rn %in% dimnames( dat ), 1]
# Isolate significant coefficients
# nz = which( sm$CT$p_value_UB < error_rate )
# Isolate non-zero coefficients
nz = cf != 0
cf = cf[nz]
selected_vars = names( cf )
}
# Fit metrics
gof = fit_metrics( fit, dat, 'glmnet',
lambda_val = best_pt )
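# ('fit_metrics' is assumed to be a package helper that
# computes goodness-of-fit measures for the final model)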
# Run time duration
run_time = ert( start )
# Assemble output object of class 'bc_estimate'
out = list(
fit = fit,
run_time = run_time,
fit_metrics = gof,
coefficients = cf,
intercept = b0,
predictors = dimnames( dat ),
selected_vars = selected_vars,
type = 'glmnet',
levels = levels( dat ),
# Include additional elements
# noting best parameters for glmnet
best_alpha = best_alpha,
best_lambda = best_pt
)
class( out ) = 'bc_estimate'
return( out )
}
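# Example usage (a sketch; assumes 'dat' is an object of
# class 'train_test' created elsewhere in the package):
# est = estimate.en_lr( dat,
#   control = list( alpha = seq( .1, .9, .2 ) ) )
# est$best_alpha    # Best mixing proportion
# est$best_lambda   # Best penalty term
# est$selected_vars # Predictors with non-zero coefficients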
###
### 2) bc_set_control.glmnet
###
bc_set_control.glmnet = function( cv, set_control ) {
# Purpose:
# Uses the results of K-fold cross-validation to set
# the control parameters (subset of predictors, mixing
# proportion, and penalty terms) for a subsequent call
# to the 'estimate.en_lr' function.
# Arguments:
# cv - An R object of class 'kfold_cv'
# set_control - A list, where...
# 'cutoff' = The proportion of folds used to
# determine the minimum number of
# folds in which a predictor must
# be flagged as significant for it
# to be retained (default of .05)
# Returns:
# A list of control parameters ('col_sel', 'alpha',
# and 'lambda') to pass to the 'estimate.en_lr'
# function.
# Extract summary of the cross-validation results
sm = summary( cv )
# Determine which list components are folds
folds = grep( 'Fold', names( cv ), value = TRUE )
# Extract number of folds
K = length( folds )
# Determine best value of alpha to use
all_alpha = sapply( cv[folds], function(x) x$best_alpha )
counts = table( all_alpha )
alpha = as.numeric( names( counts )[ which.max( counts ) ] )
# Specify range of lambdas to use
lambda = cv[[ paste( 'Fold', sm$Fold[ sm$Best ], sep = '_' ) ]]$fit$lambda
# Set default options
cutoff = set_control$cutoff
if ( is.null( cutoff ) ) cutoff = .05
# Determine which predictors are consistently
# flagged as significant across folds
sig = colSums( sm$Significant )
co = floor( K * cutoff )
sel = names( sig )[ sig > co ]
control = list(
col_sel = which( dimnames( cv$Fold_1 ) %in% sel ),
alpha = alpha,
lambda = list(
lambda,
sm$lambda[sm$Best]
)
)
return( control )
}
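# Example usage (a sketch; assumes 'cv' is an object of
# class 'kfold_cv' from the package's cross-validation
# routine):
# control = bc_set_control.glmnet( cv,
#   set_control = list( cutoff = .1 ) )
# est = estimate.en_lr( dat, control = control )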