# The 'fit_metrics' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-06
# Table of contents
# 1) fit_metrics |
# 1.1) is.fit_metrics | tested
# 1.4) residualts.fit_metrics | tested
###
### 1) fit_metrics
###
#' Compute Fit Metrics for Binary Classification
#'
#' Given output following model fitting (e.g., logistic
#' regression), computes an assortment of fit metrics
#' for the training and test subsets of the data.
#'
#' @param fit Model fit output (e.g., output from
#' \code{\link[stats]{glm}}).
#' @param dat An R object of class 'train_test'.
#' @param algorithm The type of fitting algorithm; valid
#' options are \code{glm} and \code{glmnet}.
#' @param offset_info An optional named list (with elements
#' 'train' and 'test', respectively) of vectors of offset
#' values associated with each row of the matrix of
#' predictors, used when fitting data with \code{glm}.
#' @param lambda_val An optional value indicating the best-fitting
#' penalty term found when fitting data using \code{glmnet}.
#' @param cutoff An optional numeric value coding how the
#' probabilities returned by the \code{predict} method
#' for \code{glm} output should be converted into
#' binary values, where...
#' \itemize{
#' \item 0 = cut-off of 0.5;
#' \item 1 = cut-off set to mean proportion in data;
#' \item 0 < cutoff < 1 = cutoff set to specified value;
#' \item 2 = binary values generated via a binomial distribution.
#' }
#'
#' @details The function computes several metrics:
#' \itemize{
#' \item \code{TPR} - the true positive rate, the ratio of
#' hits to the number of positive trials;
#' \item \code{FPR} - the false positive rate, the ratio of
#' false alarms to the number of negative trials;
#' \item \code{AUC} - the area under the curve, estimated by
#' numerical integration after tracing out the curve by
#' computing the \code{FPR} versus \code{TPR} associated
#' with each predicted probability;
#' \item \code{d_prime} - a measure of discriminability;
#' \item \code{criterion} - a measure of bias, where positive values
#' denote greater bias against selecting a positive trial;
#' \item \code{CE} - The mean cross-entropy;
#' \item \code{R} - Pearson's R computed from the confusion matrix;
#' \item \code{Accuracy} - Predictive accuracy, the proportion of
#' cases the model predicted correctly;
#' \item \code{CM} - The confusion matrix, the frequencies
#' of predicted positive/negative instances against the observed
#' frequencies;
#' \item \code{AUC_curve} - a data frame with the cut-offs based on
#' the unique predicted probabilities and the associated
#' true and false positive rates. Allows plotting of the AUC
#' curve;
#' \item \code{theta} - the predicted probability of a positive
#' outcome for each observation in the subset of data;
#' \item \code{residuals} - the difference between the observed outcome
#' and the predicted probability;
#' }
#' The \code{subset} method allows a specified metric to be extracted
#' for either the training (\code{train = TRUE}) or test subset. The
#' \code{residuals} method extracts the residuals.
#'
#' @return An object of class 'fit_metrics', a list
#' of two lists providing the set of metrics for
#' the training and test data, respectively.
#'
#' @export
#' @examples
#' # Simulate data
#' sim = bc_simulate( 300, 4, 2 )
#' # Create 'train_test' object
#' index = cv_index( 3, 300 )
#' dat = train_test( 3, index, sim$y, sim$X )
#' # Extract training data as data frame
#' train = as.data.frame( dat, train = T )
#' # Fit data
#' fit = glm( y ~ P1 + P2 + P3 + P4, family = 'binomial', data = train )
#' # Compute metrics
#' fm = fit_metrics( fit, dat )
#' fm
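#' # Extract a single metric for the test subset
#' subset( fm, train = FALSE, metric = 'AUC' )
#' # Extract residuals for the training subset
#' residuals( fm, train = TRUE )
#' # Use the observed base rate as the classification cut-off
#' fm_base_rate = fit_metrics( fit, dat, cutoff = 1 )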
fit_metrics = function( fit, dat,
algorithm = 'glm',
offset_info = NULL,
lambda_val = NULL,
cutoff = NULL ) {
# Initialize output
output = list(
train = list(),
test = list()
)
# Set default value for cutoff
if ( is.null( cutoff ) )
cutoff = 0
# Check that both the training and test
# subsets have non-empty elements
dmn = sapply( dat, function(x) sapply( x, length ) )
if ( any( dmn[,2] == 0 ) ) {
L = 1
} else L = 2
# Error messages for incorrect input
if ( L < 2 ) {
string = "Data input needs both training and test subsets"
stop( string, call. = FALSE )
}
allowed_algorithms = c( 'glm', 'glmnet' )
if ( !( algorithm %in% allowed_algorithms ) ) {
string = paste0(
"The option 'algorithm' should be set to one of: ",
paste( allowed_algorithms, collapse = ', ' ) )
stop( string, call. = FALSE )
}
# Extract levels
levs = levels( dat )
# Loop over training and test sets
for ( i in 1:L ) {
# Indicate if metrics are for training or
# test subset
if ( i == 1 ) {
out = 'train'
cur_y = subset( dat, T, T )
cur_X = subset( dat, F, T )
y_obs = as.integer( dat, train = T )
} else {
out = 'test'
cur_y = subset( dat, T, F )
cur_X = subset( dat, F, F )
y_obs = as.integer( dat, train = F )
}
### Standard logistic regression ###
if ( algorithm == 'glm' ) {
# Extract either training or
# test data
if ( out == 'train' ) {
df = as.data.frame( dat, train = T )
}
if ( out == 'test' ) {
df = as.data.frame( dat, train = F )
}
# Sample size
n = nrow( df )
# Initialize comparison vectors
y_hat = rep( 0, n )
# Generate predictions from logistic regression model
if ( is.null( offset_info ) ) {
pred = predict( fit, newdata = df )
} else {
offset_fit = glm( y ~ 0,
family = 'binomial',
data = df,
offset = offset_info[[out]]
)
pred = predict( offset_fit )
}
# Convert to probability
theta = 1/( 1 + exp(-pred) )
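# (i.e., the inverse logit; predict.glm returns
# values on the link scale by default)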
# Convert to binary values
# Cut-off of .5
if ( cutoff == 0 ) {
co = .5
y_hat = as.numeric( theta > co )
}
# Cut-off based on mean proportion
if ( cutoff == 1 ) {
co = mean( y_obs )
y_hat = as.numeric( theta > co )
}
# Numeric cut-off provided
if ( cutoff > 0 & cutoff < 1 ) {
co = cutoff
y_hat = as.numeric( theta > co )
}
# Generate values from a binomial distribution
if ( cutoff == 2 ) {
y_hat = rbinom( n, 1, theta )
}
# Confusion matrix
pred_lbl = rep( levs[2], length( y_hat ) )
pred_lbl[ y_hat == 1 ] = levs[1]
# Convert to a factor with both levels so the
# table includes unobserved categories
# (levs[1] is treated as the positive class)
pred_lbl = factor( pred_lbl,
levels = levs )
CM = table(
Predicted = pred_lbl,
Observed = cur_y
)
}
### Elastic net logistic regression ###
if ( algorithm == 'glmnet' ) {
if ( out == 'train' ) {
trn = T
} else {
trn = F
}
# Generate predictions from model
if ( is.null( lambda_val ) ) {
lambda_val = mean( fit$lambda )
}
newx = subset( dat, F, train = trn )
col_sel = rownames( coef( fit ) )[-1]
newx = newx[,col_sel]
# Extract predicted values
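# For binomial glmnet fits, type = 'class' returns
# predicted labels while type = 'response' returns
# fitted probabilities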
pred = predict( fit,
s = lambda_val,
newx = newx,
type = 'class' )[,1]
theta = predict( fit,
s = lambda_val,
newx = newx,
type = 'response' )[,1]
y_hat = as.integer( pred == levs[1] )
# Convert to factors
pred = factor( pred,
levels = levs )
# Confusion matrix
CM = table(
Predicted = pred,
Observed = cur_y
)
}
# Check dimensions of confusion matrix
dmn = dim( CM )
if ( dmn[1] < 2 ) {
# Adjust table to have zeros
tmp = rbind( CM, 0 )
rownames( tmp ) = c( levs[ levs %in% rownames(CM) ],
levs[ !( levs %in% rownames(CM) ) ] )
nms = dimnames( tmp )
names( nms ) = c( 'Predicted', 'Observed' )
dimnames( tmp ) = nms
CM = tmp
}
if ( dmn[2] < 2 ) {
# Adjust table to have zeros
tmp = cbind( CM, 0 )
colnames( tmp ) = c( levs[ levs %in% colnames(CM) ],
levs[ !( levs %in% colnames(CM) ) ] )
nms = dimnames( tmp )
names( nms ) = c( 'Predicted', 'Observed' )
dimnames( tmp ) = nms
CM = tmp
}
# Total positive trials
N_pos = sum( CM[ , levs[1] ] )
# Total negative trials
N_neg = sum( CM[ , levs[2] ] )
# Measures based on predicted probabilities and
# observed values
# Residuals
residual_val = y_obs - theta
# Mean binary cross-entropy (both the positive
# and negative outcome terms are included)
CE = mean( -( y_obs*log2( theta ) +
(1 - y_obs)*log2( 1 - theta ) ) )
# Area under the curve
co = rev( sort( unique( theta ) ) ) # Define cut-off
co = c( 1, co, 0 )
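# Bookending with 1 and 0 ensures the traced curve
# runs from (FPR, TPR) = (0, 0) to (1, 1)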
# Compute true and false positive rate at
# each value of cut-off
AUC_curve = data.frame(
Cutoff = co,
TPR = rep( NA, length( co ) ),
FPR = rep( NA, length( co ) )
)
for ( j in 1:length( co ) ) {
AUC_curve$TPR[j] = sum( theta[ y_obs == 1 ] >= co[j] )/N_pos
AUC_curve$FPR[j] = sum( theta[ y_obs == 0 ] >= co[j] )/N_neg
}
# Compute area using trapezoid approximation
# for integration
a = AUC_curve$TPR[ -nrow( AUC_curve ) ]
b = AUC_curve$TPR[ -1 ]
h = diff( AUC_curve$FPR )
AUC = sum( area_trap( a, b, h ) )
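# Each term approximates the area under one segment of
# the curve; 'area_trap' (defined elsewhere in the package)
# is assumed to return the trapezoid area 0.5*(a + b)*h
# for parallel sides a and b with width h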
# Measures based on confusion matrix
# Frequencies
Hits = CM[ levs[1], levs[1] ]
Misses = CM[ levs[2], levs[1] ]
False_alarms = CM[ levs[1], levs[2] ]
Correct_rejections = CM[ levs[2], levs[2] ]
# True positive rate/sensitivity
# P( Detection )
TPR = Hits/N_pos
# False positive rate
FPR = False_alarms/N_neg
# If rates equal 0 or 1, adjust using the
# log-linear correction to avoid infinite
# z-scores
if ( TPR %in% c( 0, 1 ) | FPR %in% c( 0, 1 ) ) {
TPR = (Hits+.5)/(N_pos+1)
FPR = (False_alarms+.5)/(N_neg+1)
}
# Obtain estimate of criterion
crt_est = -.5*( qnorm( TPR ) +
qnorm( FPR ) )
# Obtain estimate of d'
dp_est = 2*( qnorm( TPR ) + crt_est )
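# Because crt_est = -0.5*( qnorm(TPR) + qnorm(FPR) ),
# this simplifies to the standard equal-variance
# d' = qnorm( TPR ) - qnorm( FPR )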
# Pearson's R
# Scale to avoid integer overflow
adj = max( Hits, Correct_rejections, Misses, False_alarms )
# Numerator
p1 = (Hits/adj) * (Correct_rejections/adj)
p2 = (Misses/adj) * (False_alarms/adj)
# Denominator
p3 = (Hits + False_alarms)/adj
p4 = (Correct_rejections + Misses)/adj
p5 = (Hits + Misses)/adj
p6 = (False_alarms + Correct_rejections)/adj
R = ( p1 - p2 ) / sqrt( p3*p4*p5*p6 )
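# The scaling by 'adj' cancels exactly (adj^2 appears in
# both the numerator and the square-rooted denominator),
# so R equals the phi coefficient of the 2 x 2 table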
# Prediction accuracy
Accuracy = mean( y_hat == y_obs )
# Output
output[[out]] = list(
# Singular values
TPR = TPR,
FPR = FPR,
AUC = AUC,
d_prime = dp_est,
criterion = crt_est,
CE = CE,
R = R,
Accuracy = Accuracy,
# Constructs
CM = CM,
AUC_curve = AUC_curve,
theta = theta,
residuals = residual_val
)
}
# Set class for output
class( output ) = 'fit_metrics'
return( output )
}
# 1.1)
#' @rdname fit_metrics
#' @export
is.fit_metrics = function( x ) {
# Purpose:
# Checks if an object is of class 'fit_metrics'
# Arguments:
# x - An R object
# Returns:
# TRUE if the object is of class 'fit_metrics',
# FALSE otherwise.
return( inherits( x, 'fit_metrics' ) )
}
# 1.2)
#' @rdname fit_metrics
#' @export
subset.fit_metrics = function( x, train = F, metric = 'AUC', ... ) {
# Purpose:
# Extracts the specified fit metric based on
# the model fit to either the training or
# test data subsets.
# Arguments:
# x - An R object of class 'fit_metrics'
# train - Logical; if TRUE, returns the
# metric for the training data subset
# metric - The type of metric to return; valid
# options are 'TPR', 'FPR', 'AUC',
# 'd_prime', 'criterion', 'CE', 'R',
# 'Accuracy', 'CM', 'AUC_curve',
# 'theta', and 'residuals'
# Returns:
# The specified metric for the data subset.
# Initialize output
out = NULL
# Extract dependent variable
if ( train ) {
out = x$train[[metric]]
} else {
out = x$test[[metric]]
}
return( out )
}
# 1.3)
#' @rdname fit_metrics
#' @export
print.fit_metrics = function( x, digits = 2, ... ) {
# Purpose:
# Provides a summary of the (singular) fit metrics
# over both the training and test data subsets.
# Arguments:
# x - An R object of class 'fit_metrics'
# digits - Number of digits to round to
ss = c( 'Training subset',
'Test subset' )
for ( i in 1:2 ) {
string = ss[i]
cat( string, '\n', '\n' )
# Extract metric labels
lbl = names( x[[i]] )
# Remove confusion matrix
lbl = lbl[ !(lbl %in% c( 'CM', 'theta', 'residuals', 'AUC_curve' ) ) ]
# Meaningful labels
m_lbl = rep( ' ', length( lbl ) )
m_lbl[ lbl == 'TPR' ] = 'True positive rate'
m_lbl[ lbl == 'FPR' ] = 'False positive rate'
m_lbl[ lbl == 'AUC' ] = 'Area under the curve'
m_lbl[ lbl == 'd_prime' ] = "d'"
m_lbl[ lbl == 'criterion' ] = 'Bias'
m_lbl[ lbl == 'CE' ] = 'Mean cross-entropy'
m_lbl[ lbl == 'R' ] = "Pearson's R"
m_lbl[ lbl == 'Accuracy' ] = 'Accuracy'
tbl = data.frame(
Value = round( unlist( x[[i]][lbl] ), digits ),
stringsAsFactors = F
)
rownames( tbl ) = m_lbl
print( tbl )
cat( '\n' )
}
}
# 1.4)
#' @rdname fit_metrics
#' @export
residuals.fit_metrics = function( object, train = F, ... ) {
# Purpose:
# Extracts residuals from the 'fit_metrics'
# object.
# Arguments:
# object - An R object of class 'fit_metrics'
# train - Logical; if TRUE, returns the
# residuals for the training data subset
# Returns:
# The difference between the observed binary
# outcomes and the predicted probabilities.
out = NULL
if ( train ) {
out = object$train$residuals
} else {
out = object$test$residuals
}
return( out )
}