R/fit_metrics.R

# The 'fit_metrics' function
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2018-10-06

# Table of contents
# 1) fit_metrics                  |
#   1.1) is.fit_metrics           | tested
#   1.2) subset.fit_metrics       | tested
#   1.3) print.fit_metrics        | tested
#   1.4) residuals.fit_metrics    | tested

###
### 1) fit_metrics
###

#' Compute Fit Metrics for Binary Classification
#'
#' Given output following model fitting (e.g., logistic
#' regression), computes an assortment of fit metrics
#' for the training and test subsets of the data.
#'
#' @param fit Model fit output (e.g., output from
#'   \code{\link[stats]{glm}}).
#' @param dat An R object of class 'train_test'.
#' @param algorithm The type of fitting algorithm. Options
#'   include \code{glm} and \code{glmnet}.
#' @param offset_info An optional named list with a vector of
#'   offset values associated with each row of the matrix of
#'   predictors (for 'train' and 'test', respectively)
#'   used when fitting data with \code{glm}.
#' @param lambda_val An optional value indicating the best-fitting
#'   penalty term found when fitting data using \code{glmnet}.
#' @param cutoff An optional numeric value coding how the
#'   probabilities returned from the \code{predict}
#'   function with \code{glm} output should be
#'   transformed into binary values, where...
#'   \itemize{
#'     \item 0 = cut-off of 0.5;
#'     \item 1 = cut-off set to mean proportion in data;
#'     \item 0 < cutoff < 1 = cutoff set to specified value;
#'     \item 2 = binary values generated via a binomial distribution.
#'   }
#'
#' @details The function computes several metrics:
#' \itemize{
#'   \item \code{TPR} - the true positive rate, the ratio of
#'     hits against the number of positive trials;
#'   \item \code{FPR} - the false positive rate, the ratio of
#'     false alarms against the number of negative trials;
#'   \item \code{AUC} - the area under the curve, estimated by
#'     numerical integration after tracing out the curve by
#'     computing the \code{FPR} versus \code{TPR} associated
#'     with each predicted probability;
#'   \item \code{d_prime} - a measure of discriminability;
#'   \item \code{criterion} - a measure of bias, where positive values
#'     denote greater bias against selecting a positive trial;
#'   \item \code{CE} - The mean cross-entropy;
#'   \item \code{R} - Pearson's R computed from the confusion matrix;
#'   \item \code{Accuracy} - Predictive accuracy, the proportion of
#'     cases the model predicted correctly;
#'   \item \code{CM} - The confusion matrix, the frequencies
#'     of predicted positive/negative instances against the observed
#'     frequencies;
#'   \item \code{AUC_curve} - a data frame with the cut-offs based on
#'     the unique predicted probabilities and the associated
#'     true and false positive rates. Allows plotting of the AUC
#'     curve;
#'   \item \code{theta} - the predicted probability of a positive
#'     outcome for each observation in the subset of data;
#'   \item \code{residuals} - the difference between the observed outcome
#'     and the predicted probability.
#' }
#' The \code{subset} method allows a specified metric to be extracted
#' for either the training (\code{train == TRUE}) or test subsets. The
#' \code{residuals} method extracts the residuals.
#'
#' @return An object of class 'fit_metrics': a list with two
#'   elements, 'train' and 'test', each providing the set of
#'   metrics for the corresponding data subset.
#'
#' @export
#' @examples
#' # Simulate data
#' sim = bc_simulate( 300, 4, 2 )
#' # Create 'train_test' object
#' index = cv_index( 3, 300 )
#' dat = train_test( 3, index, sim$y, sim$X )
#' # Extract training data as data frame
#' train = as.data.frame( dat, train = T )
#' # Fit data
#' fit = glm( y ~ P1 + P2 + P3 + P4, family = 'binomial', data = train )
#' # Compute metrics
#' fm = fit_metrics( fit, dat )
#' fm
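#' # Extract a single metric for the test subset
#' subset( fm, train = FALSE, metric = 'AUC' )
#' # Extract residuals for the test subset
#' r = residuals( fm )
#' # Use a cut-off based on the mean proportion of positive cases
#' fm2 = fit_metrics( fit, dat, cutoff = 1 )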

fit_metrics = function( fit, dat,
                        algorithm = 'glm',
                        offset_info = NULL,
                        lambda_val = NULL,
                        cutoff = NULL ) {

  # Initialize output
  output = list(
    train = list(),
    test = list()
  )

  # Set default value for cutoff
  if ( is.null( cutoff ) )
    cutoff = 0

  # Determine whether both training and test subsets are
  # present (zero-length elements in the test subset
  # indicate only training data was provided)
  dmn = sapply( dat,
                function(x) lapply( x, length )
  )
  if ( any( dmn[,2] == 0 ) ) {
    L = 1
  } else L = 2

  # Error messages for incorrect input
  if ( L < 2 ) {
    string = "Data input needs both training and test subsets"
    stop( string, call. = FALSE )
  }
  # Only 'glm' and 'glmnet' are currently implemented
  allowed_algorithms = c( 'glm', 'glmnet' )
  if ( !( algorithm %in% allowed_algorithms ) ) {
    aa = length( allowed_algorithms )
    string = paste(
      "The option 'algorithm' should be set to either: ",
      paste( allowed_algorithms[-aa], collapse = ', ' ),
      ', or ', allowed_algorithms[aa], sep = '' )
    stop( string, call. = FALSE )
  }

  # Extract levels
  levs = levels( dat )

  # Loop over training and test sets
  for ( i in 1:L ) {

    # Indicate if metrics are for training or
    # test subset
    if ( i == 1 ) {
      out = 'train'
      cur_y = subset( dat, T, T )
      cur_X = subset( dat, F, T )
      y_obs = as.integer( dat, train = T )
    } else {
      out = 'test'
      cur_y = subset( dat, T, F )
      cur_X = subset( dat, F, F )
      y_obs = as.integer( dat, train = F )
    }

    ### Standard logistic regression ###
    if ( algorithm == 'glm' ) {

      # Extract either training or
      # test data
      if ( out == 'train' ) {
        df = as.data.frame( dat, train = T )
      }
      if ( out == 'test' ) {
        df = as.data.frame( dat, train = F )
      }

      # Sample size
      n = nrow( df )

      # Initialize comparison vectors
      y_hat = rep( 0, n )

      # Generate predictions from logistic regression model
      if ( is.null( offset_info ) ) {
        pred = predict( fit, newdata = df )
      } else {
        offset_fit = glm( y ~ 0,
                          family = 'binomial',
                          data = df,
                          offset = offset_info[[out]]
        )
        pred = predict( offset_fit )
      }

      # Convert to probability
      theta = 1/( 1 + exp(-pred) )
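      # (equivalently, theta = plogis( pred ))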

      # Convert to binary values

      # Cut-off of .5
      if ( cutoff == 0 ) {
        co = .5
        y_hat = as.numeric( theta > co )
      }
      # Cut-off based on mean proportion
      if ( cutoff == 1 ) {
        co = mean( y_obs )
        y_hat = as.numeric( theta > co )
      }
      # Numeric cut-off provided
      if ( cutoff > 0 & cutoff < 1 ) {
        co = cutoff
        y_hat = as.numeric( theta > co )
      }
      # Generate values from a binomial distribution
      if ( cutoff == 2 ) {
        y_hat = rbinom( n, 1, theta )
      }

      # Confusion matrix
      pred_lbl = rep( levs[2], length( y_hat ) )
      pred_lbl[ y_hat == 1 ] = levs[1]
      # Convert to factors so both levels are retained
      pred_lbl = factor( pred_lbl,
                         levels = levs )
      CM = table(
        Predicted = pred_lbl,
        Observed = cur_y
      )

    }

    ### Elastic net logistic regression ###
    if ( algorithm == 'glmnet' ) {

      if ( out == 'train' ) {
        trn = T
      } else {
        trn = F
      }

      # Generate predictions from model
      if ( is.null( lambda_val ) ) {
        lambda_val = mean( fit$lambda )
      }
      newx = subset( dat, F, train = trn )
      col_sel = rownames( coef( fit ) )[-1]
      newx = newx[,col_sel]
      # Extract predicted values
      pred = predict( fit,
                      s = lambda_val,
                      newx = newx,
                      type = 'class' )[,1]
      theta = predict( fit,
                       s = lambda_val,
                       newx = newx,
                       type = 'response' )[,1]
      y_hat = as.integer( pred == levs[1] )

      # Convert to factors
      pred = factor( pred,
                     levels = levs )

      # Confusion matrix
      CM = table(
        Predicted = pred,
        Observed = cur_y
      )

    }

    # Check dimensions of confusion matrix
    dmn = dim( CM )
    if ( dmn[1] < 2 ) {
      # Adjust table to have zeros
      tmp = rbind( CM, 0 )
      rownames( tmp ) = c( levs[ levs %in% rownames(CM) ],
                           levs[ !( levs %in% rownames(CM) ) ] )
      nms = dimnames( tmp )
      names( nms ) = c( 'Predicted', 'Observed' )
      dimnames( tmp ) = nms
      CM = tmp
    }
    if ( dmn[2] < 2 ) {

      # Adjust table to have zeros
      tmp = cbind( CM, 0 )
      colnames( tmp ) = c( levs[ levs %in% colnames(CM) ],
                           levs[ !( levs %in% colnames(CM) ) ] )
      nms = dimnames( tmp )
      names( nms ) = c( 'Predicted', 'Observed' )
      dimnames( tmp ) = nms
      CM = tmp
    }

    # Total positive trials
    N_pos = sum( CM[ , levs[1] ] )
    # Total negative trials
    N_neg = sum( CM[ , levs[2] ] )

    # Measures based on predicted probabilities and
    # observed values

    # Residuals
    residual_val = y_obs - theta

    # Mean binary cross-entropy (penalizes confident
    # misclassifications for both outcome classes)
    CE = mean( -( y_obs*log2( theta ) +
                    (1 - y_obs)*log2( 1 - theta ) ) )

    # Area under the curve
    co = rev( sort( unique( theta ) ) ) # Define cut-off
    co = c( 1, co, 0 )
    # Compute true and false positive rate at
    # each value of cut-off
    AUC_curve = data.frame(
      Cutoff = co,
      TPR = rep( NA, length( co ) ),
      FPR = rep( NA, length( co ) )
    )
    for ( j in 1:length( co ) ) {
      AUC_curve$TPR[j] = sum( theta[ y_obs == 1 ] >= co[j] )/N_pos
      AUC_curve$FPR[j] = sum( theta[ y_obs == 0 ] >= co[j] )/N_neg
    }


    # Compute area using trapezoid approximation
    # for integration
    a = AUC_curve$TPR[ -nrow( AUC_curve ) ]
    b = AUC_curve$TPR[ -1 ]
    h = diff( AUC_curve$FPR )
    AUC = sum( area_trap( a, b, h ) )
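    # ('area_trap' is an internal helper; a sketch is
    # given after this function)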

    # Measures based on confusion matrix

    # Frequencies
    Hits = CM[ levs[1], levs[1] ]
    Misses = CM[ levs[2], levs[1] ]
    False_alarms = CM[ levs[1], levs[2] ]
    Correct_rejections = CM[ levs[2], levs[2] ]

    # True positive rate/sensitivity
    # P( Detection )
    TPR = Hits/N_pos
    # False positive rate
    FPR = False_alarms/N_neg

    # If rates equal 0 or 1 (where qnorm returns -Inf
    # or Inf), adjust using the log-linear correction
    if ( TPR == 0 | FPR == 0 | TPR == 1 | FPR == 1 ) {
      TPR = (Hits+.5)/(N_pos+1)
      FPR = (False_alarms+.5)/(N_neg+1)
    }

    # Obtain estimate of criterion
    crt_est = -.5*( qnorm( TPR ) +
                      qnorm( FPR ) )

    # Obtain estimate of d'
    dp_est = 2*( qnorm( TPR ) + crt_est )
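    # Note: 2*( qnorm( TPR ) + crt_est ) simplifies to
    # qnorm( TPR ) - qnorm( FPR ), the standard formula for d'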

    # Pearson's R

    # Scale to avoid integer overflow
    adj = max( Hits, Correct_rejections, Misses, False_alarms )

    # Numerator
    p1 = (Hits/adj) * (Correct_rejections/adj)
    p2 = (Misses/adj) * (False_alarms/adj)
    # Denominator
    p3 = (Hits + False_alarms)/adj
    p4 = (Correct_rejections + Misses)/adj
    p5 = (Hits + Misses)/adj
    p6 = (False_alarms + Correct_rejections)/adj
    R = ( p1 - p2 ) / sqrt( p3*p4*p5*p6 )
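    # This equals the phi coefficient (Matthews correlation)
    # for the 2 x 2 confusion matrix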

    # Prediction accuracy
    Accuracy = mean( y_hat == y_obs )

    # Output
    output[[out]] = list(
      # Singular values
      TPR = TPR,
      FPR = FPR,
      AUC = AUC,
      d_prime = dp_est,
      criterion = crt_est,
      CE = CE,
      R = R,
      Accuracy = Accuracy,
      # Constructs
      CM = CM,
      AUC_curve = AUC_curve,
      theta = theta,
      residuals = residual_val
    )

  }

  # Set class for output
  class( output ) = 'fit_metrics'

  return( output )
}
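
# Note: 'fit_metrics' calls 'area_trap' to compute the area of
# each trapezoid when integrating the AUC curve. The helper is
# assumed to be defined elsewhere in the package; if it is not,
# a minimal sketch consistent with its usage above (parallel
# sides 'a' and 'b', height 'h') is defined below.

if ( !exists( 'area_trap' ) ) {
  area_trap = function( a, b, h ) {
    # Area of a trapezoid with parallel sides a, b and height h
    h*( a + b )/2
  }
}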

# 1.1)
#' @rdname fit_metrics
#' @export

is.fit_metrics = function( x ) {
  # Purpose:
  # Checks if an object is of class 'fit_metrics'
  # Arguments:
  # x - An R object
  # Returns:
  # TRUE if the object is of class 'fit_metrics',
  # FALSE otherwise.

  return( inherits( x, 'fit_metrics' ) )
}

# 1.2)
#' @rdname fit_metrics
#' @export

subset.fit_metrics = function( x, train = F, metric = 'AUC', ... ) {
  # Purpose:
  # Extracts the specified fit metric based on
  # the model fit to either the training or
  # test data subsets.
  # Arguments:
  # x      - An R object of class 'fit_metrics'
  # train  - Logical; if TRUE, returns the
  #          metric for the training data subset
  # metric - The type of metric to return; valid
  #          options are 'TPR', 'FPR', 'AUC',
  #          'd_prime', 'criterion', 'CE', 'R',
  #          'Accuracy', 'CM', 'AUC_curve',
  #          'theta', and 'residuals'
  # Returns:
  # The specified metric for the data subset.

  # Initialize output
  out = NULL

  # Extract dependent variable
  if ( train ) {
    out = x$train[[metric]]
  } else {
    out = x$test[[metric]]
  }

  return( out )
}

# 1.3)
#' @rdname fit_metrics
#' @export

print.fit_metrics = function( x, digits = 2, ... ) {
  # Purpose:
  # Provides a summary of the (singular) fit metrics
  # over both the training and test data subsets.
  # Arguments:
  # x      - An R object of class 'fit_metrics'
  # digits - Number of digits to round to

  ss = c( 'Training subset',
          'Test subset' )

  for ( i in 1:2 ) {
    string = ss[i]
    cat( string, '\n', '\n' )

    # Extract metric labels
    lbl = names( x[[i]] )
    # Remove confusion matrix
    lbl = lbl[ !(lbl %in% c( 'CM', 'theta', 'residuals', 'AUC_curve' ) ) ]
    # Meaningful labels
    m_lbl = rep( ' ', length( lbl ) )
    m_lbl[ lbl == 'TPR' ] = 'True positive rate'
    m_lbl[ lbl == 'FPR' ] = 'False positive rate'
    m_lbl[ lbl == 'AUC' ] = 'Area under the curve'
    m_lbl[ lbl == 'd_prime' ] = "d'"
    m_lbl[ lbl == 'criterion' ] = 'Bias'
    m_lbl[ lbl == 'CE' ] = 'Mean cross-entropy'
    m_lbl[ lbl == 'R' ] = "Pearson's R"
    m_lbl[ lbl == 'Accuracy' ] = 'Accuracy'

    tbl = data.frame(
      Value = round( unlist( x[[i]][lbl] ), digits ),
      stringsAsFactors = F
    )
    rownames( tbl ) = m_lbl

    print( tbl )
    cat( '\n' )
  }

}

# 1.4)
#' @rdname fit_metrics
#' @export

residuals.fit_metrics = function( object, train = F, ... ) {
  # Purpose:
  # Extracts residuals from the 'fit_metrics'
  # object.
  # Arguments:
  # object - An R object of class 'fit_metrics'
  # train  - Logical; if TRUE, returns the
  #          residuals for the training data subset
  # Returns:
  # The difference between the observed binary outcomes
  # and the predicted probabilities for a logistic
  # regression.

  out = NULL

  if ( train ) {
    out = object$train$residuals
  } else {
    out = object$test$residuals
  }

  return( out )
}