qacReg: Multiple Regression and Logistic Regression

Documented in performance.glm

#' @title Model Performance for Logistic Regression
#' @description
#' Produce indices of model performance for logistic regression
#'
#' @details
#' The model in the \code{glm} object is used to predict the
#' the predicted probability of being a "positive" case for each observation
#' in the data frame. If a data frame is not specified, the model is
#' applied to the training data (data frame contained in the model component
#' of the \code{glm} object).
#' If the predicted probability is greater then \code{p}, the case is
#' classified as "positive", "not positive" otherwise. The actual and
#' predicted classifications are compared to calculate performance
#' metrics.
#'
#' The function returns general information
#' about the model, a confusion matrix and statistics generated by
#' the \code{\link[caret]{confusionMatrix}} function from the \code{caret}
#' package, and a graph visualizing the results.
#'
#' @param x an object of class \code{"glm"}.
#' @param data a data frame. If no data frame is provided, the model training
#' data is used.
#' @param digits integer; number of digits to print (default=4).
#' @param p numeric; probability cutoff for classifying cases (default = 0.5)
#' @param plot logical; If \code{TRUE} the results are plotted.
#' @param ... not currently used
#'
#' @import caret
#' @importFrom stats as.formula binomial predict
#'
#' @return The results of the methods \code{\link{performance.glm}}
#' @export
#' @seealso \code{\link[caret]{confusionMatrix}}, \code{\link{roc_plot}},
#' \code{\link{lift_plot}}.
#' @examples
#' # performance on training sample
#' fit <- regress(caesarian ~ ., data = caesarian)
#' performance(fit, plot=TRUE)
performance.glm <- function(x, data, digits=4, p=0.5, plot=TRUE, ...){

  formula <- stats::as.formula(x$call[[2]])
  dv <- as.factor(x$model[[1]])
  dvname <- names(x$model)[1]
  dvLevels <- levels(dv)

  if(missing(data)){
    data <- eval(x$call[[4]])
    dataname <- as.character(x$call[[4]])
  } else {
    dataname <- as.character(substitute(data))
  }

  prob <- stats::predict(x, data, type="response")
  pred <- factor(ifelse(prob > p, dvLevels[2], dvLevels[1]))
  stats <- caret::confusionMatrix(pred, dv, positive=dvLevels[2])

  cattbl <- round(prop.table(table(x$model[1])), 2)
  catnames <- dimnames(cattbl)[[1]]


  heading("LOGISTIC REGRESSION PERFORMANCE")

  # category frequencies

  cat("Data              :", dataname, "\n")
  cat("N                 :", nrow(x$model), "\n")
  cat("Response variable :", dvname, "\n")
  cat("Category Balance  : ",
      catnames[1], " (", cattbl[1], "%", ") ",
      catnames[2], " (", cattbl[2], "%", ") ", "\n", sep="")
  cat("Predicted category:", stats$positive, "\n")
  cat("Prob to classify  : >=", p, "\n\n", sep="")

  mdl <- as.character(x$call)
  cat("Model: ",
      mdl[1], "(", mdl[2], ",\n",
      "           family = ", mdl[3],
      ", data = ", mdl[4], ")", "\n\n", sep="")

  heading("Confusion Matrix")
  cat("\n")
  names(dimnames(stats$table)) <- c("Predicted", "Actual")
  print(stats$table)
  cat("\n")
  heading("Overall Statistics")
  cat("\n")
  cat("Accuracy:",
      round(stats$overall[1], digits), "\n")
  cat("97% CI  : (",
      round(stats$overall[3], digits),
      ", ", round(stats$overall[4], digits), ")\n", sep="")
  cat("No Information Rate: ",
      round(stats$overall[5], digits), "\n")
  cat("P-Value [Acc > NIR]: ",
      format.pval(stats$overall[6], digits), "\n")
  cat("\n")

  heading("Statistics by Category")
  dfstats <- as.data.frame(stats$byClass, drop=FALSE)
  row.names(dfstats) <- names(stats$byClass)
  row.names(dfstats)[1] <- "Sensitivity"
  row.names(dfstats)[3] <- "Pos Pred Value"
  names(dfstats) <- ""
  print(dfstats[c(1,2,3,4,7),,drop=FALSE], digits=digits)
  cat("---\n")
  cat("Note: recall = sensitivity, \n")
  cat("      precision = pos pred value.\n")

  # plot results
  if(plot){
    print(binaryPlot(stats))
  }


 invisible(stats)
}