R/logitModel.R

Defines functions pairs.logitModel plot.logitModel print.summary.logitModel print.logitModel summary.logitModel newtonRaphson logitModel

Documented in logitModel newtonRaphson pairs.logitModel plot.logitModel print.logitModel print.summary.logitModel summary.logitModel

#' Alternative Logistic Regression Implementation
#'
#' \code{logitModel} computes a logistic regression for a binary response
#'   variable via \code{\link{newtonRaphson}} estimation method.
#'
#' @param formula an object of class "formula".
#' @param data an optional data frame or coercible to one. If empty, the
#' variables are taken from the Global Environment.
#'
#' @param precision degree of precision of optimisation algorithm.
#' @param iterMax maximum number of iterations of optimisation algorithm.
#'
#' @return \code{logitModel} returns an object of \code{\link[base]{class}}
#'     \emph{logitModel}, which includes the coefficients, fitted vlaues,
#'     variance-covariance matrix, residuals, degree of freedom, loglikelihood
#'     value and deviance (these last three values also from Null model), and
#'     AIC. A few extra extra inforamtion such as the formula, the function
#'     call, the response variable and the design matrix, as well as the weight
#'     matrix and the number of iterations needed for convergence.
#'
#' @examples
#' set.seed(42)
#' X1 <- rnorm(100);    X2 <- rnorm(100);
#' points <- 2 * X1 - 3 * X2
#' y <- rbinom(100, 1, exp(points) / (1 + exp(points)))
#' X <- cbind(b1 = 1, b2 = X1, b3 = X2)
#' fit <- logitModel(y ~ X1 + X2)
#' @export
logitModel <- function(formula, data, precision = 1e-10, iterMax = 25) {

  ############################################################################
  # Initialisation                                                           #
  ############################################################################

  # generate model.frame. If "data" missing, search variables in Parent Env
  if(missing(data)) {
    modelFrame <- model.frame(formula, data = parent.frame())
  } else {
    modelFrame <- model.frame(formula, as.data.frame(data))
  }

  # Extract design matrix (X) and response var (y)
  X <- model.matrix(formula, modelFrame)
  y <- model.response(modelFrame)

  # make sure response variable is coded with 0s and 1s
  if (!(0 %in% y && 1 %in% y)) {
    y <- factor(y, labels = c(0, 1))
  }
  y <- as.numeric(as.character(y))

  # sanity check
  if (length(unique(y)) != 2) {
    stop("Response variable is expected to be binary")
  }

  ############################################################################
  # Computation                                                              #
  ############################################################################

  # Restricted Model
  XNull <- as.matrix(rep(1, length(y)))
  restricModel <- newtonRaphson(y = y,
                                X = XNull,
                                precision = precision,
                                iterMax = iterMax)

  bNull <- restricModel$beta
  logLikelihoodNull <- (sum((y * XNull %*% bNull) -
                              (log(1 + exp(XNull %*% bNull)))))
  devianceNull <- -2 * logLikelihoodNull
  dfNull <- nrow(X) - 1       # DF Null Model


  # Estimation Model
  MLEstimation <- newtonRaphson(y, X, precision = precision, iterMax = iterMax)

  # Degrees of Freedom
  dfRes <- nrow(X) - ncol(X)

  # Compute covariace matrix (eq 22 from Czepiel, 2002)
  W <- MLEstimation$W
  vcov <- solve(t(X) %*% W %*% X)

  # Compute residual deviance
  eta <- MLEstimation$eta
  s <- ifelse(y == 0, -1, y)
  residuals <- as.numeric(s * sqrt(-2*((y * eta) - (log(1 + exp(eta))))))

  # calculate max value of logLikelihood
  beta <- MLEstimation$beta
  logLikelihood <- sum((y * X %*% beta) - (log(1 + exp(X %*% beta))))

  # Information Criteria
  AIC <- -2 * logLikelihood + 2 * ncol(X)

  # calculate Residual Deviance
  devianceResid <- -2 * logLikelihood

  # Return list of results
  result <- list(coefficients = beta[,],
                 fittedValues = MLEstimation$pi,
                 vcov = vcov,
                 residuals = residuals,
                 dfRes = dfRes,
                 logLikelihood = logLikelihood,
                 devianceResid = devianceResid,
                 AIC = AIC,

                 # NULL Model
                 dfNull = dfNull,
                 devianceNull = devianceNull,
                 logLikelihoodNull = logLikelihoodNull,

                 # Extras
                 formula = formula,
                 call = match.call(),
                 y = y,
                 X = X,
                 W = W,
                 iterCount = MLEstimation$iterCount)

  # Assign Class
  class(result) <- "logitModel"

  result
}


#' Newton-Raphson for Logistic Regression Estimation
#'
#' This function calculates the maximum likelihood for binary logistic
#' regression via the Newton Raphson algorithm.
#'
#' This algorithm is used on the \code{\link{logitModel}} regression estimation
#' function.
#'
#' @param y Matrix/vector containing the response variable.
#' @param X Matrix/vector containing the design matrix (including intercept).
#' @param precision Degree of precision of algorithm.
#' @param iterMax Maximum number of iterations of optimisation algorithm.
#'
#' @return A list with maximum likelihood estimation results for further
#'   computation.
#'
#' @references Agresti, A (2013) Categorical Data Analysis,
#' John Wiley & Sons, 2013, Volume 792 of Wiley Series in Probability
#' and Statistics, ISSN 1940-6517
#'
#' @references Czepiel, S.A. (2002) Maximum Likelihood Estimation of Logistic
#' Regression Models: Theory and Implementation. Available at
#' \href{https://czep.net/stat/mlelr.pdf}{czep.net/stat/mlelr.pdf}. [visited:
#' 25.03.2018]
#'
#' @examples
#' set.seed(42)
#' X1 <- rnorm(100);    X2 <- rnorm(100);
#' points <- 2 * X1 - 3 * X2
#' y <- rbinom(100, 1, exp(points) / (1 + exp(points)))
#' X <- cbind(b1 = 1, b2 = X1, b3 = X2)
#' fit <- newtonRaphson(y = y, X = X)
#'
#' @export
newtonRaphson <- function(y, X, precision = 1e-7, iterMax = 25) {
  #01 initiate variables
  beta <- rep(0, times = ncol(X))
  i <- 0
  convergence <- FALSE

  #02 compute coefficients until convergence or maximum iteration reached
  while (i <= iterMax & convergence == FALSE) {
    # update i
    i <- i + 1

    # calculate probabilities (pi)
    eta <- X %*% beta
    pi <- as.numeric(logist(X %*% beta))

    # init / update Matrix W
    W <- diag(pi * (1 - pi))

    # calculate and update beta (eq. 23 from Czepiel, 2002)
    deltaBeta <- solve(t(X) %*% W %*% X) %*% t(X) %*% (y - pi)
    beta <- beta + deltaBeta

    # check convergence condition
    if (sum(abs(deltaBeta)) < precision) {
      convergence <- TRUE
      break
    }
  }

  if (convergence == FALSE) {
    stop(paste(i, "iterations reached without convergence. Increase iterMax?"))
  }

  #03 return list of results
  return(list(beta = beta,
              pi = pi,
              W = W,
              eta = eta,
              iterCount = i))
}


#' S3 Summary Method for logitModel Objects
#'
#' This internal function defines a \code{\link{summary}} method for an object
#' of logitModel class, where also further statistics are computed for model
#' comparison and for the print.summary method.
#'
#' @param model a logitModel object
#' @param ... methods are required to include (at least) the same arguments
#' as their generic functions.
#'
#' @return a list with more detailed statistics and computations on the
#' estimated model
#'
#' @export
summary.logitModel <- function(model, ...) {

  # Standard Error
  betaSE <- sqrt(diag(model$vcov))
  model$betaSE <- betaSE

  # z-values
  zStat <- model$coefficients / betaSE
  model$zStat <- zStat

  # p-values
  pValue <- 2 * pnorm(-abs(zStat))
  model$pValue <- pValue

  # Extra Statistics
  model$devianceNull <- model$devianceNull
  model$devianceResid <- model$devianceResid
  model$AIC <- (-2 * model$logLikelihood + 2 * ncol(model$X))
  model$BIC <- -2 * model$logLikelihood + ncol(model$X) * log(length(model$y))

  class(model) <- "summary.logitModel"

  model
}


#' S3 Print Method for logitModel Objects
#'
#' This internal function defines a \code{\link{print}} method for an object of
#' logitModel class.
#'
#' @param model a logitModel object
#' @param ... methods are required to include (at least) the same arguments
#' as their generic functions.
#'
#' @return a brief summary of the model results.
#'
#' @export
print.logitModel <- function(model, ...){

  # based on stats:::print.lm()
  cat("Call: ", paste0(deparse(model$call)), fill = TRUE)

  cat("\nCoefficients:\n")

  print.default(format(coef(model),digits = 4L),
                print.gap = 2L, quote = FALSE, right = TRUE)

  # create named vector for printing two columns of summary stats. detail: cols
  # must be of even length for sprintf to properly iterate over the columns
  cols <- c("Obs."          = length(model$y),
            "DF"           = length(model$y) - ncol(model$X),
            "Deviance"     = round(model$devianceResid, 2),
            "AIC"          = model$AIC
  )

  # the following lines is just to make sure the output fits each
  # in its columns, no matter how many characters the variables have
  i2 <- max(nchar(format(cols[seq(1, length(cols), 2)], scientific=FALSE))) + 1
  i4 <- max(nchar(format(cols[seq(2, length(cols), 2)], scientific=FALSE))) + 1

  # concatenate over "four columns", as in: (names_l: 12.123 \t names_r: 45.678)
  cat(sprintf(paste0("\n%-9s:%",i2,".2f\t%-4s:%",i4,".2f"),
              names(cols[seq(1, length(cols), 2)]),    # first (names) col
              cols[seq(1, length(cols), 2)],           # second (number) col
              names(cols[seq(2, length(cols), 2)]),    # third (names) col
              cols[seq(2, length(cols), 2)])           # fourth (numbers) col
  )

  invisible(model)
}


#' S3 Print Method for summary.logitModel Objects
#'
#' This internal function defines a \code{\link{print}} method for an object
#' of \code{\link{summary.logitModel}} class.
#'
#' @param model an object of class "summary.logitModel".
#' @param ... methods are required to include (at least) the same arguments
#' as their generic functions.
#'
#' @return a summary of the model results.
#'
#' @export
print.summary.logitModel <- function(model, ...) {

  cat("Call: ", deparse(model$call), fill = TRUE)

  cat("\nDeviance Residuals:\n")
  print.summaryDefault(summary(model$residuals), digits = 5L)

  cat("\nCoefficients:\n")

  model$coefTable <- cbind("Estimate" = model$coefficients,
                       "Std. error" = model$betaSE,
                       "z value" = model$zStat,
                       "Pr(>|z|)" = model$pValue)

  printCoefmat(model$coefTable, signif.legend = FALSE, digits = 5L,
               print.gap = 2)

  # create named vector for printing two columns of summary stats. detail: cols
  # must be of even length for sprintf to properly iterate over the columns
  cols <- c("Obs."            = length(model$y),
            "DF"              = length(model$y) - ncol(model$X),
            "Resid. Deviance" = round(model$devianceResid, 2),
            "AIC"             = model$AIC,
            "Null Devinance"  = model$devianceNull,
            "BIC"             = model$BIC,
            "Log-Likelihood"  = model$logLikelihood,
            "NR Iterat."      = model$iterCount
  )


  # the following lines is just to make sure the output fits each
  # in its columns, no matter how many characters each variable have
  i2 <- max(nchar(format(cols[seq(1, length(cols), 2)], scientific=FALSE))) + 1
  i4 <- max(nchar(format(cols[seq(2, length(cols), 2)], scientific=FALSE))) + 1

  # concatenate over "four columns", as in: (names_l: 12.123 \t names_r: 45.678)
  cat(sprintf(paste0("\n%-16s:%",i2,".2f\t%-11s:%",i4,".2f"),
              names(cols[seq(1, length(cols), 2)]),    # first (names) col
              cols[seq(1, length(cols), 2)],           # second (number) col
              names(cols[seq(2, length(cols), 2)]),    # third (names) col
              cols[seq(2, length(cols), 2)])           # fourth (numbers) col
  )


  invisible(model)
}


#' S3 Plot Method for logitModel Objects
#'
#' This internal function defines a \code{\link{plot}} method for an object
#' of logitModel class.
#'
#' @param model an object of class logitModel.
#' @param which numeric vector indicating which plot to be plotted.
#' 1: Residuals vs Fitted. 2: Normal Q-Q. 3: Scale Location. 4: Residual vs
#' Leverage
#' @param cookLevels vector indicating levels of Cook's Distance to be drawn in
#' plot 4 (Residual vs Leverage).
#' @param col optional string or numeric vector to indicate colours of data
#' points for response variable. If missing, assigned values for 0
#' and 1 will be c("#483D8BD0", "#B22222D0"), blue and red respectively.
#' @param ask optional boolean wether inteded to plot interactively or not.
#' Default value checks if graphics device are in interactive mode and
#' wheter the user previously defined a matrix plotting layout in order to
#' make a sensible guess about asking for each plot or not.
#' @param ... methods are required to include (at least) the same arguments
#' as their generic functions.
#'
#' @export
plot.logitModel <- function(model, which = 1:4, col, ask,
                            cookLevels = c(.5, 1), ...) {

  # Common Parameters
  oldPar <- par(no.readonly = TRUE)
  on.exit(par(oldPar))
  par(mar = c(2, 0, 0, 0) + 2.2)
  if (missing(ask)) ask <- prod(par("mfcol")) < length(which) && dev.interactive()
  if (missing(col)) col <- c("#483D8BD0", "#B22222D0")
  else if (length(col) < 2) col <- rep(col, 2)

  # PLOT 01
  if (1 %in% which) {
    x <- model$X %*% model$coefficients
    y <- model$residuals
    plot(x = x, y = y,
         main = "Residuals vs Fitted",
         ylab = "Residuals",
         xlab = paste("Predicted Values\n", deparse(model$formula)),
         col = col[1 + model$y], ...)
    abline(a = 0, b = 0, lty = 3, col = "gray")

    # add label of highest absolute residuals
    idx <- order(abs(y), decreasing = T)[1:5]
    text(x[idx], y[idx], model$residuals[idx], labels = idx, cex = .66, pos = 2,
         col = col[model$y[idx] + 1])

    # add smooth curve via LOWESS regression
    lines(lowess(x = x, y = y), col = "red")
  }

  # Plot 02
  if (2 %in% which) {
    if (length(which) > 1 && ask && nextPlot() == "no") {
      return(invisible())
    }
    # save plot as p for extracting (x,y) points for plotting labels
    p <- qqnorm(model$residuals,
                main = "Normal Q-Q",
                ylab = "Std. deviance resid.",
                xlab = paste("Theoretical Quantiles\n", deparse(model$formula)),
                col = col[1 + model$y], ...)
    qqline(model$residuals, lty = 3, col = "gray")

    # add label of highest absolute residuals
    idx <- order(abs(model$residuals), decreasing = TRUE)[1:5]
    text(p$x[idx], p$y[idx],
         model$residuals[idx], labels = idx, cex = .66, pos = 2,
         col = col[model$y[idx] + 1])
  }

  # Plot 03
  if (3 %in% which) {
    if (length(which) > 1 && ask && nextPlot() == "no") {
      return(invisible())
    }

    x <- model$X %*% model$coefficients
    y <- sqrt(abs(model$residuals))
    ylab <- as.expression(substitute(sqrt(abs(x)),
                                     list(x = as.name("Std. Deviance Resid."))))
    plot(x = x, y = y,
         main = "Scale Location",
         ylab = ylab,
         xlab = paste("Predicted Values\n", deparse(model$formula)),
         col = col[1 + model$y], ...)

    # Plot Smoothing via LOWESS regression
    lines(lowess(x = model$X %*% model$coefficients,
                  y = sqrt(abs(model$residuals))), col = "red")

    # add label of highest absolute residuals
    idx <- order(abs(y), decreasing = T)[1:5]
    text(x[idx], y[idx],
         model$residuals[idx], labels = idx, cex = .66, pos = 2,
         col = col[model$y[idx] + 1])

  }

  # Plot 04
  if (4 %in% which) {
    if (length(which) > 1 && ask && nextPlot() == "no") {
      return(invisible())
    }

    leverage <- diag(sqrt(model$W) %*% model$X %*%
                       (solve(t(model$X) %*%model$W %*% model$X)) %*%
                       t(model$X) %*% sqrt(model$W))
    pearsonResidual <- (model$y - model$fittedValues) /
                        sqrt(model$fittedValues*(1 - model$fittedValues))

    plot(x = leverage, y = pearsonResidual,
         main = "Residual vs Leverage",
         ylab = "Std. Pearson resid.",
         xlab = paste("Leverage\n", deparse(model$formula)),
         col = col[1 + model$y],
         ylim = range(pearsonResidual) * 1.1, ...)

    abline(a = 0, b = 0, lty = 3, col = "gray")

    # Plot Smoothing via LOWESS regression
    lines(lowess(x = leverage,
                 y = pearsonResidual), col = "red",
          xlab = "")

    # PLOT COOKS DISTANCE
    rangeLev <- range(leverage)
    modelRank <- ncol(model$X)
    plotLim <- par("usr")
    seqPlot <- seq.int(min(rangeLev[1L], rangeLev[2L]/100),
                       plotLim[2L], length.out = 101)

    for (crit in cookLevels) {
      cl.h <- sqrt(crit * modelRank * (1 - seqPlot)/seqPlot)
      lines(seqPlot, cl.h, lty = 2, col = 2)
      lines(seqPlot, -cl.h, lty = 2, col = 2)
    }


    legend("bottomleft", legend = paste0("Cook's Distance [",
                                        paste(cookLevels, collapse = ", "),"]"),
           lty = 2, col = 2, bty = "n", cex = 0.75, text.col = "#000000AA")


    # add label of highest absolute residuals
    idx <- order(abs(model$residuals), decreasing = T)[1:5]
    text(leverage[idx], pearsonResidual[idx],
         model$residuals[idx], labels = idx, cex = .66, pos = 2,
         col = col[model$y[idx] + 1])
  }
}


#' S3 Pairs Method for logitModel Objects
#'
#' This internal function defines a \code{\link{pairs}} method for an object
#' of logitModel class. The main idea is to plot all explanatory variables
#' agaist the response variable of the model in order to quickly assess the
#' influence of each one.
#'
#' @param model an object of class logitModel.
#' @param nRow optional numeric vector indicating the number of rows the overall
#' plot should have.
#' @param single boolean to indicate whether each variable should be plotted
#' against the response variable individually.
#'
#' @examples
#' set.seed(42)
#' n <- 100
#' dummies <- sample(LETTERS[1:5], n, replace = TRUE)
#' X1 <- rnorm(n); X2 <- rnorm(n)
#' points <- 2 * X1 - 3 * X2
#' y <- rbinom(n, 1, exp(points) / (1 + exp(points)))
#' df <- as.data.frame(cbind(b2 = X1, b3 = X2))
#' myFitD <- logitModel(y ~ X1 + X2 + dummies)
#' pairs(myFitD)
#' pairs(myFitD, single = TRUE)
#' @export
pairs.logitModel <- function(model, nRow, single = FALSE,
                           col, ask,  ...) {

  # INIT
  if (missing(col)) col <- c("#483D8BD0", "#B22222D0")
  if (missing(ask)) ask <- prod(par("mfcol")) < length(which) && dev.interactive()
  intercept <- attr(terms(model$formula), "intercept")
  if (intercept == 1) {
    X <- as.data.frame(model$X)[-1]
    betas <- model$coefficients[-1]
  } else {
    X <- as.data.frame(model$X)
    betas <- model$coefficients
  }

  # GRAPHICS PARAMETERS
  # keep previous parameters
  oldPar <- par(no.readonly = TRUE)
  on.exit(par(oldPar))

  # get a (more or less) squared layout if missing nRow arg.
  if (missing(nRow)) nRow <- ceiling(sqrt(length(model$coefficients)))
  parCol <- ceiling(length(X) / nRow)
  parRow <- min(nRow, length(X))

  if (isTRUE(!single)) {
    par(mfrow = c(parRow, parCol), mar = c(0, 0, 0, -0.5) + 1, cex = 0.9,
        cex.axis = 0.8, tcl = -0.15, oma = c(0, 0, 0, 0), mgp = c(2, 0, 0),
        font.main = 1, cex.main = 0.9)
  }

  # plot each
  for (i in seq_along(X)) {
    main <- paste0(as.character(model$formula[2]), " ~ ", names(X)[i])

    plot(X[, i], jitter(model$y, 1/10),
         xlab = "", ylab = "", main = main,
         col = col[model$y + 1], ...)

    # col=rgb(0, 0, 0, 0.4))
    curve(logist(intercept + (betas[i] * x)),
          lwd=2, lty=3, col = "gray", add = TRUE)

    # points
    points(X[, i], logist(intercept + betas[i] * X[, i]),
           col = col[model$y + 1])

    if (isTRUE(single && ask)) {
      prompt <- paste0("Plot ", i, "/", ncol(X),
                       ": Enter anything to continue or [q] to quit: ")
      UInput <- readline(prompt=prompt)
      if (UInput %in% c("q", "Q")) return(invisible())
    }
  }
}
avila/logitModel documentation built on Dec. 19, 2021, 6:34 a.m.