Calibration Functions

Documented in calibrate calibrate.default calibrate.formula calibrate.lm

#' Calibration for the simple linear regression model
#' 
#' The function \code{calibrate} computes the maximum likelihood estimate and a
#' condfidence interval for the unknown predictor value that corresponds to an 
#' observed value of the response (or vector thereof) or specified value of the 
#' mean response. See the reference listed below for more details.
#'  
#' @param object A matrix, list, data frame, or object that inherits from class 
#' \code{\link[stats]{lm}}.
#' 
#' @param formula A formula of the form \code{y ~ x}.
#' 
#' @param data an optional data frame, list or environment (or object coercible 
#' by \code{as.data.frame} to a data frame) containing the variables in the 
#' model. If not found in data, the variables are taken from 
#' \code{environment(formula)}, typically the environment from which 
#' \code{\link[stats]{lm}} was called. 
#' 
#' @param subset An optional vector specifying a subset of observations to be 
#' used in the fitting process.
#' 
#' @param na.action a function which indicates what should happen when the data 
#' contain \code{NA}s. 
#' 
#' @param y0 The value of the observed response(s) or specified value of the
#' mean response.
#' 
#' @param interval The method to use for forming a confidence interval.
#' 
#' @param level A numeric scalar between 0 and 1 giving the confidence level for 
#' the interval to be calculated. 
#' 
#' @param mean.response Logicial indicating whether confidence intervals should 
#' correspond to an observed response(s) (\code{FALSE}) or a specified value of 
#' the mean response (\code{TRUE}). Default is \code{FALSE}.
#' 
#' @param adjust A logical value indicating if an adjustment should be made to
#' the critical value used in constructing the confidence interval. This useful 
#' when the calibration curve is to be used k > 0 times.
#' 
#' @param k The number of times the calibration curve is to be used for 
#' computing a confidence interval. Only needed when \code{adjust = TRUE}.
#' 
#' @param ... Additional optional arguments. At present, no optional arguments 
#' are used.
#'            
#' @return An object of class \code{"invest"} containing the following 
#'         components:
#' \itemize{
#'   \item \code{estimate} The estimate of x0.
#'   \item \code{lwr} The lower confidence limit for x0.
#'   \item \code{upr} The upper confidence limit for x0.
#'   \item \code{se} An estimate of the standard error (Wald interval only).
#'   \item \code{interval} The method used for calculating \code{lower} and 
#'                   \code{upper} (only used by \code{print} method).
#' }
#' 
#' @references 
#' Graybill, F. A., and Iyer, H. K. (1994)
#' \emph{Regression analysis: Concepts and Applications}. Duxbury Press.
#' 
#' Miller, R. G. (1981)
#' \emph{Simultaneous Statistical Inference}. Springer-Verlag.
#' 
#' @rdname calibrate
#' 
#' @aliases print.calibrate
#' 
#' @export
#'
#' @note The \code{\link{invest}} function is more general, but is based on 
#' numerical techniques to find the solution. When the underlying model is that 
#' of the simple linear regression model with normal errors, closed-form 
#' expressions exist which are utilized by the \code{calibrate} function.
#' 
#' @examples
#' #
#' # Arsenic example (simple linear regression with replication)
#' #
#' 
#' # Inverting a prediction interval for an individual response
#' arsenic.lm <- stats::lm(measured ~ actual, data = arsenic)
#' plotFit(arsenic.lm, interval = "prediction", shade = TRUE, 
#'         col.pred = "lightblue")
#' (cal <- calibrate(arsenic.lm, y0 = 3, interval = "inversion"))
#' abline(h = 3)
#' segments(cal$estimate, 3, cal$estimate, par()$usr[3])
#' arrows(cal$lower, 3, cal$lower, par()$usr[3])
#' arrows(cal$upper, 3, cal$upper, par()$usr[3])
#' 
#' #
#' # Crystal weight example (simple linear regression)
#' #
#' 
#' # Inverting a confidence interval for the mean response
#' crystal.lm <- stats::lm(weight ~ time, data = crystal)
#' plotFit(crystal.lm, interval = "confidence", shade = TRUE,
#'         col.conf = "lightblue")
#' (cal <- calibrate(crystal.lm, y0 = 8, interval = "inversion", 
#'                   mean.response = TRUE))
#' abline(h = 8)
#' segments(cal$estimate, 8, cal$estimate, par()$usr[3])
#' arrows(cal$lower, 8, cal$lower, par()$usr[3])
#' arrows(cal$upper, 8, cal$upper, par()$usr[3])
#'
#' # Wald interval and approximate standard error based on the delta method
#' calibrate(crystal.lm, y0 = 8, interval = "Wald", mean.response = TRUE)
calibrate <- function(object, ...) {
  UseMethod("calibrate")
}


#' @rdname calibrate
#' 
#' @export
#' 
#' @method calibrate default
calibrate.default <- function(object, y0, 
                              interval = c("inversion", "Wald", "none"), 
                              level = 0.95, mean.response = FALSE, 
                              adjust = c("none", "Bonferroni", "Scheffe"), k, 
                              ...) {

  # Extract needed components from fitted model
  if (inherits(object, "matrix")) {
    x <- object[, 1]
    y <- object[, 2]
  } else if (inherits(object, "data.frame")) {
    object <- data.matrix(object)
    x <- object[, 1]
    y <- object[, 2]
  } else if (inherits(object, "list")) {
    x <- object[[1]]
    y <- object[[2]]
    if (length(x) != length(y)) {
      stop(paste("Components of '", deparse(substitute(object)), 
                 "' not of same length.", sep = ""), call. = FALSE)
    }
  } else {
    stop("'", paste(deparse(substitute(object)), 
                    "' is not a valid matrix, list, or data frame.", sep = ""),
         call. = FALSE)
  }
  eta <- mean(y0)  # mean of new observations
  m <- length(y0)  # number of new observations
  if (mean.response && m > 1) stop("Only one mean response value allowed.")
  
  # Fit a simple linear regression model and compute necessary components
  z <- stats::lm.fit(cbind(1, x), y)
  b <- unname(z$coefficients)
  n <- length(r <- z$residuals)  # sample size and residuals
  DF <- (DF1 <- n - 2) + (DF2 <- m - 1)  # degrees of freedom
  var1 <- sum(r ^ 2) / z$df.residual  # stage 1 variance estimate
  var2 <- if (m == 1) 0 else stats::var(y0)  # stage 2 variance estimate
  var.pooled <- (DF1 * var1 + DF2 * var2) / DF  # pooled estimate of variance
  sigma.pooled <- sqrt(var.pooled)  # sqrt of pooled variance estimate
  ssx <- sum((x - mean(x))^2)  # sum-of-squares for x, Sxx
  x0.mle <- (eta - b[1L])/b[2L]  # MLE of x0
  
  # Return point estimate only
  interval <- match.arg(interval)
  if (interval == "none") return(x0.mle)
  
  # Adjustment for simultaneous intervals
  adjust <- match.arg(adjust)  # FIXME: Does simultaneous work for m > 1?
  crit <- if (m != 1 || adjust == "none") stats::qt((1 + level)/2, n+m-3) else {
    switch(adjust,
          "Bonferroni" = stats::qt((level + 2*k - 1) / (2*k), n+m-3),
          "Scheffe"    = sqrt(k * stats::qf(level, k, n+m-3)))
  }

  # Inversion interval --------------------------------------------------------
  if (interval == "inversion") { 

    # Define constants; see Graybill and Iyer (1994)
    c1 <- b[2L]^2 - (sigma.pooled^2 * crit^2)/ssx
    c2 <- if (mean.response) {
      c1/n + (eta - mean(y))^2/ssx
    } else {
      c1*(1/m + 1/n) + (eta - mean(y))^2/ssx
    }
    c3 <- b[2L] * (eta - mean(y))
    c4 <- crit * sigma.pooled
    
    # Calculate inversion interval
    if (c1 < 0 && c2 <= 0) {
      
      # Throw warning if interval is the real line
      warning("The calibration line is not well determined.", call. = FALSE)
      lwr <- -Inf
      upr <- Inf
      
    } else {
      
      # Construct inversion interval
      lwr <- mean(x) + (c3 - c4*sqrt(c2))/c1
      upr <- mean(x) + (c3 + c4*sqrt(c2))/c1
      
      # Throw error if result is not a single interval
      if (c1 < 0 && c2 > 0) {
        stop(paste("The calibration line is not well determined. The resulting \nconfidence region is the union of two semi-infinite intervals:\n(", -Inf, ",", 
                   round(upr, 4), ") U (", round(lwr, 4), ",", Inf, ")"), 
             call. = FALSE)
      }
      
    }
    
    # Store results in a list
    res <- list("estimate" = x0.mle, 
                "lower"    = lwr, 
                "upper"    = upr, 
                "interval" = interval)
  
  }
    
  # Wald interval  
  if (interval == "Wald") { 
    
    # Compute standard error for Wald interval
    se <- if (mean.response) {
      abs((sigma.pooled/b[2]))*sqrt((1/n + (x0.mle - mean(x))^2/ssx))
    } else {
      abs((sigma.pooled/b[2]))*sqrt((1/m + 1/n + (x0.mle - mean(x))^2/ssx))
    }

    # Store results in a list
    res <- list("estimate" = x0.mle, 
                "lower"    = x0.mle - crit * se,
                "upper"    = x0.mle + crit * se, 
                "se"       = se, 
                "interval" = interval)
    
  } 
  
  # Assign class label and return results
  class(res) <- "invest"
  res
  
} 


#' @rdname calibrate
#' 
#' @export
#' 
#' @method calibrate formula
calibrate.formula <- function(formula, data = NULL, ..., subset, 
                              na.action = stats::na.fail) {
  m <- match.call(expand.dots = FALSE)
  if (is.matrix(eval(m$data, sys.parent()))) {
    m$data <- as.data.frame(data)
  }
  m$... <- NULL
  m[[1]] <- as.name("model.frame")
  m <- eval(m, sys.parent())
  Terms <- attr(m, "terms")
  attr(Terms, "intercept") <- 0
  y <- stats::model.extract(m, "response")
  mm <- stats::model.matrix(Terms, m)
  if (ncol(mm) > 1) {
    stop("This function only works for the simple ",
         "linear regression model (i.e., y ~ x).", call. = FALSE)
  }
  x <- as.numeric(mm)
  calibrate(cbind(x, y), ...)
} 


#' @rdname calibrate
#' 
#' @export
#' 
#' @method calibrate lm
calibrate.lm <- function(object, y0, interval = c("inversion", "Wald", "none"), 
                         level = 0.95, mean.response = FALSE, 
                         adjust = c("none", "Bonferroni", "Scheffe"), k, ...) {
  
  # Check model formula for correctness
  xname <- all.vars(stats::formula(object)[[3L]])
  yname <- all.vars(stats::formula(object)[[2L]])
  if (length(xname) != 1L) {
    stop("Only one independent variable allowed.")
  }
  if (length(yname) != 1L) {
    stop("Only one dependent variable allowed.")
  }
  
  # Check for intercept using terms object from model fit. Alternatively, this
  # can also be checked by testing if the first column name in model.matrix is 
  # equal to "(Intercept)".
  if (!attr(stats::terms(object), "intercept")) {
    stop(paste(deparse(substitute(object)), "must contain an intercept."))
  }
  
  # Extract x values and y values from model frame
  mf <- stats::model.frame(object)
  if (ncol(mf) != 2) {
    stop("calibrate only works for the simple linear regression model.")
  } 
  x <- stats::model.matrix(object)[, 2]
  y <- stats::model.response(mf)

  # Eta - mean response or mean of observed respone values
  eta <- mean(y0)  # mean of new observations
  m <- length(y0)  # number of new observations
  if (mean.response && m > 1) stop("Only one mean response value allowed.")
  
  # Fit a simple linear regression model and compute necessary components
  b <- unname(object$coefficients)
  n <- length(r <- object$residuals)  # sample size and residuals
  DF <- (DF1 <- n - 2) + (DF2 <- m - 1)  # degrees of freedom
  var1 <- sum(r ^ 2) / object$df.residual  # stage 1 variance estimate
  var2 <- if (m == 1) 0 else stats::var(y0)  # stage 2 variance estimate
  var.pooled <- (DF1 * var1 + DF2 * var2) / DF  # pooled estimate of variance
  sigma.pooled <- sqrt(var.pooled)  # sqrt of pooled variance estimate
  ssx <- sum((x - mean(x))^2)  # sum-of-squares for x, Sxx
  x0.mle <- (eta - b[1L])/b[2L]  # MLE of x0
  
  # Return point estimate only
  interval <- match.arg(interval)
  if (interval == "none") return(x0.mle)
  
  # Adjustment for simultaneous intervals
  adjust <- match.arg(adjust)  # FIXME: Does simultaneous work for m > 1?
  crit <- if (m != 1 || adjust == "none") stats::qt((1 + level)/2, n+m-3) else {
    switch(adjust,
           "Bonferroni" = stats::qt((level + 2*k - 1) / (2*k), n+m-3),
           "Scheffe"    = sqrt(k * stats::qf(level, k, n+m-3)))
  }
  
  # Inversion interval --------------------------------------------------------
  if (interval == "inversion") { 
    
    c1 <- b[2L]^2 - (sigma.pooled^2 * crit^2)/ssx
    c2 <- if (mean.response) {
      c1/n + (eta - mean(y))^2/ssx
    } else {
      c1*(1/m + 1/n) + (eta - mean(y))^2/ssx
    }
    c3 <- b[2L] * (eta - mean(y))
    c4 <- crit * sigma.pooled
    
    # FIXME: catch errors and throw an appropriate warning
    if (c1 < 0 && c2 <= 0) {
      
      warning("The calibration line is not well determined.", call. = FALSE)
      lwr <- -Inf
      upr <- Inf
      
    } else {
      
      lwr <- mean(x) + (c3 - c4*sqrt(c2))/c1
      upr <- mean(x) + (c3 + c4*sqrt(c2))/c1
      if (c1 < 0 && c2 > 0) {
        stop(paste("The calibration line is not well determined. The resulting \nconfidence region is the union of two semi-infinite intervals:\n(", -Inf, ",", 
                   round(upr, 4), ") U (", round(lwr, 4), ",", Inf, ")"), 
             call. = FALSE)
      }
      
    }
    res <- list("estimate" = x0.mle, 
                "lower"    = lwr, 
                "upper"    = upr, 
                "interval" = interval)
    
  }
  
  # Wald interval  
  if (interval == "Wald") { 
    
    # Compute standard error for Wald interval
    se <- if (mean.response) {
      abs((sigma.pooled/b[2]))*sqrt((1/n + (x0.mle - mean(x))^2/ssx))
    } else {
      abs((sigma.pooled/b[2]))*sqrt((1/m + 1/n + (x0.mle - mean(x))^2/ssx))
    }
    
    # Store results in a list
    res <- list("estimate" = x0.mle, 
                "lower"    = x0.mle - crit * se,
                "upper"    = x0.mle + crit * se, 
                "se"       = se, 
                "interval" = interval)
    
  } 
  
  # Assign class label and return results
  class(res) <- "invest"
  res
  
}