R/setupDiagnosticPlot.R

Defines functions setupDiagnosticPlot.perrySparseLTS setupDiagnosticPlotSparseLTSFit setupDiagnosticPlotSparseLTSStep setupDiagnosticPlot.sparseLTS setupDiagnosticPlot.tslars setupDiagnosticPlot.perrySeqModel setupDiagnosticPlotSeqModelStep setupDiagnosticPlot.seqModel setupDiagnosticPlot

Documented in setupDiagnosticPlot setupDiagnosticPlot.perrySeqModel setupDiagnosticPlot.perrySparseLTS setupDiagnosticPlot.seqModel setupDiagnosticPlot.sparseLTS setupDiagnosticPlot.tslars

# --------------------------------------
# Author: Andreas Alfons
#         Erasmus Universiteit Rotterdam
# --------------------------------------


#' Set up a diagnostic plot for a sequence of regression models
#'
#' Extract the fitted values and residuals of a sequence of regression models
#' (such as robust least angle regression models or sparse least trimmed
#' squares regression models) and other useful information for diagnostic
#' plots.
#'
#' Note that the argument \code{alpha} for controlling the subset size
#' behaves differently for \code{\link{sparseLTS}} than for
#' \code{\link[robustbase]{covMcd}}.  For \code{\link{sparseLTS}}, the subset
#' size \eqn{h} is determined by the fraction \code{alpha} of the number of
#' observations \eqn{n}.  For \code{\link[robustbase]{covMcd}}, on the other
#' hand, the subset size also depends on the number of variables \eqn{p} (see
#' \code{\link[robustbase]{h.alpha.n}}).  However, the \code{"sparseLTS"} and
#' \code{"perrySparseLTS"} methods attempt to compute the MCD using the same
#' subset size that is used to compute the sparse least trimmed squares
#' regressions.  This may not be possible if the number of selected variables
#' is large compared to the number of observations, in which case a warning is
#' given and \code{NA}s are returned for the robust Mahalanobis distances.
#'
#' @aliases setupDiagnosticPlot.rlars setupDiagnosticPlot.grplars
#' setupDiagnosticPlot.tslarsP
#'
#' @param object  the model fit from which to extract information.
#' @param s  for the \code{"seqModel"} method, an integer vector giving the
#' steps of the submodels from which to extract information (the default is to
#' use the optimal submodel).  For the \code{"sparseLTS"} method, an integer
#' vector giving the indices of the models from which to extract information
#' (the default is to use the optimal model for each of the requested fits).
#' @param p  an integer giving the lag length for which to extract information
#' (the default is to use the optimal lag length).
#' @param fit  a character string specifying from which fit to extract
#' information.  Possible values are \code{"reweighted"} (the default) to
#' convert the reweighted fit, \code{"raw"} to convert the raw fit, or
#' \code{"both"} to convert both fits.
#' @param covArgs  a list of arguments to be passed to
#' \code{\link[robustbase]{covMcd}} for computing robust Mahalanobis distances.
#' @param \dots  additional arguments to be passed to
#' \code{\link[robustbase]{covMcd}} can be specified directly instead of via
#' \code{covArgs}.
#'
#' @return  An object of class \code{"setupDiagnosticPlot"} with the following
#' components:
#' \describe{
#'   \item{\code{data}}{a data frame containing the columns listed below.
#'   \describe{
#'     \item{\code{step}}{the steps (for the \code{"seqModel"} method) or
#'     indices (for the \code{"sparseLTS"} method) of the models (only returned
#'     if more than one model is requested).}
#'     \item{\code{fit}}{the model fits (only returned if both the reweighted
#'     and raw fit are requested in the \code{"sparseLTS"} method).}
#'     \item{\code{index}}{the indices of the observations.}
#'     \item{\code{fitted}}{the fitted values.}
#'     \item{\code{residual}}{the standardized residuals.}
#'     \item{\code{theoretical}}{the corresponding theoretical quantiles from
#'     the standard normal distribution.}
#'     \item{\code{qqd}}{the absolute distances from a reference line through
#'     the first and third sample and theoretical quartiles.}
#'     \item{\code{rd}}{the robust Mahalanobis distances computed via the
#'     minimum covariance determinant (MCD) estimator (see
#'     \code{\link[robustbase]{covMcd}}).}
#'     \item{\code{xyd}}{the pairwise maxima of the absolute values of the
#'     standardized residuals and the robust Mahalanobis distances, divided by
#'     the respective other outlier detection cutoff point.}
#'     \item{\code{weight}}{the weights indicating regression outliers.}
#'     \item{\code{leverage}}{logicals indicating leverage points (i.e.,
#'     outliers in the predictor space).}
#'     \item{\code{Diagnostics}}{a factor with levels \code{"Potential outlier"}
#'     (potential regression outliers) and \code{"Regular observation"} (data
#'     points following the model).}
#'   }}
#'   \item{\code{qqLine}}{a data frame containing the intercepts and slopes of
#'   the respective reference lines to be displayed in residual Q-Q plots.}
#'   \item{\code{q}}{a data frame containing the quantiles of the Mahalanobis
#'   distribution used as cutoff points for detecting leverage points.}
#'   \item{\code{facets}}{default faceting formula for the diagnostic plots
#'   (only returned where applicable).}
#' }
#'
#' @author Andreas Alfons
#'
#' @seealso \code{\link{diagnosticPlot}}, \code{\link{rlars}},
#' \code{\link{grplars}}, \code{\link{rgrplars}}, \code{\link{tslarsP}},
#' \code{\link{rtslarsP}}, \code{\link{tslars}}, \code{\link{rtslars}},
#' \code{\link{sparseLTS}}
#'
#' @example inst/doc/examples/example-setupDiagnosticPlot.R
#'
#' @keywords utilities
#'
#' @export

setupDiagnosticPlot <- function(object, ...) UseMethod("setupDiagnosticPlot")


#' @rdname setupDiagnosticPlot
#' @export
#'

setupDiagnosticPlot.seqModel <- function(object, s = NA, covArgs = list(...),
                                         ...) {
  ## initializations
  if (!object$robust) {
    stop("diagnostic plots not yet implemented for nonrobust methods")
  }
  # check the scale estimate
  scale <- getComponent(object, "scale", s = s)
  if (any(scale <= 0)) stop("residual scale equal to 0")
  # check if model data is available to compute robust MCD distances
  terms <- delete.response(object$terms)  # extract terms for model matrix
  x <- object$x
  if (is.null(x)) {
    x <- try(model.matrix(terms), silent = TRUE)
    if (inherits(x, "try-error")) {
      x <- NULL
      warning("model data not available")
    }
  }
  if (!is.null(x)) x <- removeIntercept(x)
  ## construct data frame with all information for plotting
  steps <- object$s
  if (length(steps) > 1) {
    ## check steps
    if (is.null(s)) s <- steps
    else if (isTRUE(is.na(s))) s <- getSOpt(object)  # defaults to optimal step
    else s <- checkSteps(s, sMin = steps[1], sMax = steps[length(steps)])
  } else s <- NA
  ## extract data for the requested steps
  if (length(s) > 1) {
    ## extract the information from each requested step
    out_list <- lapply(s, setupDiagnosticPlotSeqModelStep, object = object,
                       x = x, covArgs = covArgs)
    data_list <- lapply(out_list, "[[", "data")
    qql_list <- lapply(out_list, "[[", "qqLine")
    q_list <- lapply(out_list, "[[", "q")
    ## combine the information from the steps
    data <- cbind(step = rep.int(s, sapply(data_list, nrow)),
                  do.call(rbind, data_list))
    qql <- cbind(step = rep.int(s, sapply(qql_list, nrow)),
                 do.call(rbind, qql_list))
    q <- cbind(step = rep.int(s, sapply(q_list, nrow)),
               do.call(rbind, q_list))
    ## construct return object
    out <- list(data = data, qqLine = qql, q = q, facets = ~step)
    class(out) <- "setupDiagnosticPlot"
  } else {
    # extract the information from the selected step
    out <- setupDiagnosticPlotSeqModelStep(s, object = object, x = x,
                                           covArgs = covArgs)
  }
  ## return data frame
  out
}


## workhorse function for a single step from a sequence of regression models
setupDiagnosticPlotSeqModelStep <- function(s, object, x = NULL,
                                            covArgs = list()) {
  ## extract fitted values
  fitted <- fitted(object, s = s)
  ## extract standardized residuals
  residuals <- rstandard(object, s = s)
  n <- length(residuals)  # number of observations
  ## compute 0/1 outlier weights
  wt <- as.integer(abs(residuals) <= qnorm(0.9875))
  ## compute theoretical quantiles and distances from Q-Q reference line
  theoretical <- qqNorm(residuals)
  qql <- qqLine(residuals)  # Q-Q reference line
  qqd <- abs(residuals - qql$intercept - qql$slope * theoretical)
  ## compute MCD distances using selected variables
  # extract predictor matrix
  ok <- (is.na(s) || s > 0) && !is.null(x)
  if (ok) {
    # extract coefficients
    coefficients <- removeIntercept(coef(object, s = s))
    selected <- which(coefficients != 0)
    p <- length(selected)
    if (p == 0) {
      ok <- FALSE
      warning("all coefficients equal to 0")
    }
  }
  if (ok) {
    # compute distances
    rd <- try({
      xs <- x[, selected, drop = FALSE]
      callCovFun <- getCallFun(covArgs)
      mcd <- callCovFun(xs, fun = covMcd, args = covArgs)
      sqrt(mahalanobis(xs, mcd$center, mcd$cov))
    }, silent = TRUE)
    if(inherits(rd, "try-error")) {
      ok <- FALSE
      warning("robust distances cannot be computed")
    }
  }
  if (!ok) rd <- rep.int(NA, n)
  # take maximum of the distances in the x- and y-space, divided by the
  # respective other cutoff point
  q <- sqrt(qchisq(0.975, p))
  xyd <- pmax.int(abs(rd/2.5), abs(residuals/q))
  ## construct indicator variables for leverage points
  leverage <- rd > q
  ## classify data points
  labs <- c("Potential outlier", "Regular observation")
  class <- ifelse(wt == 0, labs[1], labs[2])
  class <- factor(class, levels = labs)
  ## construct data frame containing main information
  data <- data.frame(index = seq_len(n), fitted = fitted, residual = residuals,
                     theoretical = theoretical, qqd = qqd, rd = rd, xyd = xyd,
                     weight = wt, leverage = leverage, Diagnostics = class)
  ## construct return object
  out <- list(data = data, qqLine = as.data.frame(qql),
              q = data.frame(q = max(q, 2.5)))
  class(out) <- "setupDiagnosticPlot"
  out
}


#' @rdname setupDiagnosticPlot
#' @export

setupDiagnosticPlot.perrySeqModel <- function(object, ...) {
  # call method for component 'finalModel'
  setupDiagnosticPlot(object$finalModel, ...)
}


#' @rdname setupDiagnosticPlot
#' @export

setupDiagnosticPlot.tslars <- function(object, p, ...) {
  ## check lag length
  if (missing(p) || !is.numeric(p) || length(p) == 0) p <- object$pOpt
  if (length(p) > 1) {
    p <- p[1]
    warning(sprintf("multiple lag lengths not yet supported; using p = %d", p))
  }
  pMax <- object$pMax
  if (p < 1) {
    p <- 1
    warning("lag length too small, using lag length 1")
  } else if (p > pMax) {
    p <- pMax
    warning(sprintf("lag length too large, using maximum lag length %d", p))
  }
  ## call method for specified lag length
  setupDiagnosticPlot(object$pFit[[p]], ...)
}


#' @rdname setupDiagnosticPlot
#' @export

setupDiagnosticPlot.sparseLTS <- function(object, s = NA,
                                          fit = c("reweighted", "raw", "both"),
                                          covArgs = list(...), ...) {
  ## initializations
  fit <- match.arg(fit)
  # check the scale estimate
  scale <- getComponent(object, "scale", s = s, fit = fit)
  if (any(scale <= 0)) stop("residual scale equal to 0")
  # check if model data is available to compute robust MCD distances
  terms <- delete.response(object$terms)  # extract terms for model matrix
  x <- object$x
  if (is.null(x)) {
    x <- try(model.matrix(terms), silent = TRUE)
    if(inherits(x, "try-error")) {
      x <- NULL
      warning("model data not available")
    }
  }
  if (!is.null(x) && object$intercept) x <- removeIntercept(x)
  ## construct data frame with all information for plotting
  lambda <- object$lambda
  bothOpt <- FALSE
  if (length(lambda) > 1) {
    ## check steps
    steps <- seq_along(lambda)
    if (is.null(s)) s <- steps
    else if (isTRUE(is.na(s))) {
      s <- getSOpt(object, fit = fit)  # defaults to optimal step
      if(fit == "both") {
        s <- unique(s)
        bothOpt <- length(s) == 2
      }
    } else s <- checkSteps(s, sMin = 1, sMax = length(steps))
  } else s <- NA
  ## extract data for the requested steps
  if (bothOpt) {
    # extract the data from the respecitve optimal lambda
    fits <- c("reweighted", "raw")
    ## recursive call for each fit
    reweighted <- setupDiagnosticPlotSparseLTSFit(object, s = s[1],
                                                  fit = "reweighted",
                                                  x = x)
    raw <- setupDiagnosticPlotSparseLTSFit(object, s = s[2], fit = "raw",
                                           x = x)
    ## combine data for Q-Q reference line
    qql <- data.frame(fit = factor(fits, levels = fits),
                      rbind(reweighted$qqLine, raw$qqLine),
                      row.names = NULL)
    ## combine data for cutoff chi-squared quantile
    q <- data.frame(fit = factor(fits, levels = fits),
                    rbind(reweighted$q, raw$q),
                    row.names = NULL)
    ## combine main information
    n <- c(nrow(reweighted$data), nrow(raw$data))
    data <- data.frame(fit = rep.int(factor(fits, levels = fits), n),
                       rbind(reweighted$data, raw$data),
                       row.names = NULL)
    ## construct return object
    out <- list(data = data, qqLine = qql, q = q, facets = . ~ fit)
    class(out) <- "setupDiagnosticPlot"
  } else if (length(s) > 1) {
    ## extract the information from each requested step
    out_list <- lapply(s, setupDiagnosticPlotSparseLTSStep, object = object,
                       fit = fit, x = x, covArgs = covArgs)
    data_list <- lapply(out_list, "[[", "data")
    qql_list <- lapply(out_list, "[[", "qqLine")
    q_list <- lapply(out_list, "[[", "q")
    ## combine the information from the steps
    data <- cbind(step = rep.int(s, sapply(data_list, nrow)),
                  do.call(rbind, data_list))
    qql <- cbind(step = rep.int(s, sapply(qql_list, nrow)),
                 do.call(rbind, qql_list))
    q <- cbind(step = rep.int(s, sapply(q_list, nrow)),
               do.call(rbind, q_list))
    ## construct return object
    facets <- if (fit == "both") step ~ fit else ~step
    out <- list(data = data, qqLine = qql, q = q, facets = facets)
    class(out) <- "setupDiagnosticPlot"
  } else {
    # extract the information from the selected step
    out <- setupDiagnosticPlotSparseLTSStep(s, object = object, fit = fit,
                                            x = x, covArgs = covArgs)
  }
  ## return object
  out
}


## workhorse functions

# setup diagnostic plots for a single sparse LTS step
setupDiagnosticPlotSparseLTSStep <- function(s, object, fit = "reweighted",
                                             x = NULL, covArgs = list()) {
  ## construct data frame with all information for plotting
  if (fit == "both") {
    fits <- c("reweighted", "raw")
    ## call workhorse function for each fit
    reweighted <- setupDiagnosticPlotSparseLTSFit(object, s = s,
                                                  fit = "reweighted",
                                                  x = x, covArgs = covArgs)
    raw <- setupDiagnosticPlotSparseLTSFit(object, s = s, fit = "raw",
                                           x = x, covArgs = covArgs)
    ## combine data for Q-Q reference line
    qql <- data.frame(fit = factor(fits, levels = fits),
                      rbind(reweighted$qqLine, raw$qqLine),
                      row.names = NULL)
    ## combine data for cutoff chi-squared quantile
    q <- data.frame(fit = factor(fits, levels = fits),
                    rbind(reweighted$q, raw$q),
                    row.names = NULL)
    ## combine main information
    n <- c(nrow(reweighted$data), nrow(raw$data))
    data <- data.frame(fit = rep.int(factor(fits, levels = fits), n),
                       rbind(reweighted$data, raw$data), row.names = NULL)
    ## construct return object
    out <- list(data = data, qqLine = qql, q = q, facets = . ~ fit)
    class(out) <- "setupDiagnosticPlot"
  } else {
    out <- setupDiagnosticPlotSparseLTSFit(object, s = s, fit = fit, x = x,
                                           covArgs = covArgs)
  }
  ## return object
  out
}

# setup diagnostic plots for a single sparse LTS fit
setupDiagnosticPlotSparseLTSFit <- function(object, s, fit = "reweighted",
                                            x = NULL, covArgs = list()) {
  ## extract fitted values
  fitted <- fitted(object, s = s, fit = fit)
  ## extract standardized residuals
  residuals <- rstandard(object, s = s, fit = fit)
  n <- length(residuals)  # number of observations
  ## extract outlier weights
  wt <- weights(object, type = "robustness", s = s, fit = fit)
  ## compute theoretical quantiles and distances from Q-Q reference line
  theoretical <- qqNorm(residuals)
  qql <- qqLine(residuals)  # Q-Q reference line
  qqd <- abs(residuals - qql$intercept - qql$slope * theoretical)
  ## compute MCD distances using selected variables
  # extract predictor matrix
  ok <- !is.null(x)
  if (ok) {
    # extract coefficients
    coefficients <- coef(object, s = s, fit = fit)
    if (object$intercept) coefficients <- removeIntercept(coefficients)
    selected <- which(coefficients != 0)
    p <- length(selected)
    if (p == 0) {
      ok <- FALSE
      warning("all coefficients equal to 0")
    }
  }
  if (ok) {
    # adjust alpha since MCD computes subset size depending on n and p
    h <- object$quan
    n2 <- (n + p + 1) %/% 2
    alpha <- pmin((h - 2*n2 + n) / (2 * (n - n2)), 1)
    # check fraction for subset size
    if (alpha < 0.5) {
      ok <- FALSE
      msg <- sprintf("cannot compute robust distances based on MCD with h = %d",
                     object$quan)
      warning(msg)
    }
  }
  if (ok) {
    # compute distances
    rd <- try({
      xs <- x[, selected, drop = FALSE]
      covArgs$alpha <- alpha
      callCovFun <- getCallFun(covArgs)
      mcd <- callCovFun(xs, fun = covMcd, args = covArgs)
      if (fit == "reweighted") {
        center <- mcd$center
        cov <- mcd$cov
      } else {
        center <- mcd$raw.center
        cov <- mcd$raw.cov
      }
      sqrt(mahalanobis(xs, center, cov))
    }, silent = TRUE)
    if(inherits(rd, "try-error")) {
      ok <- FALSE
      warning("robust distances cannot be computed")
    }
  }
  if (!ok) rd <- rep.int(NA_real_, n)
  # take maximum of the distances in the x- and y-space, divided by the
  # respective other cutoff point
  q <- sqrt(qchisq(0.975, p))
  xyd <- pmax.int(abs(rd/2.5), abs(residuals/q))
  ## construct indicator variables for leverage points
  leverage <- rd > q
  ## classify data points
  labs <- c("Potential outlier", "Regular observation")
  class <- ifelse(wt == 0, labs[1], labs[2])
  class <- factor(class, levels = labs)
  ## construct data frame containing main information
  data <- data.frame(index = seq_len(n), fitted = fitted, residual = residuals,
                     theoretical = theoretical, qqd = qqd, rd = rd, xyd = xyd,
                     weight = wt, leverage = leverage, Diagnostics = class)
  ## construct return object
  out <- list(data = data, qqLine = as.data.frame(qql),
              q = data.frame(q = max(q, 2.5)))
  class(out) <- "setupDiagnosticPlot"
  out
}


#' @rdname setupDiagnosticPlot
#' @export

setupDiagnosticPlot.perrySparseLTS <- function(object, ...) {
  # call method for component 'finalModel'
  setupDiagnosticPlot(object$finalModel, ...)
}

Try the robustHD package in your browser

Any scripts or data that you put into this service are public.

robustHD documentation built on Sept. 27, 2023, 1:07 a.m.