R/diagnostics.R

Defines functions error .quantilePercentiles residualDiagnostics.lm is.residualDiagnostics as.residualDiagnostics residualDiagnostics testDistribution.default is.testDistribution as.testDistribution testDistribution

Documented in as.residualDiagnostics as.testDistribution is.residualDiagnostics is.testDistribution .quantilePercentiles residualDiagnostics residualDiagnostics.lm testDistribution testDistribution.default

#' @title Test the distribution of a variable against a specific distribution
#'
#' @description Function designed to help examine distributions.
#' It also includes an option for assessing multivariate normality using the
#' (squared) Mahalanobis distance. A generic function, some methods, and
#' constructor (\code{as.testDistribution}) and function to check class
#' (\code{is.testDistribution}) also are provided.
#'
#' Note that for the \code{use} argument, several options are possible.
#' By default it is \dQuote{complete.obs}, which uses only cases with complete
#' data on all variables.
#' Another option is \dQuote{pairwise.complete.obs}, which uses
#' all available data for each variable indivdiually to estimate the means and
#' variances, and all pairwise complete observation pairs for each covariance. Because
#' the same cases are not used for all estimates, it is possible to obtain a covariance
#' matrix that is not positive definite (e.g., correlations > +1 or < -1).
#'
#' Finally, the last option is \dQuote{fiml}, which uses full information maximum likelihood
#' estimates of the means and covariance matrix.  Depending on the number of cases,
#' missing data patterns, and variables, this may be quite slow and computationally
#' demanding.
#'
#' The \code{robust} argument determines whether to use robust estimates or not
#' when calculating densities, etc.  By default it is \code{FALSE}, but if
#' \code{TRUE} and a univariate or multivariate normal distribution is tested,
#' then robust estimates of the means and covariance matrix (a variance if univariate)
#' will be used based on \code{covMcd} from the \pkg{robustbase} package.
#'
#' @param x The data as a single variable or vector to check the distribution unless
#'   the distribution is \dQuote{mvnormal} in which case it should be a data frame or
#'   data table.
#' @return A logical whether or not an object is of class
#'   \code{testDistribution} or an object of the same class.
#' @seealso \code{\link{SEMSummary}}
#' @export
#' @keywords multivariate
#' @rdname testDistribution
testDistribution <- function(x, ...) {
  UseMethod("testDistribution", x)
}

#' @importFrom data.table is.data.table as.data.table
#' @export
#' @rdname testDistribution
as.testDistribution <- function(x) {
  if (isFALSE(is.testDistribution(x))) {
    if(isFALSE(is.list(x))) {
      stop("Input must be a list or a testDistribution object")
    }
    x <- list(
      Data = x[[1]],
      Distribution = x[[2]],
      EVLimits = x[[3]],
      NOK = x[[4]],
      distr = x[[5]],
      na.rm = x[[6]],
      extremevalues = x[[7]],
      ev.perc = x[[8]],
      use = x[[9]],
      robust = x[[10]])
    class(x) <- "testDistribution"
  }

  ## checks that the object is not malformed
  stopifnot(identical(length(x), 10L))
  stopifnot(identical(names(x), c("Data", "Distribution", "EVLimits", "NOK",
                                  "distr", "na.rm", "extremevalues", "ev.perc", "use", "robust")))

  stopifnot(identical(length(x$Distribution), 5L))
  stopifnot(identical(length(x$EVLimits), 2L))
  stopifnot(identical(length(x$NOK), 1L))
  stopifnot(identical(length(x$distr), 1L))
  stopifnot(identical(length(x$na.rm), 1L))
  stopifnot(identical(length(x$extremevalues), 1L))
  stopifnot(identical(length(x$ev.perc), 1L))
  stopifnot(identical(length(x$use), 1L))
  stopifnot(identical(length(x$robust), 1L))

  stopifnot(is.list(x$Distribution))
  stopifnot(is.numeric(x$EVLimits))
  stopifnot(is.numeric(x$NOK))
  stopifnot(is.character(x$distr))
  stopifnot(is.logical(x$na.rm))
  stopifnot(is.character(x$extremevalues))
  stopifnot(is.numeric(x$ev.perc))
  stopifnot(is.character(x$use))
  stopifnot(is.logical(x$robust))

  return(x)
}

#' @export
#' @rdname testDistribution
is.testDistribution <- function(x) {
  inherits(x, "testDistribution")
}


#' @param distr A character string indicating the distribution to be tested.
#'   Currently one of: \dQuote{normal}, \dQuote{beta}, \dQuote{chisq} (chi-squared),
#'   \dQuote{f}, \dQuote{gamma}, \dQuote{geometric}, \dQuote{nbinom} (negative binomial),
#'   \dQuote{poisson}, \dQuote{uniform}, or \dQuote{mvnormal} for multivariate normal where Mahalanobis
#'   distances are calculated and compared against a Chi-squared distribution with
#'   degrees of freedom equal to the number of variables.
#' @param na.rm A logical value whether to omit missing values. Defaults to \code{TRUE}.
#' @param starts A named list of the starting values. Not required for all distributions.
#'   Passed on to \code{fitdistr} which fits the maximum likelihood estimates of the
#'   distribution parameters.
#' @param extremevalues A character vector whether to indicate extreme values.
#'   Should be \dQuote{no} to do nothing, \dQuote{empirical} to show extreme
#'   values based on the observed data percentiles, or \dQuote{theoretical}
#'   to show extreme values based on percentiles of the theoretical distribution.
#' @param ev.perc Percentile to use for extreme values.  For example if .01,
#'   then the lowest 1 percent and highest 1 percent will be labelled
#'   extreme values.  Defaults to the lowest and highest 0.1 percent.
#' @param use A character vector indicating how the moments
#'   (means and covariance matrix) should be estimated in the presence of
#'   missing data when \code{distr = mvnormal}.
#'   The default is to use complete observations, but
#'   full information maximum likelihood based on functions in
#'   \pkg{lavaan} is also available.  See details.
#' @param robust A logical whether to use robust estimation or not.
#'   Currently only applies to normally distributed data
#'   (univariate or multivariate).  Also, when \code{robust = TRUE},
#'   only complete observations are used (i.e., \code{use = "complete.obs"}).
#'   See details.
#' @param ... Additional arguments. If these include mu and sigma and the distribution
#'   is multivariate normal, then it will use the passed values instead of calculating
#'   the mean and covariances of the data.
#' @return A list with information about the distribution (parameter estimates,
#'   name, log likelihood (useful for comparing the fit of different distributions
#'   to the data), and a dataset with the sorted data and theoretical quantiles.
#' @importFrom MASS fitdistr
#' @importFrom stats dnorm qnorm dbeta qbeta dchisq qchisq dunif qunif dgeom qgeom
#' @importFrom stats df qf dgamma qgamma dnbinom qnbinom dpois qpois
#' @importFrom stats logLik ppoints
#' @importFrom stats mahalanobis qchisq ppoints
#' @importFrom stats lm resid offset
#' @importFrom data.table melt
#' @importFrom extraoperators %gele%
#' @importFrom robustbase covMcd
#' @method testDistribution default
#' @export
#' @rdname testDistribution
#' @examples
#'
#' ## example data
#' set.seed(1234)
#' d <- data.table::data.table(
#'   Ynorm = rnorm(200),
#'   Ybeta = rbeta(200, 1, 4),
#'   Ychisq = rchisq(200, 8),
#'   Yf = rf(200, 5, 10),
#'   Ygamma = rgamma(200, 2, 2),
#'   Ynbinom = rnbinom(200, mu = 4, size = 9),
#'   Ypois = rpois(200, 4))
#'
#' ## testing and graphing
#' testDistribution(d$Ybeta, "beta", starts = list(shape1 = 1, shape2 = 4))
#' testDistribution(d$Ychisq, "chisq", starts = list(df = 8))
#'
#' ## for chi-square distribution, extreme values only on
#' ## the right tail
#' testDistribution(d$Ychisq, "chisq", starts = list(df = 8),
#'   extremevalues = "empirical", ev.perc = .1)
#' testDistribution(d$Ychisq, "chisq", starts = list(df = 8),
#'   extremevalues = "theoretical", ev.perc = .1)
#'
#' \dontrun{
#'
#' testDistribution(d$Yf, "uniform")
#' testDistribution(d$Ypois, "geometric")
#'
#' testDistribution(d$Yf, "f", starts = list(df1 = 5, df2 = 10))
#' testDistribution(d$Ygamma, "gamma")
#' testDistribution(d$Ynbinom, "poisson")
#' testDistribution(d$Ynbinom, "nbinom")
#' testDistribution(d$Ypois, "poisson")
#'
#' ## compare log likelihood of two different distributions
#' testDistribution(d$Ygamma, "normal")$Distribution$LL
#' testDistribution(d$Ygamma, "gamma")$Distribution$LL
#'
#' testDistribution(d$Ynorm, "normal")
#' testDistribution(c(d$Ynorm, 10, 1000), "normal",
#'   extremevalues = "theoretical")
#' testDistribution(c(d$Ynorm, 10, 1000), "normal",
#'   extremevalues = "theoretical", robust = TRUE)
#'
#' testDistribution(mtcars, "mvnormal")
#'
#' ## for multivariate normal mahalanobis distance
#' ## which follows a chi-square distribution, extreme values only on
#' ## the right tail
#' testDistribution(mtcars, "mvnormal", extremevalues = "empirical",
#'   ev.perc = .1)
#' testDistribution(mtcars, "mvnormal", extremevalues = "theoretical",
#'   ev.perc = .1)
#'
#' rm(d) ## cleanup
#' }
testDistribution.default <- function(x,
  distr = c("normal", "beta", "chisq", "f", "gamma", "geometric", "nbinom", "poisson", "uniform", "mvnormal"),
  na.rm = TRUE, starts,
  extremevalues = c("no", "theoretical", "empirical"),
  ev.perc = .001,
  use = c("complete.obs", "pairwise.complete.obs", "fiml"),
  robust = FALSE, ...) {

  distr <- match.arg(distr)
  use <- match.arg(use)
  if (use != "complete.obs" & isTRUE(robust)) {
    use <- "complete.obs"
    message("use set to 'complete.obs' as robust = TRUE")
  }

  extremevalues <- match.arg(extremevalues)
  stopifnot(ev.perc %gele% c(0L, 1L))

  if (identical(distr, "mvnormal")) {
    if (anyNA(x)) {
      OK <- rowSums(is.na(x)) == 0
    } else if (!anyNA(x)) {
      use <- "complete.obs"
      OK <- rep(TRUE, nrow(x))
    }

    optargs <- list(...)
    if (c("mu", "sigma") %ain% names(optargs)) {
      desc <- list(mu = optargs$mu,
                   sigma = optargs$sigma)
    } else {

      if (isTRUE(robust)) {
        tmp <- covMcd(x[OK, , drop = FALSE])
        desc <- list(mu = tmp$center, sigma = tmp$cov)
        rm(tmp)
      } else {
        desc <- switch(
          match.arg(use),
          fiml = {moments(x)},
          pairwise.complete.obs = {
            list(mu = colMeans(x, na.rm = TRUE),
                 sigma = cov(x, use = "pairwise.complete.obs"))
          },
          complete.obs = {
            list(mu = colMeans(x[OK,]),
                 sigma = cov(x[OK,]))
          })
      }
    }

    ## if any variances near zero, set to at least 1e-10
    if (any(diag(desc$sigma) < 1e-10)) {
      diag(desc$sigma) <- pmax(diag(desc$sigma), 1e-10)
    }
    ## if covariance matrix is singular, inflate diagonal (variances)
    ## 2.5 percent increase per iteration, up to 20 iterations
    i <- 1L
    while(isTRUE(tryCatch(solve(desc$sigma), error = function(e) TRUE)) && i <= 20) {
      diag(desc$sigma) <- diag(desc$sigma) * 1.025
      i <- i + 1L
    }

    starts <- list(df = ncol(x))
    x <- mahalanobis(x[OK,], desc$mu, desc$sigma)
  }

  if (missing(starts)) {
    base <- "starts must be a named list as below with start values for 'XX':\n%s"
    switch(distr,
           beta = stop(sprintf(base, "list(shape1 = XX, shape2 = XX)")),
           chisq = stop(sprintf(base, "list(df = XX)")),
           f = stop(sprintf(base, "list(df1 = XX, df2 = XX)")),
           "")
  }

  if (anyNA(x)) {
    if (na.rm) {
      if (!is.null(dim(x))) {
        OKindex <- rowSums(is.na(x)) == 0
      } else {
        OKindex <- !is.na(x)
      }
      x <- x[OKindex]
      } else {
        stop("Missing values cannot be present when na.rm = FALSE")
      }
    } else {
      OKindex <- rep(TRUE, length(x))
    }

  if (identical(distr, "normal") && isTRUE(robust)) {
    estimate <- covMcd(x)
    estimate <- c(
      mean = as.vector(estimate$center),
      sd = sqrt(as.vector(estimate$cov)))

    distribution <- list(
      d = dnorm,
      q = qnorm,
      Name = "Normal",
      fit =  structure(list(
        estimate = estimate,
        sd = c(mean = NA_real_, sd = NA_real_),
        vcov = structure(c(NA_real_, NA_real_, NA_real_, NA_real_),
                         .Dim = c(2L, 2L),
                         .Dimnames = list(c("mean", "sd"), c("mean", "sd"))),
        n = length(x),
        loglik = sum(dnorm(x, estimate["mean"], estimate["sd"],
                           log = TRUE))), class = "fitdistr"))
    rm(estimate)
  } else {
    distribution <- switch(
      distr,
      normal = list(
        d = dnorm,
        q = qnorm,
        Name = "Normal",
        fit = fitdistr(x, "normal")),
      beta = list(
        d = dbeta,
        q = qbeta,
        Name = "Beta",
        fit = fitdistr(x, "beta", start = starts)),
      chisq = list(
        d = dchisq,
        q = qchisq,
        Name = "Chi-squared",
        fit = fitdistr(x, "chi-squared", start = starts, lower = .01)),
      f = list(
        d = df,
        q = qf,
        Name = "F",
        fit = fitdistr(x, "f", start = starts)),
      gamma = list(
        d = dgamma,
        q = qgamma,
        Name = "Gamma",
        fit = fitdistr(x, "gamma", lower = c(.001, .001))),
      geometric = list(
        d = dgeom,
        q = qgeom,
        Name = "Geometric",
        fit = fitdistr(x, "geometric")),
      nbinom = list(
        d = dnbinom,
        q = qnbinom,
        Name = "Negative Binomial",
        fit = fitdistr(x, "negative binomial")),
      poisson = list(
        d = dpois,
        q = qpois,
        Name = "Poisson",
        fit = fitdistr(x, "poisson")),
      uniform = list(
        d = dunif,
        q = qunif,
        Name = "Uniform",
        fit = structure(list(
          estimate = c(min = min(x), max = max(x)),
          sd = c(min = NA_real_, max = NA_real_),
          vcov = structure(c(NA_real_, NA_real_, NA_real_, NA_real_), .Dim = c(2L, 2L),
                           .Dimnames = list(
                             c("min", "max"), c("min", "max"))),
          n = length(x),
          loglik = NA_real_),
          class = "fitdistr")),
      mvnormal = list(
        d = dchisq,
        q = qchisq,
        Name = "Chi-squared",
        fit = list(estimate = list(df = starts$df))))
  }

  if (!identical(distr, "mvnormal")) {
    distribution$LL <- logLik(distribution$fit)
  } else {
    distribution$LL <- structure(NA_real_, nobs = length(x),
                                 df = starts$df, class = "logLik")
  }

  d <- data.table(
    X = do.call(distribution$q,
                c(list(p = ppoints(length(x))),
                  as.list(distribution$fit$estimate))),
    Y = sort(x),
    OriginalOrder = which(OKindex)[order(x)])

  ev.limits <- switch(extremevalues,
    no = c(-Inf, Inf),
    empirical = if (!identical(distr, "mvnormal") && !identical(distr, "chisq")) {
                  quantile(x, probs = c(ev.perc, 1 - ev.perc), na.rm = TRUE)
                } else {
                  c(-Inf, quantile(x, probs = 1 - ev.perc, na.rm = TRUE))
                },
    theoretical = do.call(distribution$q,
      c(list(
        p = if (!identical(distr, "mvnormal") && !identical(distr, "chisq")) {
              c(ev.perc, 1 - ev.perc)
            } else {
              c(0, 1 - ev.perc)
            }),
        as.list(distribution$fit$estimate))))

  d[, isEV := factor(as.numeric(Y) %gele% ev.limits,
                     levels = c(TRUE, FALSE), labels = c("No", "Yes"))]

  nok <- sum(!is.na(d$Y))

  d$YDeviates <- resid(lm(Y ~ 0 + offset(X), data = d))

  as.testDistribution(list(
    Data = d,
    Distribution = distribution,
    EVLimits = ev.limits,
    NOK = nok,
    distr = distr,
    na.rm = na.rm,
    extremevalues = extremevalues,
    ev.perc = ev.perc,
    use = use,
    robust = robust))
}

#' @title Residual Diagnostics Functions
#' @description A set of functions to calculate
#'   residual diagnostics on models, including constructors,
#'   a generic function, a test of whether an object is of the
#'   \code{residualDiagnostics} class, and methods.
#' @param object A fitted model object, with methods for
#'   \code{model.frame}, \code{resid} and \code{fitted}.
#' @param ev.perc A real number between 0 and 1 indicating the
#'   proportion of the theoretical distribution beyond which
#'   values are considered extreme values (possible outliers).
#'   Defaults to .001.
#' @param robust Whether to use robust mean and standard deviation estimates
#'   for normal distribution
#' @param distr A character string given the assumed distribution.
#'   Passed on to \code{\link{testDistribution}}.
#'   Defaults to \dQuote{normal}.
#' @param standardized A logical whether to use standardized residuals.
#'   Defaults to \code{TRUE} generally where possible but may depend on
#'   method.
#' @param cut An integer, how many unique predicted values
#'   there have to be at least for predicted values to be
#'   treated continuously, otherwise they are treated as discrete values.
#'   Defaults to 4.
#' @param ... Additional arguments, not currently used.
#' @return A logical (\code{is.residualDiagnostics}) or
#'   a residualDiagnostics object (list) for
#'   \code{as.residualDiagnostics} and \code{residualDiagnostics}.
#' @export
#' @rdname residualDiagnostics
residualDiagnostics <- function(object, ...) {
  UseMethod("residualDiagnostics", object)
}

#' @param x A object (e.g., list or a modelDiagnostics object) to
#'   test or attempt coercing to a residualDiagnostics object.
#' @importFrom data.table is.data.table as.data.table
#' @rdname residualDiagnostics
#' @export
as.residualDiagnostics <- function(x) {
  if (!is.residualDiagnostics(x)) {
    if(!is.list(x)) {
      stop("Input must be a list or a residualDiagnostics object")
    }
    stopifnot(identical(length(x), 5L))
    augmentClass <- attr(x, "augmentClass")

    x <- list(
      Residuals = x[[1]],
      Frame = x[[2]],
      Hat = x[[3]],
      testDistribution = x[[4]],
      Outcome = x[[5]],
      N = nrow(x[[1]]),
      K = ncol(x[[2]]))

    if(is.null(augmentClass)) {
      class(x) <- "residualDiagnostics"
    } else {
      class(x) <- c(paste0("residualDiagnostics.", augmentClass), "residualDiagnostics")
    }
  }

  ## checks that the object is not malformed
  stopifnot(identical(length(x), 7L))
  stopifnot(identical(
    names(x),
    c("Residuals", "Frame", "Hat",
      "testDistribution", "Outcome", "N", "K")))
  stopifnot(is.character(x$Outcome))
  stopifnot(is.data.table(x$Residuals))
  stopifnot(is.data.table(x$Frame))
  stopifnot(is.testDistribution(x$testDistribution))
  stopifnot(identical(nrow(x$Residuals), nrow(x$Frame)))
  stopifnot(identical(ncol(x$Residuals), 4L))
  stopifnot(x$Outcome %in% names(x$Frame))

  return(x)
}

#' @rdname residualDiagnostics
#' @export
is.residualDiagnostics <- function(x) {
  inherits(x, "residualDiagnostics")
}

## clear R CMD CHECK notes
if (getRversion() >= "2.15.1")  utils::globalVariables(c("originalindex"))

#' @importFrom stats rstandard residuals fitted coef predict model.frame
#' @importFrom data.table data.table :=
#' @method residualDiagnostics lm
#' @export
#' @rdname residualDiagnostics
#' @examples
#' testm <- stats::lm(mpg ~ hp * factor(cyl), data = mtcars)
#'
#' resm <- residualDiagnostics(testm)
#' plot(resm$testDistribution)
#'
#' resm <- residualDiagnostics(testm, standardized = FALSE)
#' plot(resm$testDistribution)
#'
#' ## clean up
#' rm(testm, resm)
#' \dontrun{
#'
#' testdat <- data.frame(
#'   y = c(1, 2, 2, 3, 3, NA, 9000000, 2, 2, 1),
#'   x = c(1, 2, 3, 4, 5, 6, 5, 4, 3, 2))
#'
#' residualDiagnostics(
#'   lm(y ~ x, data = testdat, na.action = "na.omit"),
#'   ev.perc = .1)$Residuals
#'
#' residualDiagnostics(
#'   lm(y ~ x, data = testdat, na.action = "na.exclude"),
#'   ev.perc = .1)$Residuals
#'
#' residualDiagnostics(
#'   lm(sqrt(mpg) ~ hp, data = mtcars, na.action = "na.omit"),
#'   ev.perc = .1)$Residuals
#' }
residualDiagnostics.lm <- function(object, ev.perc = .001,
                                   robust = FALSE, distr = "normal",
                                   standardized = TRUE, cut = 4L, ...) {
  d.frame <- model.frame(object)
  naaction <- attr(d.frame, "na.action")
  if (isFALSE(is.null(naaction))) {
    if (isTRUE(inherits(naaction, "omit"))) {
      origindex <- index <- 1:(nrow(d.frame) + length(naaction))
      index[naaction] <- NA
      index[-naaction] <- 1:nrow(d.frame)
      key <- data.table(
        originalindex = origindex,
        index = index)[!is.na(index)]
    }
  } else {
    key <- data.table(
      originalindex = seq_len(nrow(d.frame)),
      index = seq_len(nrow(d.frame)))[!is.na(index)]
  }

  d.frame <- as.data.table(d.frame)
  dv <- names(d.frame)[1]

  d.res <- data.table(
    Residuals = if (standardized) rstandard(object) else residuals(object),
    Predicted = fitted(object))

  d.hat <- .quantilePercentiles(
    data = d.res,
    LL = .1, UL = .9,
    cut = cut)

  d.dist <- testDistribution(
    x = d.res$Residuals,
    distr = "normal",
    na.rm = TRUE,
    extremevalues = "theoretical",
    ev.perc = ev.perc,
    use = "complete.obs",
    robust = robust)

  d.res[!is.na(Residuals), isEV := d.dist$Data[order(OriginalOrder), isEV]]
  d.res[, Index := seq_len(.N)]

  ## fix the index to match original data if missing data existed and were omitted
  if (isFALSE(is.null(naaction))) {
    if (isTRUE(inherits(naaction, "omit"))) {
      d.res[, Index := key[, originalindex]]
      d.dist$Data[, OriginalOrder := key[match(OriginalOrder, index), originalindex]]
    }
  }

  out <- list(na.omit(d.res), d.frame, d.hat, d.dist, dv)
  attr(out, "augmentClass") <- "lm"

  as.residualDiagnostics(out)
}

#' Internal Function to Calculate Quantiles
#'
#' Function calculates smoothing spline quantiles
#' or linear quantiles as a fall back. Not intended for general use.
#' Expected predicted and residual data.
#' Exported to support related packages.
#'
#' @param data A dataset of predicted and residual values.
#'   Assumed from some sort of (probably parametric) model.
#' @param LL The lower limit for prediction. Defaults to
#'   \code{.1} to give the 10th percentile.
#' @param UL The upper limit for prediction. Defaults to
#'   \code{.9} to give the 90th percentile.
#' @param na.rm A logical whether to remove missing values.
#'   Defaults to \code{TRUE}
#' @param cut An integer, how many unique predicted values
#'   there have to be at least for it to use quantile regression
#'   or treat the predicted values as discrete.
#'   Defaults to 4.
#' @return A data.table with the scores and predicted LL and UL,
#'   possibly missing if quantile regression models do not
#'   converge.
#' @importFrom quantreg qss rq rqss
#' @importFrom data.table data.table :=
#' @export
.quantilePercentiles <- function(data, LL = .1, UL = .9, na.rm = TRUE, cut = 4L) {
  ## some predictions may be functionally identically but vary very slightly
  ## identify range in predicted values and use this to adjust the rounding
  ## to address slight discrepancies when identifying "unique" values
  delta <- diff(range(data$Predicted, na.rm = na.rm))  
  ## if there are <= cut unique non missing values, treat as discrete
  pred <- round(data$Predicted, digits = ifelse(isTRUE(delta > 1), 8, 16))
  uvals <- unique(na.omit(pred))
  if (length(uvals) <= cut) {
    data$Predicted <- round(data$Predicted, digits = ifelse(isTRUE(delta > 1), 8, 16))
    d.hat <- data[, .(
      LL = quantile(Residuals, probs = LL, na.rm = na.rm),
      UL = quantile(Residuals, probs = UL, na.rm = na.rm),
      cut = TRUE),
      by = Predicted]
  ## if more than cut unique non missing values, use quantile regression
  } else {
    d.hat <- data.table(
      Predicted = seq(
        min(data$Predicted, na.rm = na.rm),
        max(data$Predicted, na.rm = na.rm),
        length.out = 1000))

    tau.LL <- tryCatch(
      rqss(Residuals ~ qss(Predicted, lambda = 1),
           tau = LL, data = data),
      error = function(e) TRUE)
    if (!isTRUE(tau.LL)) {
      tau.UL <- tryCatch(
        rqss(Residuals ~ qss(Predicted, lambda = 1),
             tau = UL, data = data),
        error = function(e) TRUE)
    } else {
      tau.UL <- TRUE
    }
    if (!isTRUE(tau.LL) && !isTRUE(tau.UL)) {
      d.hat[, LL := predict(tau.LL, d.hat)]
      d.hat[, UL := predict(tau.UL, d.hat)]
    }
    if (isTRUE(tau.LL) || isTRUE(tau.UL) ||
          isTRUE(all.equal(d.hat$LL, d.hat$UL))) {
      tau.2LL <- tryCatch(
        rq(Residuals ~ Predicted, tau = LL, data = data),
        error = function(e) TRUE)
      if (!isTRUE(tau.2LL)) {
        tau.2UL <- tryCatch(
          rq(Residuals ~ Predicted, tau = UL, data = data),
          error = function(e) TRUE)
      } else {
        tau.2UL <- TRUE
      }
      if (!isTRUE(tau.2LL) && !isTRUE(tau.2UL)) {
        d.hat[, LL := predict(tau.2LL, d.hat)]
        d.hat[, UL := predict(tau.2UL, d.hat)]
      }
    }
    d.hat[, cut := FALSE]
  }
  if (isTRUE(all.equal(d.hat$LL, d.hat$UL))) {
    d.hat[, LL := NA_real_]
    d.hat[, UL := NA_real_]
  }
  return(d.hat)
}


#' @title Model Diagnostics Functions
#' @description A set of functions to calculate
#'   model diagnostics on models, including constructors,
#'   a generic function, a test of whether an object is of the
#'   \code{modelDiagnostics} class, and methods.
#' @param object A fitted model object, with methods for
#'   \code{model.frame}, \code{resid} and \code{fitted}.
#' @param x An object to test or a list to coerce to a
#'   \code{modelDiagnostics} object.
#' @param ev.perc A real number between 0 and 1 indicating the
#'   proportion of the theoretical distribution beyond which
#'   values are considered extreme values (possible outliers).
#'   Defaults to .001.
#' @param robust Whether to use robust mean and standard deviation estimates
#'   for normal distribution
#' @param distr A character string given the assumed distribution.
#'   Passed on to \code{\link{testDistribution}}.
#'   Defaults to \dQuote{normal}.
#' @param standardized A logical whether to use standardized residuals.
#'   Defaults to \code{TRUE} generally where possible but may depend on
#'   method.
#' @param ... Additional arguments, passed to \code{residualDiagnostics}.
#' @return A logical (\code{is.modelDiagnostics}) or
#'   a modelDiagnostics object (list) for
#'   \code{as.modelDiagnostics} and \code{modelDiagnostics}.
#' @export
#' @rdname modelDiagnostics
modelDiagnostics <- function(object, ...) {
  UseMethod("modelDiagnostics", object)
}


#' @importFrom data.table is.data.table as.data.table
#' @export
#' @rdname modelDiagnostics
as.modelDiagnostics <- function(x) {
  stopifnot(identical(length(x), 3L))
  if (!is.modelDiagnostics(x)) {
    if(!is.list(x)) {
      stop("Input must be a list or a modelDiagnostics object")
    }
    augmentClass <- attr(x, "augmentClass")
    x <- list(
      residualDiagnostics = x[[1]],
      modelDiagnostics = x[[2]],
      extremeValues = x[[3]])
    if (is.null(augmentClass)) {
      class(x) <- "modelDiagnostics"
    } else {
      class(x) <- c(paste0("modelDiagnostics.", augmentClass), "modelDiagnostics")
    }
  }

  stopifnot(identical(length(x), 3L))
  stopifnot(identical(names(x),
                      c("residualDiagnostics",
                        "modelDiagnostics",
                        "extremeValues")))

  stopifnot(is.residualDiagnostics(
    x$residualDiagnostics))

  return(x)
}

#' @export
#' @rdname modelDiagnostics
is.modelDiagnostics <- function(x) {
  inherits(x, "modelDiagnostics")
}

#' @export
#' @rdname modelDiagnostics
#' @examples
#' testm <- stats::lm(mpg ~ hp * factor(cyl), data = mtcars)
#'
#' md <- modelDiagnostics(testm)
#' plot(md$residualDiagnostics$testDistribution)
#' md$extremeValues
#'
#' plot(md)
#'
#' md <- modelDiagnostics(testm, ev.perc = .1)
#' md$extremeValues
#' plot(md, ncol = 2)
#'
#' testdat <- data.frame(
#'   y = c(1, 2, 2, 3, 3, NA, 9000000, 2, 2, 1),
#'   x = c(1, 2, 3, 4, 5, 6, 5, 4, 3, 2))
#'
#' modelDiagnostics(
#'   lm(y ~ x, data = testdat, na.action = "na.omit"),
#'   ev.perc = .1)$extremeValues
#'
#' modelDiagnostics(
#'   lm(y ~ x, data = testdat, na.action = "na.exclude"),
#'   ev.perc = .1)$extremeValues
#'
#' ## clean up
#' rm(testm, md, testdat)
modelDiagnostics.lm <- function(object, ev.perc = .001,
                                   robust = FALSE, distr = "normal",
                                   standardized = TRUE, ...) {

  x <- residualDiagnostics(object,
                           ev.perc = ev.perc,
                           robust = robust,
                           distr = distr,
                           standardized = standardized,
                           ...)

  ## data for outliers
  d.extreme <- data.table(dv = NA_real_,
                          Index = NA_integer_,
                          EffectType = NA_character_)
  setnames(d.extreme, names(d.extreme), c(x$Outcome, "Index", "EffectType"))
  if ("EffectType" %in% x$Outcome) {
    stop("EffectType is used internally and cannot be a variable in the model")
  }

  x$Frame <- cbind(
    x$Frame, x$testDistribution$Data[order(OriginalOrder), .(isEV, OriginalOrder)])
  setnames(x$Frame, "OriginalOrder", "Index")
  x$Frame[, EffectType := "Residuals"]

  if (isTRUE(any(x$Frame[, isEV] == "Yes"))) {
    d.extreme <- rbind(
      d.extreme,
      x$Frame[isEV == "Yes",
              c(x$Outcome, "Index", "EffectType"),
              with = FALSE])

  }

  ## fix the frame to remove extra information
  x$Frame[, isEV := NULL]
  x$Frame[, Index := NULL]
  x$Frame[, EffectType := NULL]

  out <- list(x, NA_real_, na.omit(d.extreme))
  attr(out, "augmentClass") <- "lm"

  as.modelDiagnostics(out)
}
JWiley/JWileymisc documentation built on Feb. 15, 2024, 12:23 p.m.