R/regressionDiagnostics.R

Defines functions dfBetas hadivalues reglev vitals dfnorm dfcond

Documented in dfBetas hadivalues reglev vitals

#' Assess influence of observations on parameter estimates for OLS & GLMs
#'
#' @description DFBETAs measure the stability of a model's parameter estimates. They aim to
#' assess whether any observations exert strong leverage over a parameter estimate. The DFBETAs
#' for the ith observation are the differences between the full-data parameter estimates and the
#' parameter estimates with the ith row of observations deleted: \deqn{\hat{\beta}_j - \tilde{\beta}_{(-i, j)}}
#' A DFBETA for an observation-variable pair with absolute value greater than 2/sqrt(n) is taken to
#' indicate that the ith observation causes potential instability in the estimate for the jth variable.
#'
#' @param formula a model formula
#' @param data a data frame
#' @param family a character string identifying the GLM family to use. The default is "gaussian".
#' @param plot whether to plot. Defaults to TRUE; if FALSE, a data frame of the DFBETAs is returned.
#' @param which which variable to plot. Defaults to "plotall",
#' which creates a facet plot with ggplot2. Otherwise, a specific variable's name can be entered, such as
#' "Sepal.Length", and a single plot will be created with base graphics.
#' @param nrows optionally, the number of rows for the facet plot.
#' @param ncols optionally, the number of columns for the facet plot.
#'
#' @return a plot or a data frame.
#' @export
#'
#' @references
#' Belsley, D.A., Kuh, E., & Welsch, R.E. (1980) Regression Diagnostics: Identifying Influential Data and Sources of Collinearity. Wiley. pp. 13-14.
#'
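#' @examples
#' ## A minimal usage sketch on the built-in iris data:
#' dfBetas(Sepal.Length ~ Sepal.Width + Petal.Length, iris)
#' ## return the DFBETAs as a data frame instead of plotting
#' head(dfBetas(Sepal.Length ~ Sepal.Width + Petal.Length, iris, plot = FALSE))
#'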
dfBetas <- function(formula, data,
                    family = c("gaussian", "poisson", "binomial", "quasibinomial",
                               "quasipoisson", "inverse.gaussian", "Gamma"),
                    plot = TRUE, which = "plotall", nrows = NULL, ncols = NULL){
  family <- match.arg(family)
  x <- model.matrix(formula, Scale(data))
  betas <- matrix(0, nrow = nrow(x), ncol = ncol(x))
  if (family == "gaussian"){
    y <- model.response(model.frame(formula, Scale(data)))
    # leave-one-out OLS coefficients via the pseudoinverse of X'X
    for (i in 1:nrow(x)){
      betas[i,] <- t(pseudoinverse(crossprod(x[-i,])) %*% crossprod(x[-i,], y[-i]))
    }
    # subtract the full-data estimates from each leave-one-out fit
    betas <- sweep(betas, 2, as.vector(pseudoinverse(crossprod(x)) %*% crossprod(x, y)), "-")
  }
  else{
    y <- model.response(model.frame(formula, data))
    if (family=="binomial" || family=="quasibinomial"){
      if(!is.numeric(y)) y <- as.numeric(as.factor(y))
      if (min(y)==1) y <- y-1
    }
    # the quasi-likelihood families give the same coefficient estimates as their
    # likelihood counterparts, but avoid non-integer-response warnings during
    # the leave-one-out refits
    family <- switch(family,
                     "gaussian" = gaussian(),
                     "poisson" = quasipoisson(),
                     "quasipoisson" = quasipoisson(),
                     "binomial" = quasibinomial(),
                     "quasibinomial" = quasibinomial(),
                     "Gamma" = Gamma(),
                     "inverse.gaussian" = inverse.gaussian())

    # leave-one-out GLM refits
    for (i in 1:nrow(x)){
      betas[i,] <- glm.fit(x[-i,], y[-i], family = family)$coef
    }
    # subtract the full-data estimates from each leave-one-out fit
    betas <- sweep(betas, 2, glm.fit(x, y, family = family)$coef, "-")
  }
  colnames(betas) <- colnames(x)
  betas <- as.data.frame(betas)
  if (plot){
    if (which=="plotall"){
      tibble::rownames_to_column(betas) %>%
        tidyr::pivot_longer(cols = -rowname, names_to = "variable", values_to = "dfbeta") %>%
        dplyr::mutate(rowname = as.numeric(rowname),
                      variable = as.factor(variable)) %>%
        ggplot(aes(x = rowname, y = dfbeta, color = variable)) +
        facet_wrap(~ variable, ncol = ncols, nrow = nrows) +
        geom_line() +
        geom_hline(yintercept = 2 / sqrt(nrow(betas)), color = "red", size = 1) +
        geom_hline(yintercept = -2 / sqrt(nrow(betas)), color = "red", size = 1)
    }
    else{
      thr <- 2 / sqrt(nrow(betas))
      plot(betas[,which], type = "l", col = "#6d22ecCC",
           ylim = c(min(betas[,which],-thr*1.05), max(betas[,which],thr*1.05)), ylab = which)
      abline(a =  thr, b = 0, col = "red")
      abline(a = -thr, b = 0, col = "red")
    }
  }
  else{
    return(betas)
  }
}


#' Hadi's influence measure
#'
#' @description Hadi's measure is based on the fact that influential observations
#' can be present as regression outliers in the response variable,
#' leverage points in the design matrix, or both. Motivated by similar
#' considerations as Schweppe weights, Hadi's measure allows for the identification of
#' observations that are in some way unduly influential on a regression fit.
#' It can be used for linear models as well as generalized linear models fit with
#' lm() or glm().
#'
#' @param fit the model fit (can be an lm or glm object)
#' @param plot if TRUE (the default) a plot is generated. Otherwise, a vector of the Hadi values
#' is returned.
#'
#' @return a plot or a vector.
#' @export
#' @references
#' Hadi, A. S. (1992). A new measure of overall potential influence in linear regression. Computational Statistics & Data Analysis, 14(1), 1–27. doi:10.1016/0167-9473(92)90078-t \cr \cr
#' Chatterjee, S. and Hadi, A. (2012) Regression Analysis by Example. 5th ed.
#'
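#' @examples
#' ## A minimal usage sketch with a base lm fit on the built-in iris data:
#' fit <- lm(Sepal.Length ~ ., iris)
#' hadivalues(fit)                       # index plot of Hadi's influence measure
#' head(hadivalues(fit, plot = FALSE))   # the values themselves
#'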
hadivalues <- function(fit, plot = TRUE){

  # potential component: leverage relative to its complement, h / (1 - h)
  .pot <- function(fit) {
    lev <- hatvalues(fit)
    lev / (1 - lev)
  }
  # residual component: normalized squared residuals weighted by (p + 1) / (1 - h)
  .res <- function(fit) {
    pii <- 1 - hatvalues(fit)
    p <- fit$rank - 1
    aov_m <- anova(fit)
    den <- sqrt(aov_m[length(aov_m$Df), 2])  # square root of the residual sum of squares
    dii <- (fit$residuals / den)^2           # normalized squared residuals
    ((p + 1) / pii) * (dii / (1 - dii))
  }
  hadi <- .pot(fit) + .res(fit)
  if(plot){
    plot(hadi, type="h", col = "#3f2752CC", xlab = "observation")
    # reference line at the Harrell-Davis estimate of the 90th percentile
    abline(hdquantile(hadi, 0.90), 0, col = "red", lwd = 1.25)
  }
  else{
    hadi
  }
}


#' Identify good and bad leverage points and regression outliers
#'
#' @description Search for good and bad leverage points using the method advocated by Rousseeuw and van Zomeren (1990).
#' This supports regression on numeric outcomes assumed to follow a Gaussian-like distribution (i.e., not generalized linear models).
#' Several options for calculating the robust mahalanobis distances and regression residuals are supported.
#'
#' @param formula formula
#' @param data data frame
#' @param plot should it be plotted? defaults to TRUE.
#' @param interact if TRUE, you can click on points to identify the observation number to which a given point corresponds.
#' @param cov.method The options are as follows: \cr \cr
#' - "mvv" - Minimum Variance Vector. The default option. \cr
#' - "mcd" - Minimum (Regularized) Covariance Determinant \cr
#' - "mgv" - Minimum Generalized Variance \cr
#' - "stu" - Student's T Covariance Matrix \cr
#' @param reg.method The options are as follows: \cr \cr
#' - "lqd" - Least Quantile Differences Regression GS-Estimator \cr
#' - "lta" - Least Trimmed Absolute Residuals Regression S-Estimator \cr
#' - "ts"  - Theil-Sen estimator \cr
#' - "lad" - Least Absolute Deviations Regression Estimator \cr
#' - "ols" - Ordinary Least Squares Non-Robust Regression Estimator
#' @return a plot or a list.
#' @export
#'
#' @references
#' Rousseeuw, P.J.; van Zomeren, B.C. (1990) Unmasking Multivariate Outliers and Leverage Points. Journal of the American Statistical Association. 85(411):633-639 doi:10.1080/01621459.1990.10474920
#'
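#' @examples
#' ## A minimal usage sketch with the default estimators ("mvv" covariance,
#' ## "lqd" regression) on the built-in iris data:
#' reglev(Sepal.Length ~ Sepal.Width + Petal.Length, iris)
#' ## return the diagnostics as a list instead of plotting
#' out <- reglev(Sepal.Length ~ Sepal.Width + Petal.Length, iris, plot = FALSE)
#' out$levpoints; out$outliers
#'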
reglev <- function(formula, data, plot = TRUE, interact = FALSE,
                   cov.method = c("mvv", "mcd", "mgv", "stu"),
                   reg.method = c("lqd", "lta", "ts", "lad", "ols")){

  cov.method <- match.arg(cov.method)
  reg.method <- match.arg(reg.method)
  x <- model.matrix(formula, data)
  # drop the intercept column, keeping x a matrix even with one predictor
  if (all(x[, 1] == 1)) x <- x[, -1, drop = FALSE]
  y <- as.vector(model.response(model.frame(formula, data)))
  p <- ncol(x); n <- nrow(x)
  plot <- as.logical(plot)

  res <- switch(reg.method,
                "lqd" = roblm.lqd(formula, data),
                "lta" = roblm.lta(formula, data),
                "ts" = roblm.ts(formula, data),
                "lad" = qreg(formula,data, q=0.5),
                "ols" = lm(formula, data)
  )
  # standardize residuals by a robust scale estimate (MAD scaled for consistency at the normal)
  stanres <- res$residuals / (cvreg:::HDMAD(res$residuals) * 1.4826)
  if (ncol(x) >= 2) {
    mve <- switch(cov.method,
                  "mvv" = cov.mvv(x),
                  "mcd" = cov.mrcd(x),
                  "mgv" = cov.mrgv(x),
                  "stu" = cov.st(x))
  } else {
    # univariate case: a robust center and scale stand in for the covariance
    mve <- list(center = hdmedian(x), cov = (cvreg:::HDMAD(x) * 1.4826)^2)
  }
  dis <- sqrt(mahalanobis_dist(x, mve$center, mve$cov))
  # approximate critical value for the robust distances (Rousseeuw & van Zomeren, 1990)
  if (p <= 10) pcrit <- (0.241 - 0.0029 * p) / sqrt(n) else pcrit <- (0.252 - 0.0018 * p) / sqrt(n)
  crit <- sqrt(qchisq(1 - pcrit, p))
  chk <- (dis > crit)
  vec <- 1:nrow(x)
  id <- vec[chk == 1]                          # leverage points
  chkreg <- ifelse(abs(stanres) > 2, 1, 0)
  idreg <- vec[chkreg == 1]                    # regression outliers
  if(plot){
    plot(dis, stanres, xlab = "MD", ylab = "Std. Res.", col = "#ffffffCC",
         ylim = c(min(stanres, -3), max(stanres, 3)), xlim = c(0, max(dis)))

    # classify each point: bad leverage point ("lev.out"), leverage point ("lev"),
    # regression outlier ("regout"), or clean
    df <- cbind.data.frame(lev = as.numeric(chk), out = ifelse(abs(stanres) > 2, 1, 0))
    df$lev.out <- rep("clean", nrow(df))
    for (i in 1:nrow(df)){
      if (df$lev[i] + df$out[i] == 2) {df$lev.out[i] <- "lev.out"}
      if (df$lev[i] + df$out[i] == 1 && df$lev[i] == 1) {df$lev.out[i] <- "lev"}
      if (df$lev[i] + df$out[i] == 1 && df$out[i] == 1) {df$lev.out[i] <- "regout"}
    }
    points(dis[df$lev.out=="lev.out"], stanres[df$lev.out=="lev.out"], pch = 25, col = "#640000D9", bg= "#bd0b2cF2")
    points(dis[df$lev.out=="lev"], stanres[df$lev.out=="lev"], pch = 23, col = "#142050D9", bg = "#8e22f6CC")
    points(dis[df$lev.out=="regout"], stanres[df$lev.out=="regout"], pch = 24, col = "#8a3808E6", bg = "#e76609F2")
    points(dis[df$lev.out=="clean"], stanres[df$lev.out=="clean"], pch = 21, col = "#153d8cD9", bg = "#2988d6B9")

    abline(-2,0,col="#450e0eCC",lwd=2)
    abline(2,0,col="#450e0eCC",lwd=2)
    abline(v=crit,col="#450e0eCC",lwd=2)
    if(interact) {identify(dis,stanres)}
  }
  else{
    list(levpoints=id,outliers=idreg,mahal.dist = dis,std.resid=stanres,critval=crit)
  }
}


#' Quickly calculate the rank, condition, positive definiteness, and VIFs for a model
#'
#' @param formula a model formula
#' @param data a data frame
#' @param family The glm family being used (required for the VIFs calculation). One of "gaussian" (the default), "binomial", "Gamma", "inverse.gaussian", "poisson", "quasi", "quasibinomial", or "quasipoisson"
#'
#' @description This lets you quickly and easily calculate a few important things to know about a model matrix. \cr
#' \cr
#' The first is the column rank of the model matrix. If this returns a value other than the number of columns, full.rank will display FALSE.\cr
#' \cr
#' The second is the ratio of the maximum to minimum singular value of the model matrix, which gives the condition number, C.
#' A large condition number indicates the model is ill-conditioned. This means
#' a linear regression will be extremely sensitive to even the smallest errors and give untrustworthy results.
#' In the worst case, if C is infinite the model matrix is singular and shrinkage methods must be used to
#' obtain a solution at all. Alternatively, the condition number can be defined as the ratio of the minimum to maximum singular
#' value of the model matrix, Cinv. Here, larger is better.  \cr
#' \cr
#' A rough estimate of how many accurate digits the estimated y values will have can be calculated manually via the formula mean(D) - log10(C),
#' where mean(D) is the average number of decimal digits in the entries of the vector y.
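#' For example, if the entries of y carry five decimal digits on average and C = 1000, roughly
#' 5 - log10(1000) = 2 digits of the estimated values can be trusted.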
#' \cr
#' The third 'vital sign' is a check for positive definiteness of the covariance matrix, cov(Matrix).
#' If the covariance matrix has zero or negative eigenvalues, it will fail the positive definiteness check.
#' \cr
#' \cr
#' Also calculated are the variance inflation factors (VIFs). These indicate the factor by which the standard error of
#' a variable is inflated due to correlation with other variables. A common threshold for a bad VIF is 5. Other commonly used
#' thresholds are 3, 6, 8, and 10. However, these should be interpreted with some degree of caution. If the model has
#' a sufficiently low condition number and passes all of the other checks, a decent fit may still be obtained (O'Brien, 2007).
#' \cr
#' \cr
#' A sufficiently bad condition number, a lack of positive-definiteness, or a
#' lack of full rank can, in the worst case, mean the model matrix is non-invertible and OLS or GLMs will not work.
#' Shrinkage methods will have to be used to fit the model.  Fortunately, this package provides a great number of regularized regression models.
#' Otherwise, sufficiently bad values can result in
#' the model being "ill-posed", which means one of the three following conditions of a well-posed mathematical problem
#' are violated: \cr
#' \cr
#' 1. A solution exists \cr
#' 2. A unique solution exists \cr
#' 3. The output of a function changes continuously with the input(s) \cr
#' \cr
#' \cr
#' @return
#' a list
#' @export
#' @examples
#' vitals(Sepal.Width ~ ., iris)
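#' ## the condition number reported as Cn can also be checked directly from the
#' ## singular values of the design matrix:
#' d <- svd(model.matrix(Sepal.Width ~ ., iris)[, -1])$d
#' max(d) / min(d)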
#'
#' @references O'Brien, R.M. (2007) A Caution Regarding Rules of Thumb for Variance Inflation Factors. Qual Quant 41: 673. https://doi.org/10.1007/s11135-006-9018-6
#'
vitals = function(formula = NULL, data = NULL, family = "gaussian"){

  # helper: count the decimal digits of a number (relates to the digits
  # heuristic, mean(D) - log10(C), described above; not applied automatically)
  sigdig = function(x)
  {
    str <- as.character(x)
    if (is.na(strsplit(str, "\\.")[[1]][2])) return(0)
    else return(nchar(strsplit(str, "\\.")[[1]][2]))
  }

  calculate.vifs =  function (mod, ...)
  {
    if (any(is.na(coef(mod))))
      stop("There are perfectly collinear predictors in the model. Cannot calculate VIFs, as the variance inflation is infinite.")
    v <- vcov(mod)
    assign <- attr(model.matrix(mod), "assign")
    if (names(coefficients(mod)[1]) == "(Intercept)") {
      v <- v[-1, -1]
      assign <- assign[-1]
    }
    else warning("No intercept: VIFs may not be appropriate.")
    terms <- labels(terms(mod))
    n.terms <- length(terms)
    if (n.terms < 2)
      stop("model contains fewer than 2 terms")
    R <- cov2cor(v)
    detR <- det(R)
    result <- matrix(0, n.terms, 3)
    rownames(result) <- terms
    colnames(result) <- c("GVIF", "Df", "GVIF^(1/(2*Df))")
    for (term in 1:n.terms) {
      subs <- which(assign == term)
      result[term, 1] <- det(as.matrix(R[subs, subs])) * det(as.matrix(R[-subs,-subs]))/detR
      result[term, 2] <- length(subs)
    }
    if (all(result[, 2] == 1))
      result <- result[, 1]
    else result[, 3] <- result[, 1]^(1 / (2 * result[, 2]))
    result
  }

    matrix = as.matrix(model.matrix(formula, data))[, -1, drop = FALSE]
    y = model.frame(formula, data)[,1]
    d = svd(matrix)$d
    rank = suppressWarnings(suppressMessages(Matrix::rankMatrix(matrix, method = "qrLINPACK")))
    full.rank = isTRUE(ncol(matrix) - rank == 0)

    if (isTRUE(full.rank)){
      model = glm(formula = formula, data = data, family = family)
      vifs = calculate.vifs(model)
    } else {
      vifs = "Cannot calculate VIFs, model is not full rank."
    }


    df = c(
        rank =  format(round(rank,3), nsmall = 3),
        full.rank = full.rank,
        Cn = format(round(max(d) / min(d),3),nsmall=3),
        Cn_inv = format(round(min(d) / max(d),5),nsmall=5),
        pos.def = pdcheck(cov(matrix))
        )

    cat(crayon::blue("Design Matrix Information: \n"))
    print(noquote(df))
    cat(crayon::blue("\nVariance Inflation Factors: \n"))
    if (is.character(vifs))
      print(noquote(vifs))
    else
      print(noquote(format(round(vifs, digits = 3), nsmall = 3)))
}


# Leave-one-out influence on the norm of the design matrix: for each observation,
# plot the change in norm(x) when that row is deleted.
dfnorm <- function(x, type = c("F", "O", "I", "M", "2")){
  type <- match.arg(type)
  full.norm <- norm(x, type = type)
  norms <- vector()
  for (i in 1:nrow(x)){
    norms[i] <- norm(x[-i, , drop = FALSE], type = type) - full.norm
  }
  plot(norms, type = "h")
}

# Influence of each column on the condition number of x, computed as the ratio
# of the largest to smallest eigenvalue of the centered crossproduct.
dfcond <- function(x, plot = TRUE, tol = 1e-6){
  eigenvals <- function(x){(svd(sweep(x, 2, colMeans(x), "-"))$d^2) / nrow(x)}
  condnum <- function(x){ev <- eigenvals(x); if (min(ev) <= tol) Inf else max(ev) / min(ev)}
  full.cond <- condnum(x)
  conditions <- vector()
  for (i in 1:ncol(x)){
    conditions[i] <- condnum(x[, -i, drop = FALSE]) - full.cond
  }
  if (plot){
    plot(conditions, type = "h", lwd = 1.25, col = ifelse(sign(conditions) >= 0, "red", "blue"))
  } else {
    return(conditions)
  }
}