cv-extend.R
In cv: Cross-Validating Regression Models

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  message = TRUE,
  warning = TRUE,
  fig.align = "center",
  fig.height = 6,
  fig.width = 7,
  fig.path = "fig/",
  dev = "png",
  comment = "#>" #,
  # eval = nzchar(Sys.getenv("REBUILD_VIGNETTES"))
)

# save some typing
knitr::set_alias(w = "fig.width",
                 h = "fig.height",
                 cap = "fig.cap")

# colorize text: use inline as `r colorize(text, color)`
colorize <- function(x, color) {
  if (knitr::is_latex_output()) {
    sprintf("\\textcolor{%s}{%s}", color, x)
  } else if (knitr::is_html_output()) {
    sprintf("<span style='color: %s;'>%s</span>", color,
      x)
  } else x
}


.opts <- options(digits = 5)

## ----AUCcomp------------------------------------------------------------------
AUCcomp <- function(y, yhat) 1 - Metrics::auc(y, yhat)

## ----Mroz-regression----------------------------------------------------------
data("Mroz", package="carData")
m.mroz <- glm(lfp ~ ., data=Mroz, family=binomial)
summary(m.mroz)

AUCcomp(with(Mroz, as.numeric(lfp == "yes")), fitted(m.mroz))

## ----Mroz-CV-ROC--------------------------------------------------------------
library("cv")
cv(m.mroz, criterion=AUCcomp, seed=3639)

## -----------------------------------------------------------------------------
mse

cv:::getLossFn(mse(rnorm(100), rnorm(100)))

## ----BEPS-data----------------------------------------------------------------
data("BEPS", package="carData")
head(BEPS)

## ----BEPS-model---------------------------------------------------------------
library("nnet")
m.beps <- multinom(
  vote ~ age + gender + economic.cond.national +
    economic.cond.household + Blair + Hague + Kennedy +
    Europe * political.knowledge,
  data = BEPS
)

car::Anova(m.beps)

## ----BEPS-plot, fig.width=9, fig.height=5-------------------------------------
plot(
  effects::Effect(
    c("Europe", "political.knowledge"),
    m.beps,
    xlevels = list(Europe = 1:11, political.knowledge = 0:3),
    fixed.predictors = list(given.values = c(gendermale = 0.5))
  ),
  lines = list(col = c("blue", "red", "orange")),
  axes = list(x = list(rug = FALSE), y = list(style = "stacked"))
)

## ----BayesRuleMulti-----------------------------------------------------------
head(BEPS$vote)
yhat <- predict(m.beps, type = "class")
head(yhat)

BayesRuleMulti <- function(y, yhat) {
  result <- mean(y != yhat)
  attr(result, "casewise loss") <- "y != yhat"
  result
}

BayesRuleMulti(BEPS$vote, yhat)

## ----BEPS-response-distribution-----------------------------------------------
xtabs(~ vote, data=BEPS)/nrow(BEPS)

## ----BEPS-test-default, error=TRUE--------------------------------------------
cv(m.beps, seed=3465, criterion=BayesRuleMulti)

## ----GetResponse.multinom-----------------------------------------------------
GetResponse.multinom <- function(model, ...) {
  insight::get_response(model)
}

head(GetResponse(m.beps))

## ----BEPS-test-default-2, error=TRUE------------------------------------------
cv(m.beps, seed=3465, criterion=BayesRuleMulti)

## ----cv.nultinom--------------------------------------------------------------
cv.multinom <-
  function (model, data, criterion = BayesRuleMulti, k, reps,
            seed, ...) {
    model <- update(model, trace = FALSE)
    NextMethod(
      type = "class",
      criterion = criterion,
      criterion.name = deparse(substitute(criterion))
    )
  }

## ----BEPS-cv------------------------------------------------------------------
summary(cv(m.beps, seed=3465))

## ----cv.multinom-alternative--------------------------------------------------
cv.multinom <- function(model,
                        data = insight::get_data(model),
                        criterion = BayesRuleMulti,
                        k = 10,
                        reps = 1,
                        seed = NULL,
                        details = k <= 10,
                        confint = n >= 400,
                        level = 0.95,
                        ncores = 1,
                        start = FALSE,
                        ...) {
  f <- function(i) {
    # helper function to compute to compute fitted values,
    #  etc., for each fold i
    
    indices.i <- fold(folds, i)
    model.i <- if (start) {
      update(model,
             data = data[-indices.i,],
             start = b,
             trace = FALSE)
    } else {
      update(model, data = data[-indices.i,], trace = FALSE)
    }
    fit.all.i <- predict(model.i, newdata = data, type = "class")
    fit.i <- fit.all.i[indices.i]
    # returns:
    #  fit.i: fitted values for the i-th fold
    #  crit.all.i: CV criterion for all cases based on model with
    #              i-th fold omitted
    #  coef.i: coefficients for the model with i-th fold omitted
    list(
      fit.i = fit.i,
      crit.all.i = criterion(y, fit.all.i),
      coef.i = coef(model.i)
    )
  }
  
  fPara <- function(i, multinom, ...) {
    # helper function for parallel computation
    #   argument multinom makes multinom() locally available
    #   ... is necessary but not used
    indices.i <- fold(folds, i)
    model.i <- if (start) {
      update(model,
             data = data[-indices.i,],
             start = b,
             trace = FALSE)
    } else {
      update(model, data = data[-indices.i,], trace = FALSE)
    }
    fit.all.i <- predict(model.i, newdata = data, type = "class")
    fit.i <- fit.all.i[indices.i]
    list(
      fit.i = fit.i,
      crit.all.i = criterion(y, fit.all.i),
      coef.i = coef(model.i)
    )
  }
  
  n <- nrow(data)
  
  # see ?cvCompute for definitions of arguments
  cvCompute(
    model = model,
    data = data,
    criterion = criterion,
    criterion.name = deparse(substitute(criterion)),
    k = k,
    reps = reps,
    seed = seed,
    details = details,
    confint = confint,
    level = level,
    ncores = ncores,
    type = "class",
    start = start,
    f = f,
    fPara = fPara,
    multinom = nnet::multinom
  )
}

## ----BEPS-cv-alt-version------------------------------------------------------
summary(cv(m.beps, seed=3465))

## ----cv.lme-------------------------------------------------------------------
cv:::cv.lme

## ----GetResponse.glmmPQL------------------------------------------------------
GetResponse.glmmPQL <- function(model, ...) {
  f <- formula(model)
  f[[3]] <- 1 # regression constant only on RHS
  model <-
    suppressWarnings(glm(
      f,
      data = model$data,
      family = model$family,
      control = list(maxit = 1)
    ))
  cv::GetResponse(model)
}

## ----cv.glmmPQL---------------------------------------------------------------
cv.glmmPQL <- function(model,
                       data = model$data,
                       criterion = mse,
                       k,
                       reps = 1,
                       seed,
                       ncores = 1,
                       clusterVariables,
                       ...) {
  cvMixed(
    model,
    package = "MASS",
    data = data,
    criterion = criterion,
    k = k,
    reps = reps,
    seed = seed,
    ncores = ncores,
    clusterVariables = clusterVariables,
    predict.clusters.args = list(
      object = model,
      newdata = data,
      level = 0,
      type = "response"
    ),
    predict.cases.args = list(
      object = model,
      newdata = data,
      level = 1,
      type = "response"
    ),
    fixed.effects = nlme::fixef, 
    verbose = FALSE,
    ...
  )
}

## ----glmmPQL-example----------------------------------------------------------
library("MASS")
m.pql <- glmmPQL(
  y ~ trt + I(week > 2),
  random = ~ 1 | ID,
  family = binomial,
  data = bacteria
)
summary(m.pql)

## ----compare-to-lme4----------------------------------------------------------
library("lme4")
m.glmer <- glmer(y ~ trt + I(week > 2) + (1 | ID),
                 family = binomial, data = bacteria)
summary(m.glmer)

# comparison of fixed effects:
car::compareCoefs(m.pql, m.glmer) 

## ----cv-example-glmmPQL, cache=TRUE-------------------------------------------
summary(cv(m.pql, clusterVariables="ID", criterion=BayesRule))

summary(cv(m.pql, data=bacteria, criterion=BayesRule, seed=1490))

## ----cv-example-glmer, cache=TRUE---------------------------------------------
summary(cv(m.glmer, clusterVariables="ID", criterion=BayesRule))

summary(cv(m.glmer, data=bacteria, criterion=BayesRule, seed=1490))

## ----swiss--------------------------------------------------------------------
library("leaps")
head(swiss)
nrow(swiss)

## ----swiss-lm-----------------------------------------------------------------
m.swiss <- lm(Fertility ~ ., data=swiss)
summary(m.swiss)

summary(cv(m.swiss, seed=8433))

## ----subset-selection---------------------------------------------------------
swiss.sub <- regsubsets(Fertility ~ ., data=swiss)
summary(swiss.sub)

(bics <- summary(swiss.sub)$bic)
which.min(bics)

car::subsets(swiss.sub, legend="topright")

## ----best-model---------------------------------------------------------------
m.best <- update(m.swiss, . ~ . - Examination)
summary(m.best)

summary(cv(m.best, seed=8433)) # use same folds as before

## ----selectSubsets------------------------------------------------------------
selectSubsets <- function(data = insight::get_data(model),
                          model,
                          indices,
                          criterion = mse,
                          details = TRUE,
                          seed,
                          save.model = FALSE,
                          ...) {
  if (inherits(model, "lm", which = TRUE) != 1)
    stop("selectSubsets is appropriate only for 'lm' models")
  
  y <- GetResponse(model)
  formula <- formula(model)
  X <- model.matrix(model)
  
  if (missing(indices)) {
    if (missing(seed) || is.null(seed))
      seed <- sample(1e6, 1L)
    # select the best model from the full data by BIC
    sel <- leaps::regsubsets(formula, data = data, ...)
    bics <- summary(sel)$bic
    best <- coef(sel, 1:length(bics))[[which.min(bics)]]
    x.names <- names(best)
    # fit the best model; intercept is already in X, hence - 1:
    m.best <- lm(y ~ X[, x.names] - 1)
    fit.all <- predict(m.best, newdata = data)
    return(list(
      criterion = criterion(y, fit.all),
      model = if (save.model)
        m.best # return best model
      else
        NULL
    ))
  }
  
  # select the best model omitting the i-th fold (given by indices)
  sel.i <- leaps::regsubsets(formula, data[-indices,], ...)
  bics.i <- summary(sel.i)$bic
  best.i <- coef(sel.i, 1:length(bics.i))[[which.min(bics.i)]]
  x.names.i <- names(best.i)
  m.best.i <- lm(y[-indices] ~ X[-indices, x.names.i] - 1)
  # predict() doesn't work here:
  fit.all.i <- as.vector(X[, x.names.i] %*% coef(m.best.i))
  fit.i <- fit.all.i[indices]
  # return the fitted values for i-th fold, CV criterion for all cases,
  #   and the regression coefficients
  list(
    fit.i = fit.i,
    # fitted values for i-th fold
    crit.all.i = criterion(y, fit.all.i),
    # CV crit for all cases
    coefficients = if (details) {
      # regression coefficients
      coefs <- coef(m.best.i)
      
      # fix coefficient names
      names(coefs) <- sub("X\\[-indices, x.names.i\\]", "",
                          names(coefs))
      
      coefs
    }  else {
      NULL
    }
  )
}

## ----test-selectSubsets-------------------------------------------------------
selectSubsets(model=m.swiss)

## ----test-selectSubsets-fold--------------------------------------------------
selectSubsets(model=m.swiss, indices=seq(5, 45, by=10))

## ----cvSelect-selectSubsets---------------------------------------------------
cv.swiss <- cv(
  selectSubsets,
  working.model = m.swiss,
  data = swiss,
  seed = 8433 # use same folds
)
summary(cv.swiss)

## ----best-models-by-folds-----------------------------------------------------
compareFolds(cv.swiss)