R/commonglm.R

Defines functions .isInterceptOnly .durbinWatsonTest.default .durbinWatsonTest.lm .reglogisticSetError .reglogisticVovkSellke .glmInfluenceTableFill .glmInfluenceTable .glmRobustSE .invLogit .glmLogRegRibbon .predLevel .formatTerm .brierScore .confusionMatrix .stdEst .bic .coxSnellCompute .coxSnell .tjur .nagelkerke .mcFadden .hasNuisance .lrtest .confusionMatAddColInfo .messagePvalAdjustment .emmMessageAveragedOver .emmMessageTestNull .vif.default .capitalize .constInfoTransName .constInfoTransform .regressionExportResiduals .glmFillPlotResQQ .glmPlotResQQ .glmInsertPlot .glmStdResidCompute .glmCreatePlotPlaceholder .hasNuisance .glmFitErrorMessageHelper .glmComputeModel .is.wholenumber .glmStep .createNullFormula .createLogisticRegFormula .createGLMFormula .reglogisticComputeModel

# Compute Model
.reglogisticComputeModel <- function(jaspResults, dataset, options, ready) {
  if(!is.null(jaspResults[["glmRes"]]) || isFALSE(ready))
    return()

    # Logistic regression
  modelTerms <- options[["modelTerms"]]
  dependent <- options[["dependent"]]
  interceptTerm <- options[["interceptTerm"]]
  if (options[["method"]] == "enter") {
    
    glmRes <- list()
    for (modelIndex in seq_along(modelTerms)) {

      model <- glm(.createLogisticRegFormula(modelTerms[[modelIndex]], dependent, interceptTerm), 
                  family = "binomial", dataset)
      glmRes[[modelIndex]] <- model
      
    }
    
  } else if (options[["method"]] != "enter") {
    nullMod <- glm(.createLogisticRegFormula(modelTerms[[1]], dependent, interceptTerm), 
                   family = "binomial", dataset)
    fullMod <- glm(.createLogisticRegFormula(modelTerms[[length(modelTerms)]], dependent, interceptTerm), 
                   family = "binomial", dataset)
    glmRes  <- .glmStep(nullMod, fullMod, dataset, method = options$method)
  }

  jaspResults[["glmRes"]] <- createJaspState(glmRes)
  jaspResults[["glmRes"]]$dependOn( c("dependent", "method", "modelTerms", "interceptTerm"))
  return(glmRes)
}

# create GLM formula
.createGLMFormula <- function(options, nullModel = FALSE) {
  
  leftTerm <- options$dependent
  modelTerms <- options$modelTerms
  includeIntercept <- options$interceptTerm
  offsetTerm <- options$offset
  
  if (includeIntercept)
    rightTerms <- "1"
  else
    rightTerms <- "-1"
  
  if (offsetTerm != "")
    rightTerms <- c(rightTerms, paste("offset(", offsetTerm, ")", sep = ""))
  
  if (length(modelTerms) == 0) {
    f <- formula(paste(leftTerm, "~", rightTerms))
  } else {
    
    if (nullModel) {
      for (i in seq_along(modelTerms)) {
        nuisance <- modelTerms[[i]][["isNuisance"]]
        if (!is.null(nuisance) && nuisance) {
          term <- modelTerms[[i]][["components"]]
          if (length(term) == 1)
            rightTerms <- c(rightTerms, term)
          else
            rightTerms <- c(rightTerms, paste(term, collapse = ":"))
        }
      }
      
    } else {
      for (i in seq_along(modelTerms)) {
        term <- modelTerms[[i]][["components"]]
        if (length(term) == 1)
          rightTerms <- c(rightTerms, term)
        else
          rightTerms <- c(rightTerms, paste(term, collapse = ":"))
      }
    }
    f <- formula(paste(leftTerm, "~", paste(rightTerms, collapse = "+")))
  }
  return(f)
}

.createLogisticRegFormula <- function(modelOptions, dependent, interceptTerm) {
  # this function outputs a formula name with base64 values as varnames
  f <- NULL

  if (dependent == "")
    f <- 0~1 # mock formula, always works

  modelTerms <- modelOptions$components
  interceptTerm <- interceptTerm
  if (length(modelTerms) == 0) {
    if (interceptTerm)
      f <- formula(paste(dependent, "~ 1"))
    else
      f <- formula(paste(dependent, "~ 0"))
  } else {
    if (interceptTerm)
      t <- character(0)
    else
      t <- "0"
    for (i in seq_along(modelTerms)) {
      term <- modelTerms[[i]]
      if (length(term) == 1)
        t <- c(t, term)
      else
        t <- c(t, paste(unlist(term), collapse = ":"))
    }
    f <- formula(paste(dependent, "~", paste(t, collapse = "+")))
  }
  return(f)
}

.createNullFormula <- function(options) {
  # this function outputs a formula name with base64 values as varnames
  f <- NULL
  dependent <- options$dependent
  if (dependent == "")
    return(NULL)

  modelTerms <- options$modelTerms
  interceptTerm <- options$interceptTerm

  t <- character(0)
  for (i in seq_along(modelTerms)) {
    nui <- modelTerms[[i]][["isNuisance"]]
    if (!is.null(nui) && nui) {
      term <- modelTerms[[i]][["components"]]
      if (length(term) == 1)
        t <- c(t, .v(term))
      else
        t <- c(t, paste(unlist(term), collapse = ":"))
    }
  }
  if (!interceptTerm)
    t <- c(t, "0")
  else
    t <- c(t, "1")

  return(formula(paste(dependent, "~", paste(t, collapse = "+"))))
}

.glmStep <- function(nullModel, fullModel, dataset, method = "enter") {
  # .glmStep function
  # -----------------
  # INPUT: calculated glm objects: nullModel, fullModel, and a dataset
  # nullModel and fullModel should have a data argument
  #
  # OUTPUT: List of glm objects, where the nullModel is the first model
  # (method = enter, forward, stepwise) or the fullModel (method = backward)
  # The last model is the final model that was converged on.

  # first, create temporary environment with dataset for stepAIC() calls
  tempenv <- new.env()
  datname <- as.character(as.list(getCall(fullModel))$data)
  assign(datname, dataset, envir = tempenv)

  null <- nullModel$formula
  full <- fullModel$formula

  assign("nf", null, envir = tempenv)
  assign("ff", full, envir = tempenv)

  if (method == "enter") {
    modlist <- vector("list", 2)
    modlist[[1]] <- nullModel
    modlist[[2]] <- fullModel
  } else if (method == "backward") {
    stepOut <- MASS::stepAIC(fullModel,
                             scope = list(upper=full, lower = null),
                             trace = 0,
                             direction = "backward",
                             keep = function(m, b) list(m))
    modlist <- as.list(stepOut$keep[,1:ncol(stepOut$keep)])
  } else {
    # translate method to stepAIC direction
    direct <- ifelse(method == "forward", method, "both")
    stepOut <- JASPstepAIC(nullModel, full, trace = 0,
                           direction = direct,
                           keep = function(m, b) list(m))
    modlist <- as.list(stepOut$keep[,1:ncol(stepOut$keep)])
  }

  return(modlist)
}

# as explained in ?is.integer
.is.wholenumber <- function(x, tol = .Machine$double.eps^0.5)  abs(x - round(x)) < tol


.glmComputeModel <- function(jaspResults, dataset, options) {
  if (!is.null(jaspResults[["glmModels"]]))
    return(jaspResults[["glmModels"]]$object)
  
  # make formulas for the full model and the null model
  ff <- .createGLMFormula(options, nullModel = FALSE)
  environment(ff) <- environment()
  nf <- .createGLMFormula(options, nullModel = TRUE)
  environment(nf) <- environment()
  
  weights <- dataset[[options[["weights"]]]]
  
  if (options$family != "other") {
    # specify family and link
    if (options$family == "bernoulli") {
      family <- "binomial"
    }
    else if (options$family == "gamma") {
      family <- "Gamma"
    }
    else if (options$family == "inverseGaussian") {
      family <- "inverse.gaussian"
    }
    else {
      family <- options$family
    }
    
    familyLink <- eval(call(family, link = options$link))
    # compute full and null models
    fullModel <- try(stats::glm(ff,
                                family = familyLink,
                                data = dataset,
                                weights = weights))
    nullModel <- try(stats::glm(nf,
                                family = familyLink,
                                data = dataset,
                                weights = weights))
  } else {
    
    if (options$otherGlmModel == "multinomialLogistic") {
      fullModel <- try(VGAM::vglm(ff,
                                  family = VGAM::multinomial(),
                                  data = dataset,
                                  weights = weights))
      nullModel <- try(VGAM::vglm(nf,
                                  family = VGAM::multinomial(),
                                  data = dataset,
                                  weights = weights))
    }
    
    if (options$otherGlmModel == "ordinalLogistic") {
      fullModel <- try(VGAM::vglm(ff,
                                  family = VGAM::cumulative(link = "logitlink", parallel = TRUE),
                                  data = dataset,
                                  weights = weights))
      nullModel <- try(VGAM::vglm(nf,
                                  family = VGAM::cumulative(link = "logitlink", parallel = TRUE),
                                  data = dataset,
                                  weights = weights))
    }
    
    if (options$otherGlmModel == "firthLogistic") {
      fullModel <- try(logistf::logistf(ff,
                                        data = dataset,
                                        pl = TRUE,
                                        firth = TRUE,
                                        weights = weights))
      nullModel <- try(logistf::logistf(nf,
                                        data = dataset,
                                        pl = TRUE,
                                        firth = TRUE,
                                        weights = weights))
    }
  }
  
  if (jaspBase::isTryError(fullModel)) {
    msg <- .glmFitErrorMessageHelper(fullModel)
    msg <- gettextf("The full model could not be fitted to the data, with the following error message: '%s'", msg)
    jaspBase::.quitAnalysis(msg)
  }
  
  if (jaspBase::isTryError(nullModel)) {
    msg <- .glmFitErrorMessageHelper(nullModel)
    msg <- gettextf("The null model could not be fitted to the data, with the following error message: '%s'", msg)
    jaspBase::.quitAnalysis(msg)
  }
  
  glmModels <- list("nullModel" = nullModel,
                    "fullModel" = fullModel)
  # combine both models
  jaspResults[["glmModels"]] <- createJaspState()
  jaspResults[["glmModels"]]$dependOn(optionsFromObject = jaspResults[["modelSummary"]])
  jaspResults[["glmModels"]]$object <- glmModels
  
  return(glmModels)
}

.glmFitErrorMessageHelper <- function(tryError) {
  msg <- jaspBase::.extractErrorMessage(tryError)
  if(msg == "cannot find valid starting values: please specify some")
    msg <- gettext("Could not find valid starting values. Check feasibility of the model to fit the data.")
  
  return(msg)
}


.hasNuisance <- function(options) {
  return(any(sapply(options$modelTerms, function(x) x[["isNuisance"]])))
}

.glmCreatePlotPlaceholder <- function(container, index, title) {
  jaspPlot            <- createJaspPlot(title = title)
  jaspPlot$status     <- "running"
  container[[index]]  <- jaspPlot
  return()
}


.glmStdResidCompute <- function(model, residType, options) {
  if (residType == "deviance") {
    stdResid <- rstandard(model)
  }
  else if (residType == "Pearson") {
    phiEst <- summary(model)[["dispersion"]]
    resid <- resid(model, type="pearson")
    stdResid <- resid / sqrt(phiEst * (1 - hatvalues(model)))
  }
  else if (residType == "quantile") {
    jaspBase::.setSeedJASP(options)
    resid <- statmod::qresid(model)
    stdResid <- resid / sqrt(1 - hatvalues(model))
  } else {
    stdResid <- NULL
  }
  
  return(stdResid)
}

.glmInsertPlot <- function(jaspPlot, func, ...) {
  p <- try(func(...))
  
  if (inherits(p, "try-error")) {
    errorMessage <- .extractErrorMessage(p)
    jaspPlot$setError(gettextf("Plotting is not possible: %s", errorMessage))
  } else {
    jaspPlot$plotObject <- p
    jaspPlot$status     <- "complete"
  }
}

# Plots: Residuals Q-Q
.glmPlotResQQ <- function(jaspResults, dataset, options, ready, position) {
  
  plotNames <- c("devianceResidualQqPlot", "pearsonResidualQqPlot", "quantileResidualQqPlot")
  if (!ready || !any(unlist(options[plotNames])))
    return()
  
  residNames <- c("deviance", "Pearson", "quantile")
  
  glmPlotResQQContainer <- createJaspContainer(gettext("Q-Q Plots"))
  glmPlotResQQContainer$dependOn(optionsFromObject = jaspResults[["modelSummary"]],
                                 options           = c(plotNames, "seed", "setSeed"))
  glmPlotResQQContainer$position <- position
  jaspResults[["diagnosticsContainer"]][["glmPlotResQQ"]] <- glmPlotResQQContainer
  
  
  if (!is.null(jaspResults[["glmModels"]])) {
    glmFullModel <- jaspResults[["glmModels"]][["object"]][["fullModel"]]
    for (i in 1:length(plotNames)) {
      if (options[[plotNames[[i]]]]) {
        .glmCreatePlotPlaceholder(glmPlotResQQContainer,
                                  index = plotNames[[i]],
                                  title = gettextf("Q-Q plot: Standardized %1s residuals", residNames[[i]]))
        
        .glmInsertPlot(glmPlotResQQContainer[[plotNames[[i]]]],
                       .glmFillPlotResQQ,
                       residType = residNames[[i]],
                       model = glmFullModel,
                       options = options)
      }
    }
  }
  return()
}

.glmFillPlotResQQ <- function(residType, model, options) {
  
  # compute residuals
  stdResid <- .glmStdResidCompute(model = model, residType = residType, options = options)
  
  p <- jaspGraphs::plotQQnorm(stdResid, ablineColor = "darkred", ablineOrigin = TRUE, identicalAxes = TRUE)
  
  return(p)
}


.regressionExportResiduals <- function(container, model, dataset, options, ready) {
  
  if (isFALSE(options[["residualsSavedToData"]]))
    return()
  
  if (is.null(container[["residualsSavedToDataColumn"]]) && 
      isFALSE(is.null(options[["residualsSavedToDataColumn"]])) &&
      options[["residualsSavedToDataColumn"]] != "") {
    
    residuals <- model[["residuals"]] # extract residuals
    
    container[["residualsSavedToDataColumn"]] <- createJaspColumn(columnName = options[["residualsSavedToDataColumn"]])
    container[["residualsSavedToDataColumn"]]$dependOn(options = c("residualsSavedToDataColumn", "residualsSavedToData"))
    container[["residualsSavedToDataColumn"]]$setScale(residuals)
    
  }
  
}

.constInfoTransform <- function(family, x) {
  switch(family,
         "bernoulli" = 1/(sin(sqrt(x))),
         "binomial" = 1/(sin(sqrt(x))),
         "poisson" = sqrt(x),
         "Gamma" = log(x),
         "inverse.gaussian" = 1/sqrt(x),
         "gaussian" = x)
}

.constInfoTransName <- function(family) {
  switch(family,
         "bernoulli" = expression(sin^-1 * sqrt("Fitted values")),
         "binomial" = expression(sin^-1 * sqrt("Fitted values")),
         "poisson" = expression(sqrt("Fitted values")),
         "Gamma" = expression(log("Fitted values")),
         "inverse.gaussian" = expression(1/sqrt("Fitted Values")),
         "gaussian" = "Fitted values")
}

.capitalize <- function(x) {
  x_new <- paste(toupper(substr(x, 1, 1)), substr(x, 2, nchar(x)), sep="")
  return(x_new)
}

# function for multicollineary statistics, taken from the source code of car::vif
# car version: 3.0-13 see https://rdrr.io/cran/car/src/R/vif.R
# the reason is to remove the dependency on the car package. When importing the entire car package,
# JASP crashes when loading the regression module (at least on Windows 10).

.vif.default <- function(mod, ...) {
  # modified to fix bug: https://github.com/jasp-stats/jasp-test-release/issues/2487
  if (length(coef(mod)) == 0) {
    stop(gettext("There are no predictors to test for multicollinearity"))
  }
  # end modification
  if (any(is.na(coef(mod))))
    stop ("there are aliased coefficients in the model")
  v <- vcov(mod)
  assign <- attr(model.matrix(mod), "assign")
  if (names(coefficients(mod)[1]) == "(Intercept)") {
    v <- v[-1, -1]
    assign <- assign[-1]
  }
  else warning("No intercept: vifs may not be sensible.")
  terms <- labels(terms(mod))
  n.terms <- length(terms)
  if (n.terms < 2) stop("model contains fewer than 2 terms")
  R <- cov2cor(v)
  detR <- det(R)
  result <- matrix(0, n.terms, 3)
  rownames(result) <- terms
  colnames(result) <- c("GVIF", "Df", "GVIF^(1/(2*Df))")
  for (term in 1:n.terms) {
    subs <- which(assign == term)
    result[term, 1] <- det(as.matrix(R[subs, subs])) *
      det(as.matrix(R[-subs, -subs])) / detR
    result[term, 2] <- length(subs)
  }
  if (all(result[, 2] == 1)) result <- result[, 1]
  else result[, 3] <- result[, 1]^(1/(2 * result[, 2]))
  result
}

# message for estimated marginal means table
.emmMessageTestNull     <- function(value)  gettextf("P-values correspond to test of null hypothesis against %s.", value)
.emmMessageAveragedOver <- function(terms)  gettextf("Results are averaged over the levels of: %s.",paste(terms, collapse = ", "))
.messagePvalAdjustment  <- function(adjustment) {
  if (adjustment == "none") {
    return(gettext("P-values are not adjusted."))
  }
  adjustment <- switch(adjustment,
                       "holm"       = gettext("Holm"),
                       "hommel"     = gettext("Homel"),
                       "hochberg"   = gettext("Hochberg"),
                       "mvt"        = gettext("Multivariate-t"),
                       "tukey"      = gettext("Tukey"),
                       "BH"         = gettext("Benjamini-Hochberg"),
                       "BY"         = gettext("Benjamini-Yekutieli"),
                       "scheffe"    = gettext("Scheffé"),
                       "sidak"      = gettext("Sidak"),
                       "dunnettx"   = gettext("Dunnett"),
                       "bonferroni" = gettext("Bonferroni")
  )
  return(gettextf("P-values are adjusted using %s adjustment.", adjustment))
}



.confusionMatAddColInfo <- function(table, levs, type) {
  table$addColumnInfo(name = "pred0", title = paste0(levs[1]), type = type, overtitle = gettext("Predicted"))
  table$addColumnInfo(name = "pred1", title = paste0(levs[2]), type = type, overtitle = gettext("Predicted"))
}

# Helper functions for the above.
.lrtest <- function(glmModel1, glmModel2) {
  # likelihood ratio test for model against null model
  if (glmModel1[["df.residual"]] > glmModel2[["df.residual"]]) {
    superModel <- glmModel2
    subModel <- glmModel1
  } else {
    superModel <- glmModel1
    subModel <- glmModel2
  }

  chisq <- max(0, subModel[["deviance"]] - superModel[["deviance"]])
  df <- subModel[["df.residual"]] - superModel[["df.residual"]]

  if (chisq == 0 || df == 0)
    p <- NULL
  else
    p <- 1-pchisq(chisq, df)
  return(list(stat = chisq, df = df, pval = p))
}

.hasNuisance <- function(options) {
  return(any(sapply(options$modelTerms, function(x) x[["isNuisance"]])))
}

.mcFadden <- function(glmModel, nullModel) {
  # https://eml.berkeley.edu/reprints/mcfadden/zarembka.pdf
  if (.isInterceptOnly(glmModel)) {
    # intercept-only model needs fix because of computer precision limits
    return(0)
  } else
    return(max(c(0,1-(glmModel[["deviance"]]/nullModel[["deviance"]]))))
}

.nagelkerke <- function(glmModel, nullModel) {
  # https://doi.org/10.1093/biomet/78.3.691
  if (.isInterceptOnly(glmModel)) {
    # intercept-only model needs fix because of computer precision limits
    return(NULL)
  } else {
    l0 <- -0.5*nullModel[["deviance"]]
    lm <- as.numeric(logLik(glmModel))
    n  <- length(glmModel[["y"]])
    coxSnell <- .coxSnellCompute(l0, lm, n)
    denom <- -expm1(2*l0/n)
    return(max(c(0,coxSnell / denom)))
  }
}

.tjur <- function(glmModel) {
  # http://dx.doi.org/10.1198/tast.2009.08210
  ps <- predict(glmModel, type = "response")
  ys <- glmModel[["y"]]
  return(abs(mean(ps[as.logical(ys)])-mean(ps[!as.logical(ys)])))
}

.coxSnell <- function(glmModel, nullModel) {
  if (.isInterceptOnly(glmModel)) {
    # intercept-only model needs fix because of computer precision limits
    return(NULL)
  } else {
    l0 <- -0.5*nullModel[["deviance"]]
    lm <- as.numeric(logLik(glmModel))
    n  <- length(glmModel[["y"]])
    coxSnell <- .coxSnellCompute(l0, lm, n)
    return(max(c(0,coxSnell)))
  }
}

.coxSnellCompute <- function(l0, lm, n) {
  return(-expm1(2*(l0 - lm)/n))
}

.bic <- function(glmModel) {
  return(log(length(glmModel[["y"]]))*length(coef(glmModel))+glmModel[["deviance"]])
}

.stdEst <- function(glmModel, type = "X") {
  # This function fits a new model with scaled variables and outputs the coeffs:
  # type = "X" : covariates scaled
  # type = "Y" : outcome scaled
  # type = "XY" : covariates and outcomes scaled
  # NB: factors (dummy-coded) will never be scaled.
  if (attr(glmModel[["terms"]], "intercept"))
    b <- summary(glmModel)[["coefficients"]][-1,1]
  else
    b <- summary(glmModel)[["coefficients"]][,1]

  factors <- names(glmModel[["xlevels"]])
  xmod <- glmModel[["model"]][!names(glmModel[["model"]]) %in% factors][,-1]
  xfac <- glmModel[["model"]][names(glmModel[["model"]]) %in% factors]
  ymod <- glmModel[["model"]][1]

  if (type == "X")
    stdDat <- cbind(ymod, scale(xmod), xfac)
  else if (type == "Y")
    stdDat <- cbind(scale(ymod), xmod, xfac)
  else if (type == "XY")
    stdDat <- cbind(scale(ymod), scale(xmod), xfac)

  names(stdDat) <- names(glmModel[["model"]])

  stdMod <- stats::glm(formula = glmModel[["formula"]], data = stdDat,
                       family = glmModel[["family"]])

  return(coef(stdMod))
}

.confusionMatrix <- function(glmModel, cutoff=0.5) {
  cMat <- list()
  pred <- predict(glmModel,type = "response")
  obs  <- glmModel$y
  h    <- hmeasure::HMeasure(obs, pred, threshold = cutoff)
  m    <- matrix(c(h[["metrics"]][["TN"]], h[["metrics"]][["FN"]],
                   h[["metrics"]][["FP"]], h[["metrics"]][["TP"]]), 2)
  dimnames(m) <- list("Observed" = c(0, 1), "Predicted" = c(0, 1))
  h[["metrics"]][["Brier"]] <- .brierScore(obs, pred)
  cMat[["matrix"]]  <- m
  cMat[["metrics"]] <- h[["metrics"]]
  return(cMat)
}

.brierScore <- function(obs, pred) {
  return(sum((pred - obs)^2) / length(pred))
}

.formatTerm <- function(term, glmModel) {
  # input: string of model term & glmObj
  vars <- names(glmModel[["model"]][-1])

  if (attr(glmModel[["terms"]], "intercept"))
    vars <- c(vars, gettext("(Intercept)"))

  # escape special regex characters
  vars <- gsub("(\\W)", "\\\\\\1", vars, perl=TRUE)

  # regex patterns
  pat1 <- paste0("\\:","(?=(",paste(vars, collapse = ")|("),"))")
  pat2 <- paste0("(?<=(",paste(vars, collapse = ")|("),"))")

  # split up string into components
  spl  <- strsplit(term, pat1, perl = TRUE)[[1]]
  spl2 <- lapply(spl, function(t) strsplit(t, pat2, perl = TRUE))

  # format and add back together
  col <- lapply(spl2, function(s) {
    if (length(unlist(s)) > 1) {
      varname <- .unv(unlist(s)[1])
      levname <- unlist(s)[2]
      return(paste0(varname, " (", levname, ")"))
    } else
      return(.unv(unlist(s)))
  })
  col2 <- paste(unlist(col), collapse = " * ")

  return(col2)
}

.predLevel <- function(glmModel) {
  predVar <- as.character(glmModel[["terms"]])[2]
  return(levels(glmModel[["data"]][[predVar]])[2])
}

.glmLogRegRibbon <- function(logRes, var, ciInt = 0.95) {
  # This function calculates the estimation & CI datapoints for logreg plot
  fac     <- FALSE
  factors <- names(logRes$xlevels)
  mf      <- model.frame(logRes)
  outcome <- logRes$terms[[2]]
  cDat    <- mf[,!(colnames(mf) %in% c(factors, outcome))]
  if (length(cDat) == nrow(mf)) {
    cm        <- mean(cDat)
    names(cm) <- colnames(mf)[!(colnames(mf) %in% c(factors, outcome))]
  } else
    cm <- colMeans(cDat)


  if (length(factors) == 0 || !(var %in% factors)) {
    # Variable of interest is continuous
    # create 101-length data frame of repeated colmeans
    newDat <- data.frame(matrix(rep(cm, 101), nrow=101, byrow=TRUE))
    colnames(newDat) <- names(cm)
    # add factors to this data
    if (length(factors) > 0) {
      for (f in factors) {
        col <- factor(rep(logRes[["xlevels"]][[f]][1], 101),
                      levels = logRes[["xlevels"]][[f]])
        newDat <- data.frame(newDat, col)
        colnames(newDat)[ncol(newDat)] <- f
      }
    }
    # then change the variable of interest to a nice range
    vd <- pretty(mf[[var]])
    xs <- seq(min(vd), max(vd), length.out = 101)
    newDat[[var]] <- xs
  } else {
    # variable is factor
    fac <- TRUE
    levs <- paste0(.unv(var), logRes[["xlevels"]][[var]])
    nlevs <- length(levs)

    # create a new data frame of nlevs length
    newDat <- data.frame(matrix(rep(cm, nlevs), nrow=nlevs, byrow=TRUE))
    colnames(newDat) <- names(cm)
    # add factors to this data
    if (length(factors) > 0) {
      for (f in factors) {
        col <- factor(rep(logRes[["xlevels"]][[f]][1], nlevs),
                      levels = logRes[["xlevels"]][[f]])
        newDat <- data.frame(newDat, col)
        colnames(newDat)[ncol(newDat)] <- f
      }
    }
    # then change the factor of interest to a nice range
    xs <- factor(logRes[["xlevels"]][[var]],
                 levels = logRes[["xlevels"]][[var]])
    newDat[[var]] <- xs
  }

  pred <- predict(logRes, newdata = newDat, type = "link", se.fit = TRUE)
  ys   <- .invLogit(pred$fit)
  critValue <- qnorm(1 - (1 - ciInt) / 2)
  lo <- .invLogit(pred$fit - critValue * pred$se.fit)
  hi <- .invLogit(pred$fit + critValue * pred$se.fit)
  outFrame <- data.frame(x = xs, y = ys, lo = lo, hi = hi)

  attr(outFrame, "factor") <- fac

  return(outFrame)
}

.invLogit <- function(x) {
  return(1/(1 + exp(-x)))
}

.glmRobustSE <- function(glmModel) {
  # Robust SE courtesy of Dan Gillen (UC Irvine)
  if (is.matrix(glmModel[["x"]]))
    xmat <- glmModel[["x"]]
  else {
    mf   <- model.frame(glmModel)
    xmat <- model.matrix(terms(glmModel), mf)
  }

  umat      <- residuals(glmModel, "working") * glmModel[["weights"]] * xmat
  modelv    <- summary(glmModel)[["cov.unscaled"]]
  robustCov <- modelv%*%(t(umat)%*%umat)%*%modelv
  dimnames(robustCov) <- dimnames(vcov(glmModel))

  ##	Format the model output with p-values and CIs
  s <- summary(glmModel)
  robustSE <- sqrt(diag(robustCov))
  return(robustSE)
}


# Table: Influential cases
.glmInfluenceTable <- function(jaspResults, model, dataset, options, ready, position, logisticRegression = FALSE) {

  tableOptionsOn <- c(options[["dfbetas"]],
                      options[["dffits"]],
                      options[["covarianceRatio"]],
                      options[["leverage"]],
                      options[["mahalanobis"]])
  
  nModels <- length(options$modelTerms)
  if (!ready || !options[["residualCasewiseDiagnostic"]] || 
      length(unlist(options$modelTerms[[nModels]][["components"]])) == 0)
    return()
  
  
  tableOptions <- c("dfbetas", "dffits", "covarianceRatio", "leverage", "mahalanobis")
  tableOptionsClicked <- tableOptions[tableOptionsOn]
  tableOptionsClicked <- c("cooksDistance", tableOptionsClicked)
  
  if (is.null(jaspResults[["influenceTable"]])) {
    influenceTable <- createJaspTable(gettext("Influential Cases"))
    influenceTable$dependOn(c(tableOptions, "dependent", "method", "modelTerms", "interceptTerm", 
                              "residualCasewiseDiagnostic", "residualCasewiseDiagnosticType",
                              "residualCasewiseDiagnosticZThreshold", 
                              "residualCasewiseDiagnosticCooksDistanceThreshold"))
    influenceTable$position <- position
    influenceTable$showSpecifiedColumnsOnly <- TRUE
    jaspResults[["influenceTable"]] <- influenceTable
  } else {
    return()
  }

  tableOptionToColName <- function(x) {
    switch(x,
           "dfbetas"  = "DFBETAS",
           "dffits"   = "DFFITS",
           "covarianceRatio" = "Covariance Ratio",
           "cooksDistance"   = "Cook's Distance",
           "leverage" = "Leverage",
           "mahalanobis" = "Mahalanobis")
  }
  
  if (is.null(model)) {
    for (option in tableOptionsClicked) {
      colTitle    <- tableOptionToColName(option)
      influenceTable$addColumnInfo(name = option, title = gettext(colTitle), type = "number")
    }
  } else {
    
    
    depType <- if (is.numeric(dataset[[options[["dependent"]]]]))  "number" else "string"
    influenceTable$addColumnInfo(name = "caseN", title = "Case Number", type = "integer")
    influenceTable$addColumnInfo(name = "stdResidual", title = gettext("Std. Residual"),   type = "number", format = "dp:3")
    influenceTable$addColumnInfo(name = "dependent",   title = options$dependent,          type = depType)
    influenceTable$addColumnInfo(name = "predicted",   title = gettext("Predicted Value"), type = "number")
    influenceTable$addColumnInfo(name = "residual", title = gettext("Residual"),   type = "number", format = "dp:3")
    
    colNameList  <- c()
    alwaysPresent <- c("caseN", "stdResidual", "dependent", "predicted", "residual")
    for (option in tableOptionsClicked) {
      if (option == "dfbetas") {
        predictors <- names(model$coefficients)
        for (predictor in predictors) {
          dfbetasName  <- gettextf("DFBETAS_%1s", predictor)
          colNameList <- c(colNameList, dfbetasName)
          if (predictor == "(Intercept)")
            dfbetasTitle <- gettext("DFBETAS:Intercept")
          else
            dfbetasTitle <- gettextf("DFBETAS:%1s", gsub(":", "*", predictor))
          influenceTable$addColumnInfo(name = dfbetasName, title = dfbetasTitle, type = "number")
        }
      } else {
        colNameList <- c(colNameList, option)
        colTitle    <- tableOptionToColName(option)
        influenceTable$addColumnInfo(name = option, title = gettext(colTitle), type = "number")
      }
    }

    .glmInfluenceTableFill(influenceTable, dataset, options, ready, 
                           model = model, 
                           influenceMeasures = tableOptionsClicked, 
                           colNames = c(colNameList, alwaysPresent))
  }
}

.glmInfluenceTableFill <- function(influenceTable, dataset, options, ready, model, influenceMeasures, colNames) {

  influenceRes <- influence.measures(model)
  nDFBETAS     <- length(names(model$coefficients))
  
  optionToColInd <- function(x, nDFBETAS) {
    switch(x,
           "dfbetas"  = 1:nDFBETAS,
           "dffits"   = (nDFBETAS+1),
           "covarianceRatio" = (nDFBETAS+2),
           "cooksDistance"   = (nDFBETAS+3),
           "leverage" = (nDFBETAS+4))}
  
  colInd <- c()
  for (measure in influenceMeasures) {
    colInd <- c(colInd, optionToColInd(measure, nDFBETAS))
  }
  
  tempResult <- influenceRes[["infmat"]][, colInd]
  resultContainsNaN <- any(is.nan(tempResult))
  tempResult[which(is.nan(tempResult))] <- NA
  influenceResData <- as.data.frame(tempResult)
  colnames(influenceResData)[1:length(colInd)] <- colNames[1:length(colInd)]
  
  influenceResData[["caseN"]] <- seq.int(nrow(influenceResData))
  influenceResData[["stdResidual"]] <- rstandard(model)
  influenceResData[["dependent"]] <- model.frame(model)[[options$dependent]]
  influenceResData[["predicted"]] <- model$fitted.values
  influenceResData[["residual"]] <- model$residual

  modelMatrix <- as.data.frame(model.matrix(model))
  modelMatrix <- modelMatrix[colnames(modelMatrix) != "(Intercept)"]
  influenceResData[["mahalanobis"]] <- mahalanobis(modelMatrix, center = colMeans(modelMatrix), cov = cov(modelMatrix))
  
  if (options$residualCasewiseDiagnosticType == "cooksDistance")
    index <- which(abs(influenceResData[["cooksDistance"]]) > options$residualCasewiseDiagnosticCooksDistanceThreshold)
  else if (options$residualCasewiseDiagnosticType == "outliersOutside")
    index <- which(abs(influenceResData[["stdResidual"]]) > options$residualCasewiseDiagnosticZThreshold)
  else # all
    index <- seq.int(nrow(influenceResData))

  # funky statement to ensure a df even if only 1 row
  influenceResSig       <- subset(influenceRes[["is.inf"]], 1:nrow(influenceResData) %in% index, select = colInd)
  colnames(influenceResSig) <- colNames[1:length(colInd)]
  influenceResData <- influenceResData[index, ]

  if (length(index) == 0)
    influenceTable$addFootnote(gettext("No influential cases found."))
  else {
    influenceTable$setData(influenceResData)
    # if any other metrix show influence, add footnotes:
    if (sum(influenceResSig, na.rm = TRUE) > 0) {
      for (thisCol in colnames(influenceResSig)) {
        if (sum(influenceResSig[, thisCol], na.rm = TRUE) > 0) 
          influenceTable$addFootnote(
            gettext("Potentially influential case, according to the selected influence measure."), 
            colNames = thisCol,
            rowNames = rownames(influenceResData)[influenceResSig[, thisCol]],
            symbol = "*"
          )
      }
    }
    
    if (resultContainsNaN) {
      influenceTable$addFootnote(
        gettext("Influence measures could not be computed for some cases due to extreme values, try another measure.")
      )
    }
  }
}

.reglogisticVovkSellke <- function(table, options) {
  table$addColumnInfo(name = "vsmpr",   title = gettextf("VS-MPR%s", "\u002A"), type = "number")
  message <- gettextf("Vovk-Sellke Maximum <em>p</em>-Ratio: Based on the <em>p</em>-value,
  the maximum possible odds in favor of H%1$s over H%2$s equals
  1/(-e <em>p</em> log(<em>p</em>)) for <em>p</em> %3$s .37
  (Sellke, Bayarri, & Berger, 2001).", "\u2081", "\u2080", "\u2264")
  table$addFootnote(message, symbol = "\u002A")
}

.reglogisticSetError <- function(res, table) {
  if(isTryError(res))
    table$setError(.extractErrorMessage(res))
}


## functions in package car 3.0-12
# Durbin-Watson
.durbinWatsonTest.lm <- function(model, max.lag=1, simulate=TRUE, reps=1000,
                                method=c("resample","normal"),
                                alternative=c("two.sided", "positive", "negative"), ...){
  method <- match.arg(method)
  alternative <- if (max.lag == 1) match.arg(alternative)
  else "two.sided"
  residuals <- residuals(model)
  if (any(is.na(residuals))) stop ('residuals include missing values')
  n <- length(residuals)
  r <- dw <-rep(0, max.lag)
  den <- sum(residuals^2)
  for (lag in 1:max.lag){
    dw[lag] <- (sum((residuals[(lag+1):n] - residuals[1:(n-lag)])^2))/den
    r[lag] <- (sum(residuals[(lag+1):n]*residuals[1:(n-lag)]))/den
  }
  if (!simulate){
    result <- list(r=r, dw=dw)
    class(result) <- "durbinWatsonTest"
    result
  }
  else {
    S <- summary(model)$sigma
    X <- unname(model.matrix(model)) # CHANGE: unname
    mu <- fitted.values(model)

    # START OF CHANGES
    # the original code is horrible for larger datasets.
    # It creates all bootstrap datasets at once (e.g., matrix(sample(residuals, n*reps, replace=TRUE), n, reps))
    # we do this one at a time.

    # since the design matrix is constant across bootstraps, we precompute solve(t(X) %*% X) %*% t(X) with
    # a QR factorization. Otherwise lm does this again and again in every step.
    qrX <- qr(X)
    R <- qr.R(qrX)
    Rinv <-  backsolve(r = R, x = diag(ncol(R))) # efficiently invert triangular matrix (see https://stackoverflow.com/a/31772085 )
    # this is the same as solve(t(X) %*% X) %*% t(X) but then computed more efficiently
    inv_XTX_XT <- tcrossprod(tcrossprod(Rinv), X)
    # all.equal(inv_XTX_XT, solve(t(X) %*% X) %*% t(X), check.attributes = FALSE)  # assertion 1
    # y <- model$model[[1]]
    # resids <- y - X %*% (inv_XTX_XT %*% y)
    # all.equal(unname(residuals), c(resids)) # assertion 2

    # start a progressbar if we expect this to take a while
    if (length(residuals) > 50000L) {
      startProgressbar(expectedTicks = reps, label = gettext("Computing Durbin-Watson Statistics"))
      tick <- progressbarTick
    } else {
      tick <- function() {}
    }

    # drop names to avoid copying them
    bootResult <- boot::boot(unname(residuals), statistic = function(data, indices, inv_XTX_XT, max.lag, tick) {
      y <- data[indices]
      resids <- c(y - X %*% (inv_XTX_XT %*% y)) # NOTE: parenthesis are super important here because doing X %*% inv_XTX_XT first allocates an n by n matrix
      # original code
      # E <- c(residuals(lm(y ~ X - 1)))
      # all.equal(unname(E), resids)
      .durbinWatsonTest.default(resids, max.lag = max.lag)
      tick()
    }, R = reps, inv_XTX_XT = inv_XTX_XT, max.lag = max.lag, tick = tick)
    DW <- bootResult$t

    # ORIGINAL CODE
    # Y <- if (method == "resample")
    #   matrix(sample(residuals, n*reps, replace=TRUE), n, reps) + matrix(mu, n, reps)
    # else matrix(rnorm(n*reps, 0, S), n, reps) + matrix(mu, n, reps)
    # E <- residuals(lm(Y ~ X - 1))
    # DW <- apply(E, 2, .durbinWatsonTest.default, max.lag=max.lag)
    # if (max.lag == 1) DW <- rbind(DW)
    # END OF CHANGES
    p <- rep(0, max.lag)
    if (alternative == 'two.sided'){
      for (lag in 1:max.lag) {
        p[lag] <- (sum(dw[lag] < DW[lag,]))/reps
        p[lag] <- 2*(min(p[lag], 1 - p[lag]))
      }
    }
    else if (alternative == 'positive'){
      for (lag in 1:max.lag) {
        p[lag] <- (sum(dw[lag] > DW[lag,]))/reps
      }
    }
    else {
      for (lag in 1:max.lag) {
        p[lag] <- (sum(dw[lag] < DW[lag,]))/reps
      }
    }
    result <- list(r=r, dw=dw, p=p, alternative=alternative)
    class(result)<-"durbinWatsonTest"
    result
  }
}

.durbinWatsonTest.default <- function(model, max.lag=1, ...){
  # in this case, "model" is the residual vectors
  if ((!is.vector(model)) || (!is.numeric(model)) ) stop("requires vector of residuals")
  if (any(is.na(model))) stop ('residuals include missing values')
  n <-  length(model)
  dw <- rep(0, max.lag)
  den <- sum(model^2)
  for (lag in 1:max.lag){
    dw[lag] <- (sum((model[(lag+1):n] - model[1:(n-lag)])^2))/den
  }
  dw
}

# test if a model is intercept only
.isInterceptOnly <- function(model) {
  terms <- try(terms(model))
  if (jaspBase::isTryError(terms)) {
    terms <- try(terms(model[["formula"]]))

    if (jaspBase::isTryError(terms)) {
      stop("Something went wrong; Cannot tell if a model is intercept-only.")
    }
  }

  intercept <- attr(terms, "intercept")
  labels <- attr(terms, "term.labels")

  # test if we have intercept and no other predictors
  return(intercept && length(labels) == 0)
}
jasp-stats/Regression documentation built on July 15, 2024, 7:04 a.m.