R/ModTools.R

Defines functions PredictCI RobSummary UnderSample OverSample InsertRow .InsertRefLevel RefLevel LowVar OddsRatio.polr Conf.polr plot.TModC print.TModC TModC BreuschPaganTest Cstat.FitMod Conf.FitMod ResponseName Response update.FitMod as.FitMod plot.FitMod print.FitMod drop1.FitMod summary.FitMod predict.FitMod FitMod confint.ROC BestCut .predict.C5.0 .predict.svm .predict.qda .predict.lda .predict.glm .print.polr SplitTrainTest

Documented in BestCut BreuschPaganTest drop1.FitMod FitMod OverSample plot.FitMod plot.TModC PredictCI predict.FitMod print.FitMod RefLevel Response RobSummary SplitTrainTest summary.FitMod TModC UnderSample

# ****************************************************************************
#
# Projekt:	ModTools.r
#
# Zweck:    Modeling Tools for regression and classification
#
# Autor:    Andri Signorell
# Version:	0.5 (in development)
#
# Depends:  DescTools
#
# Datum:
#           21.10.2018  komplettes Redesign (aber ohne Check-Errors...)
#           04.12.2016  mit neuen DescTools
#           02.11.2013  version 0.3
#           20.09.2013 	created
#
# ****************************************************************************

# TODO (Andri):
#   Vergleich f?r lines in den Plot zeichnen etc.

# bereits in DescTools (weil basics...)
# statistics:
#   Brier, c-statistic, pseudoR2 (McFadden etc.)
#   DurbinWatson, HosmerLemeshow, BreuschGodfrey
#   findCorrelation, Conf, VIF, MAE etc., ModTable

# modelling:
#   linear regression, rlm, glm, tobit, cox ph, quantreg
# mixed models, gam, mars, lasso,
# discriminant analysis,
#
# rpart, randomForest, svm, NaiveBayes, nnet, C5.0
# knn, logitBoost, adaBoost
# (http://www.inf.fu-berlin.de/inst/ag-ki/adaboost4.pdf)
#
# unsupervised:
#   SOM, ART Adaptive Resonance Theory
# cluster, association rules,
# multidimensional scaling
#
# tuning:
#   finding the appropriate parameters
#
# preprocess
#
# 1. Check for near zero variance predictors and flag as near zero if:
#   * the percentage of unique values is less than 20
# * the ratio of the most frequent to the second most frequent
# value is greater than 20,
# 2. Check for susceptibility to multicollinearity
# * Calculate correlation matrix
# * Find variables with correlation 0.9 or more and delete them
#
#
# transformations, retransformation:
#   rcs - restricted cubic splines
# smearingEstimator
#
# variable selection:
#   lasso, stepAIC
#
# imputation:
#   ... ?
# knnImputation
#
# evaluation, verfication of models:
#   CrossValidation
# variable importance

# ROC and all what s around
#
# tests:
# plots:
#   regression plots, termplots, residuals etc.
# tree plots
# interactive exploring of a tree
#
# reporting:
#   tables for regression, OR
# tables for comparing models
#
# ChemometricsWithR:
#    GA Genetic Algorithms for variable selection in classification

# imports from libraries:
# randomForest:
#   "tuneRF", "varImpPlot", "importance" from randomForest wrapped into Tune, VarImp, plot(VarImp)
#
# , "tune"


Tobit <- AER::tobit


SplitTrainTest <- function(x, p=0.1, seed=NULL, logical=FALSE){

  if(!is.null(seed)) set.seed(seed)
  n <- ifelse(is.atomic(x), length(x), nrow(x))
  idx <- 1:n
  itest <- sample(idx, trunc(length(idx) * p))
  if(logical)
    res <- Unwhich(itest, n)
  else {
    if(is.atomic(x))
      res <- list(train=x[idx[-itest]], test=x[idx[itest]])
    else
      res <- list(train=x[idx[-itest], ], test=x[idx[itest], ])

  }

  return(res)

}




.print.multinom <- function (x, digits = x$digits, ...) {

  object <- x

  vc <- vcov(object)
  r <- length(object$vcoefnames)
  se <- sqrt(diag(vc))
  if (length(object$lev) == 2L) {
    coef <- object$wts[1L + (1L:r)]
    stderr <- se
    names(coef) <- names(stderr) <- object$vcoefnames

  } else {
    coef <- matrix(object$wts, nrow = object$n[3L], byrow = TRUE)[-1L,
                                                                  1L + (1L:r), drop = FALSE]
    stderr <- matrix(se, nrow = object$n[3L] - 1L, byrow = TRUE)
    if (length(l <- object$lab) || length(l <- object$lev))
      dimnames(coef) <- dimnames(stderr) <- list(l[-1L],
                                                 object$vcoefnames)
  }

  object$is.binomial <- (length(object$lev) == 2L)
  object$digits <- digits
  object$coefficients <- coef
  object$standard.errors <- stderr
  object$Wald.ratios <- coef/stderr
  # if (correlation)
  #   object$correlation <- vc/outer(se, se)
  # class(object) <- "summary.multinom"


  if (!is.null(cl <- x$call)) {
    cat("\nCall:\n")
    dput(cl, control = NULL)
  }

  cat("\n")
  print(Format(x$drop1[,-3], digits=c(0,1,3), fmt=c("","","p*"), na.form = ""), print.gap = 3)
  cat("---\n Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n")
  cat("\nCoefficients:\n Level: ", x$lev[1], " (reference)\n\n")

  ci <- confint(x)
  b <- t(coef(x))
  lst <- list()
  for(i in seq(dim(ci)[3])){
    lst[[i]] <- cbind(b[, i], ci[,,i])
    # rownames(lst[[i]]) <- paste(colnames(b)[i], rownames(lst[[i]]) , sep=": ")
  }

  beta <- do.call(rbind, rbind(NA, lst, NA))

  betaor <- cbind(beta, exp(beta))
  colnames(betaor) <- c("coef", "lci", "uci", "or", "or.lci", "or.uci")
  rownames(betaor) <- paste0("  ", rownames(betaor))
  rownames(betaor)[seq(1, nrow(betaor), by=nrow(lst[[1]])+2)] <- paste0("Level: ", colnames(b))
  betaor[seq(2, nrow(betaor), by=nrow(lst[[1]])+2), 4:6] <- 0
  print(Format(betaor, digits=3, na.form = "", zero.form = "."), print.gap=2)

  cat("Residual Deviance: ", format(x$deviance), ",  ",
      "AIC: ", format(x$AIC), ",  nobs: ", nrow(x$residuals), "\n\n", sep="")
  if (!is.null(correl <- x$correlation)) {
    p <- dim(correl)[2L]
    if (p > 1) {
      cat("\nCorrelation of Coefficients:\n")
      ll <- lower.tri(correl)
      correl[ll] <- format(round(correl[ll], digits))
      correl[!ll] <- ""
      print(correl[-1L, -p], quote = FALSE, ...)
    }
  }
  invisible(x)
}


.print.polr <- function(x, digits = x$digits, ...){

  if (!is.null(cl <- x$call)) {
    cat("\nCall:\n")
    dput(cl, control = NULL)
  }

  cat("\n")

  xdrop <- data.frame(x$drop1[, 3], x$drop1[,1], x$drop1[,4], x$drop1[,4])
  rownames(xdrop) <- rownames(x$drop1)
  colnames(xdrop) <- c("Df", "AIC",  "pval", "")

  print(Format(xdrop, fmt=c("", "", "p", "*"),
               digits=c(0,1,4,NULL)),
        print.gap = 3)

  cat("---\n Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n")

  cat("\nCoefficients:\n")

  tval <- x$coefficients / head(sqrt(diag(vcov(x))), -length(x$zeta))
  pval <- pnorm(abs(tval), lower.tail = FALSE) * 2

  res <- data.frame(coef=x$coefficients, x$ci,
                    exp(x$coefficients), exp(x$ci),
                    Format(pval, fmt="p"), Format(pval, fmt="*"))
  colnames(res) <- c("coef","lci","uci", "or", "or.lci", "or.uci", "p.val", "")

  res[, 1:6] <- Format(res[, 1:6], digits=digits)
  res <- as.matrix(res)

  # insert reference levels for factors
  rr <- RefLevel(x)$simple
  i <- 2
  for(i in seq_along(rr)){
    ff <- names(rr[i])
    rl <- rr[i]
    j <- head(grep(ff, rownames(res)), 1)
    res <- Append(res, "-", after=j-1, rows=TRUE)
    rownames(res)[grep(ff, rownames(res))] <-
      gsub(ff, "  ", rownames(res)[grep(ff, rownames(res))])
    rownames(res)[j] <- gettextf("%s (Ref: %s)", ff, rl)
    res[j, 8] <- Format(xdrop[grep(ff, rownames(xdrop)), 4], fmt="*")

  }
  res <- as.data.frame(res)
  colnames(res)[8] <- ""
  print(res, print.gap = 2)

  cat("\nIntercepts:\n")
  print(x$zeta)

  cat("\n")

}



.predict.nnet <- function (object, newdata, type = c("class", "prob"), cutoff=0.5, ...) {

  type <- match.arg(type)

  if (!requireNamespace("nnet", quietly = TRUE))
    stop("package 'nnet' must be installed")
  predict.nnet <- get("predict.nnet", asNamespace("nnet"),
                          inherits = FALSE)

  Y <- predict.nnet(object, newdata, type="raw", ...)

  if(type=="prob" && dim(Y)[2]==1){
    # make compatible to the others
    Y <- cbind(1-Y, Y)
    colnames(Y) <- object$lev
  }


  switch(type, class = {
    if (length(object$lev) > 2L)  Y <- factor(max.col(Y),
                                              levels = seq_along(object$lev), labels = object$lev)
    if (length(object$lev) == 2L) Y <- factor((Y > cutoff) + 1,
                                              levels = 1L:2L, labels = object$lev)
    if (length(object$lev) == 0L) Y <- factor(max.col(Y),
                                              levels = seq_along(object$lab), labels = object$lab)
  }, prob = {
  })


  return(Y)

}


.predict.multinom <- function (object, newdata, type = c("class", "prob"), ...) {

  type <- match.arg(type)

  if (!requireNamespace("nnet", quietly = TRUE))
    stop("package 'nnet' must be installed")
  predict.multinom <- get("predict.multinom", asNamespace("nnet"),
                          inherits = FALSE)
  res <- predict.multinom(object, newdata, type, ...)

  # extend here with the complementary probability

  if(type=="prob" && (is.vector(res) || dim(res)[2]==1)){
    res <- cbind(1-apply(cbind(res), 1, sum), cbind(res))
    if(is.vector(res))
      names(res) <- object$lev
    else
      colnames(res) <- object$lev
  }
  res

}




.predict.glm <- function(object, newdata, type = c("class", "prob", "link", "terms", "response"), cutoff = 0.5, ...) {

  type <- match.arg(type)

  object$lev <- levels(model.response(model.frame(object)))

  switch(type[1],
         "prob"={
           Y <- stats::predict.glm(object, newdata, type="response", ... )
           if(is.list(Y)){
             Y[["fit"]] <- cbind(1-Y[["fit"]], Y[["fit"]])
             colnames(Y[["fit"]]) <- object$lev
           } else {
             Y <- cbind(1-Y, Y)
             colnames(Y) <- object$lev
           }
         },

         "class"={
           Y <- (stats::predict.glm(object, newdata, type="response", ...) > cutoff) + 1
         },
         "link"={
           Y <- stats::predict.glm(object, newdata, type="link", ... )
         },
         "terms"={
           Y <- stats::predict.glm(object, newdata, type="terms", ... )
         },
         "response"={
           Y <- stats::predict.glm(object, newdata, type="response", ... )
         }
  )


  switch(type[1], class = {
    if (length(object$lev) > 2L)  Y <- factor(max.col(Y),
                                              levels = seq_along(object$lev), labels = object$lev)
    if (length(object$lev) == 2L) Y <- factor(Y,
                                              levels = 1L:2L, labels = object$lev)
    if (length(object$lev) == 0L) Y <- factor(max.col(Y),
                                              levels = seq_along(object$lab), labels = object$lab)
  }, prob = {
  })

  return(Y)

}




.predict.lda <- function(object, newdata, type = c("class", "prob", "term"),
                        prior = object$prior, dimen, method = c("plug-in", "predictive", "debiased"),
                        ...) {

  type <- match.arg(type)

  if (!requireNamespace("MASS", quietly = TRUE))
    stop("package 'MASS' must be installed")
  predict.lda <- get("predict.lda", asNamespace("MASS"),
                      inherits = FALSE)

  res <- predict.lda(object, newdata, prior=prior, dimen=dimen, method=method, ...)

  switch(type,
         "prob"={
           res <- res$posterior
         },

         "class"={
           res <- res$class
         },
         "term"={
           res <- res$x
         }
  )

  return(res)

}





# .predict.naiveBayes <- function(object, newdata, type = c("class", "prob", "term")
#                                 , threshold = 0.001, eps = 0, ...) {
#
#   type <- match.arg(type)
#
#   if (!requireNamespace("e1071", quietly = TRUE))
#     stop("package 'e1071' must be installed")
#   predict.naiveBayes <- get("predict.naiveBayes", asNamespace("e1071"),
#                      inherits = FALSE)
#
#   if(missing(newdata))
#     newdata <- object$data$x
#   else
#     newdata <- as.data.frame(newdata)
#
#
#   if(type=="prob")
#     type <- "raw"
#   res <- predict.naiveBayes(object, newdata, type=type, threshold=threshold, eps=eps, ...)
#
#   return(res)
#
# }



.predict.qda <- function(object, newdata, type = c("class", "prob", "term"),
                        prior = object$prior, dimen, method = c("plug-in", "predictive", "debiased", "looCV"),
                        ...) {

  type <- match.arg(type)

  if (!requireNamespace("MASS", quietly = TRUE))
    stop("package 'MASS' must be installed")
  predict.qda <- get("predict.qda", asNamespace("MASS"), inherits = FALSE)

  res <- predict.qda(object, newdata, prior=prior, dimen=dimen, method=method, ...)

  switch(type,
         "prob"={
           res <- res$posterior
         },

         "class"={
           res <- res$class
         },
         "term"={
           res <- res$x
         }
  )

  return(res)

}


.predict.svm <- function(object, newdata, type = c("class", "prob", "term"),
                         decision.values = TRUE, probability = TRUE ,
                         na.action = na.omit, ...) {

  type <- match.arg(type)

  if (!requireNamespace("e1071", quietly = TRUE))
    stop("package 'e1071' must be installed")
  predict.svm <- get("predict.svm", asNamespace("e1071"),
                     inherits = FALSE)

  if(missing(newdata)) {
    # newdata <- eval(object$call$data) # [, as.character(attr(object$terms, "predvars"))]
    # select all predictors: all.vars(attr(r.lm$model, "terms"))
    newdata <- eval(object$call$data)[, all.vars(object$terms)]
  }

  res <- predict.svm(object, newdata, decision.values=decision.values, probability=probability,
                     na.action=na.action, ...)

  switch(type,
         "prob"={
           res <- attr(res, "probabilities")
           if(object$nclasses == 2){
             res <- res[, c(2, 1)]
           }
         },

         "class"={
           res <- res
         },
         "term"={
           res <- attr(res, "decision.values")
         }
  )

  return(res)

}



.predict.C5.0 <- function(object, newdata, type = c("class","prob"),
                          na.action=na.omit, ...) {

  type <- match.arg(type)

  if(missing(newdata)) {
    newdata <- eval(object$call$data)[, all.vars(object$Terms)]
  }

  # res <- predict.svm(object, newdata, decision.values=decision.values, probability=probability,
  #                    na.action=na.action, ...)

  C50::predict.C5.0(object, newdata, type = type, na.action=na.action, ...)
}





OddsRatio.FitMod <- function (x, conf.level = NULL, ...){
  class(x) <- class(x)[class(x) != "FitMod"]
  NextMethod(x, conf.level=conf.level, ...)
}





ROC <- function (x, resp = NULL, ...) {

  if(is.null(resp))
    roc(predictor = predict(x, type="prob")[, 2], response = Response(x), plot=FALSE, ...)
  else
    roc(predictor=x, response = resp, plot=FALSE, ...)

}


BestCut <- function(x, method=c("youden", "closest.topleft")){
  pROC::coords(roc=x, x="best", best.method=method,
               transpose=TRUE)
}


confint.ROC <- function(object, parm, level = 0.95, ...) {
  pROC::ci.coords(roc=object, conf.level = level, ...)
}



FitMod <- function(formula, data, ..., subset, na.action=na.pass, fitfn=NULL){

  cl <- match.call()

  if(is.null(fitfn)){
    # guess fitting function if not provided, based on type of response variable

    # this would return the response variable
    # resp <- eval(cl$data, parent.frame())[, all.vars(cl$formula)[1]]
    # see also:  https://stackoverflow.com/questions/13217322/how-to-reliably-get-dependent-variable-name-from-formula-object

    # evaluate left hand side in order to trap transformations
    resp <- eval(parse(text=cl$formula[2]), envir = eval(cl$data, parent.frame()))

    if(IsDichotomous(resp))
      cl[["fitfn"]] <- fitfn <- "logistic"

    else if(inherits(resp, "factor"))
      if(inherits(resp, "ordered"))
        cl[["fitfn"]] <- fitfn <- "polr"
      else
        cl[["fitfn"]] <- fitfn <- "multinom"

    else if(inherits(resp, "integer"))
      cl[["fitfn"]] <- fitfn <- "poisson"

    else if(inherits(resp, "numeric"))
      cl[["fitfn"]] <- fitfn <- "lm"

    else {
      cl[["fitfn"]] <- fitfn <- "randomForest"
      warning("fitting function could not be reasonably guessed, consider providing fitfn")
    }

  }


  if(fitfn == "lm"){
    fun <- get("lm", asNamespace("stats"), inherits = FALSE)

  } else if(fitfn == "poisson"){
    fun <- get("glm", asNamespace("stats"), inherits = FALSE)
    cl[["family"]] <- "poisson"

  } else if(fitfn == "quasipoisson"){
    fun <- get("glm", asNamespace("stats"), inherits = FALSE)
    cl[["family"]] <- "quasipoisson"

  } else if(fitfn == "gamma"){
    fun <- get("glm", asNamespace("stats"), inherits = FALSE)
    if(is.null(cl[["link"]]))
      cl[["family"]] <- "Gamma"
    else {
      cl[["family"]] <- eval(parse(text = gettextf("function() Gamma(link=%s)", cl[["link"]])), parent.frame())
      cl[["link"]] <- NULL
    }

  } else if(fitfn == "negbin"){
    if (!requireNamespace("MASS", quietly = TRUE))
      stop("package 'MASS' must be installed")
    if(is.null(cl[["theta"]]))
      # no theta available, use glm.nb and fit theta
      fun <- get("glm.nb", asNamespace("MASS"), inherits = FALSE)
    else {
      # theta is given, use usual glm and define family
      fun <- glm
      cl[["family"]] <- eval(parse(text = gettextf("function() negative.binomial(theta=%s)", cl[["theta"]])), parent.frame())
      cl[["theta"]] <- NULL
    }

  } else if(fitfn == "polr"){
    if (!requireNamespace("MASS", quietly = TRUE))
      stop("package 'MASS' must be installed")
    fun <- get("polr", asNamespace("MASS"), inherits = FALSE)
    if(is.null(cl[["Hess"]])) cl[["Hess"]] <- TRUE
    if(is.null(cl[["model"]])) cl[["model"]] <- TRUE

  } else if(fitfn == "lmrob"){
    if (!requireNamespace("robustbase", quietly = TRUE))
      stop("package 'robustbase' must be installed")
    fun <- get("lmrob", asNamespace("robustbase"), inherits = FALSE)

  } else if(fitfn == "tobit"){
    fun <- Tobit

  } else if(fitfn == "zeroinfl"){
    if (!requireNamespace("pscl", quietly = TRUE))
      stop("package 'pscl' must be installed")
    fun <- get("zeroinfl", asNamespace("pscl"), inherits = FALSE)

  } else if(fitfn == "multinom"){
    if (!requireNamespace("nnet", quietly = TRUE))
      stop("package 'nnet' must be installed")
    fun <- get("multinom", asNamespace("nnet"), inherits = FALSE)
    if(is.null(cl[["maxit"]])) cl[["maxit"]] <- 500   # we set a higher default for the iterations here...
    if(is.null(cl[["model"]])) cl[["model"]] <- TRUE
    if(is.null(cl[["trace"]])) cl[["trace"]] <- FALSE

  } else if(fitfn == "randomForest"){
    if (!requireNamespace("randomForest", quietly = TRUE))
      stop("package 'randomForest' must be installed")
    fun <- get("randomForest", asNamespace("randomForest"), inherits = FALSE)
    if(is.null(cl[["na.action"]])) cl[["na.action"]] <- na.omit

  } else if(fitfn == "rpart"){
    if (!requireNamespace("rpart", quietly = TRUE))
      stop("package 'rpart' must be installed")
    fun <- get("rpart", asNamespace("rpart"), inherits = FALSE)
    if(is.null(cl[["model"]])) cl[["model"]] <- TRUE
    if(is.null(cl[["y"]])) cl[["y"]] <- TRUE

  } else if(fitfn == "nnet"){
  if (!requireNamespace("nnet", quietly = TRUE))
    stop("package 'nnet' must be installed")
    fun <- get("nnet", asNamespace("nnet"), inherits = FALSE)
    if(is.null(cl[["maxit"]])) cl[["maxit"]] <- 300   # we set a default for the iterations here...
    if(is.null(cl[["trace"]])) cl[["trace"]] <- FALSE
    if(is.null(cl[["size"]])) cl[["size"]] <- 5

  } else if(fitfn == "C5.0"){
    if (!requireNamespace("C50", quietly = TRUE))
      stop("package 'C50' must be installed")
    fun <- get("C5.0", asNamespace("C50"), inherits = FALSE)

  } else if(fitfn == "lda"){
    if (!requireNamespace("MASS", quietly = TRUE))
      stop("package 'MASS' must be installed")
    fun <- get("lda", asNamespace("MASS"), inherits = FALSE)

  } else if(fitfn == "qda"){
    if (!requireNamespace("MASS", quietly = TRUE))
      stop("package 'MASS' must be installed")
    fun <- get("qda", asNamespace("MASS"), inherits = FALSE)

  } else if(fitfn == "svm"){
    if (!requireNamespace("e1071", quietly = TRUE))
      stop("package 'e1071' must be installed")
    fun <- get("svm", asNamespace("e1071"), inherits = FALSE)
    if(is.null(cl[["probability"]]))
      cl[["probability"]] <- TRUE

  } else if(fitfn == "naive_bayes"){
    if (!requireNamespace("naivebayes", quietly = TRUE))
      stop("package 'naivebayes' must be installed")
    fun <- get("naive_bayes", asNamespace("naivebayes"), inherits = FALSE)

  } else if(fitfn == "lb"){
    fun <- get("LogitBoost", asNamespace("ModTools"), inherits = FALSE)

  } else if(fitfn == "logit"){
    fun <- get("glm", asNamespace("stats"), inherits = FALSE)
    cl[["family"]] <- "binomial"
  }
  cl[["fitfn"]] <- NULL

  # cl[[1]] <- as.name(fun)
  cl[[1]] <- fun

  # evaluate model with given fit function
  res <- eval(cl, parent.frame())
  class(res) <- c("FitMod", class(res))

  if(fitfn == "logit")
    res[["call"]][[1]] <- as.name("glm")

  else if(fitfn %in% c("multinom", "rpart")){
    res[["call"]][[1]] <- as.name(fitfn)

    # extend result object with p-values
    capture.output(sm <- summary(res))
    z <- sm$coefficients / sm$standard.errors

    # 2-tailed Wald z tests to test significance of coefficients
    res[["pval"]] <- (1 - pnorm(abs(z), 0, 1)) * 2

  }


  if(inherits(res, "nnet"))
    # ... and display a warning, if it did not converge
    if(res$convergence == 1)
      warning("nnet() did not (yet) converge, consider increase maxit (and set trace=TRUE)!")

  if(inherits(res, "multinom")){
    res[["drop1"]] <- .drop1.multinom(res)
  }

  if(inherits(res, "polr")){
    ## res[["drop1"]] <- drop1(res, test="Chisq")
    res[["drop1"]] <- .drop1.polr(res)
    if(is.null(cl[["profiling"]]) || !cl[["profiling"]])
      res[["ci"]] <- confint.default(res)
    else
      res[["ci"]] <- confint(res)
  }

  if(inherits(res, "rpart")){
    # add a variables actually used in tree construction
    frame <- res$frame
    leaves <- frame$var == "<leaf>"
    used <- unique(frame$var[!leaves])
    res[["used"]] <- sort(as.character(used))

  }

  if(fitfn %in% c("poisson","gamma", "quasipoisson"))
    res[["call"]][[1]] <- as.name("glm")

  else if(fitfn == "negbin") {
    if (is.null(cl[["family"]]))
      res[["call"]][[1]] <- as.name("glm.nb")
    else
      res[["call"]][[1]] <- as.name("glm")

  } else if(fitfn %in% c("lm","lmrob","polr"))
    res[["call"]][[1]] <- as.name(fitfn)


  res$fitfn <- fitfn

  return(res)

}


predict.FitMod <- function(object, ...){

  if(inherits(object, "multinom")){
    .predict.multinom(object, ...)

  } else if(inherits(object, "nnet")){
    .predict.nnet(object, ...)


  } else if(inherits(object, "glm")){
    .predict.glm(object, ...)

  } else if(inherits(object, "rpart")){
    .predict.rpart(object, ...)

  } else if(inherits(object, "lda")){
    .predict.lda(object, ...)

  } else if(inherits(object, "qda")){
    .predict.qda(object, ...)

  } else if(inherits(object, "svm")){
    .predict.svm(object, ...)

  } else if(inherits(object, "C5.0")){
    .predict.C5.0(object, ...)

  } else {
    NextMethod(object, ...)
  }
}


summary.FitMod <- function(object, ...){
  NextMethod(object, ...)
}


drop1.FitMod <- function(object, ...){
  if(inherits(object, "multinom"))
    .drop1.multinom(object, ...)
  else
    NextMethod(object, ...)
}



print.FitMod <- function(x, ...){
  if(inherits(x, "multinom"))
    .print.multinom(x, ...)

  # else if(inherits(x, "nnet"))
  #   .print.nnet(x, ...)

  else if(inherits(x, "polr"))
     .print.polr(x, ...)

  else
    NextMethod(x, ...)
}

rpart:::plot.rpart


plot.FitMod <- function(x, ...){

  if(inherits(x, "rpart")){
    plot.rpart(x, ...)

    # evt. other defaults:
    # plot.rpart <- function (x = stop("no 'x' arg"), type = 0, extra = 0, under = FALSE,
    #           clip.right.labs = TRUE, fallen.leaves = FALSE, branch = if (fallen.leaves) 1 else 0.2,
    #           uniform = TRUE, digits = 2, varlen = -8, faclen = 3, cex = NULL,
    #           tweak = 1, compress = TRUE, ycompress = uniform, snip = FALSE,
    #           ...){


  } else if(inherits(x, "nnet")){
    # print.nnet <- nnet:::print.nnet
    # print.nnet <- get("print.nnet", asNamespace("NeuralNetTools"),
    #                      inherits = FALSE)
    plotnet(x, ...)

  } else {
    NextMethod(x, ...)
  }
}

as.FitMod <- function(x){
  structure(x, class=c("FitMod", class(x)))
}


update.FitMod <- function(object, ...){
  # update kills the model class, so restore here
  oclass <- class(object)
  class(object) <- class(object)[class(object) != "FitMod"]
  res <- update(object, ...)
  class(res) <- oclass
  return (res)
}




Response <- function(x, ...){

  # response name
  # deparse(attr(terms(model), "variables")[[2]])


  if(inherits(x, "C5.0")){
    x$terms <- eval(x$call$formula)
    res <- model.response(model.frame(x))

  } else if(inherits(x, "rpart") & is.null(x$model)){
    res <- factor(attr(x, "ylevels")[x$y])

  } else if(inherits(x, "naive_bayes")){
    x$terms <- eval(x$call$formula)
    res <- model.response(model.frame(x))

  } else {
    res <- model.response(model.frame(x))
  }

  # get name of response
  attr(res, "response") <- attr(attr(x$terms,"dataClasses"),"names")[attr(x$terms,"response")]

  return(res)

}


ResponseName <- function(x) {

  # https://stat.ethz.ch/pipermail/r-help/2007-November/145649.html
  f <- formula(x)
  if(length(f) < 3) stop("no response")

  resp <- f[[2]]

  if(!is.name(resp)) stop("response is not a name")

  as.character(resp)
}



Conf.FitMod <- function(x, ...){
  if(inherits(x, c("C5.0", "nnet", "naive_bayes", "lda", "LogitBoost")))
    Conf(x=predict(x, type="class"), ref=Response(x), ... )
  else
    NextMethod(x, ...)
}



Cstat.FitMod <- function(x, pos=NULL, ...) {
  pred <- predict(x, type = "prob")
  if(is.null(pos))
    pos <- 2
  if(ncol(pred)==2)
    Cstat(pred[, pos], Response(x), ...)
  else
    NA
}


# test  -------------------------


BreuschPaganTest <- function(formula, varformula = NULL, studentize = TRUE,
                             data = list()) {

  dname <- paste(deparse(substitute(formula)))

  if(!inherits(formula, "formula")) {
    X <- if(is.matrix(formula$x))
      formula$x
    else model.matrix(terms(formula), model.frame(formula))
    y <- if(is.vector(formula$y))
      formula$y
    else model.response(model.frame(formula))
    Z <- if(is.null(varformula)) X
    else model.matrix(varformula, data = data)
  } else {
    mf <- model.frame(formula, data = data)
    y <- model.response(mf)
    X <- model.matrix(formula, data = data)
    Z <- if(is.null(varformula)) X
    else model.matrix(varformula, data = data)
  }

  ## only use complete cases that are in both models
  if(!(all(c(row.names(X) %in% row.names(Z), row.names(Z) %in% row.names(X))))) {
    allnames <- row.names(X)[row.names(X) %in% row.names(Z)]
    X <- X[allnames,]
    Z <- Z[allnames,]
    y <- y[allnames]
  }

  k <- ncol(X)
  n <- nrow(X)

  resi <- lm.fit(X,y)$residuals
  sigma2 <- sum(resi^2)/n

  if(studentize)
  {
    w <- resi^2 - sigma2
    fv <- lm.fit(Z,w)$fitted
    bp <- n * sum(fv^2)/sum(w^2)
    method <- "studentized Breusch-Pagan test"
  }
  else
  {
    f <- resi^2/sigma2 -1
    fv <- lm.fit(Z,f)$fitted
    bp <- 0.5 * sum(fv^2)
    method <- "Breusch-Pagan test"
  }

  names(bp) <- "BP"
  df <- ncol(Z)-1
  names(df) <- "df";
  RVAL <- list(statistic = bp,
               parameter = df,
               method = method,
               p.value= pchisq(bp,df,lower.tail=FALSE),
               data.name=dname)

  class(RVAL) <- "htest"
  return(RVAL)
}




TModC <- function(..., newdata=NULL, reference=NULL, ord=NULL){

  graphrank <- function(i)
    paste(strrep(".", i-1), "x", strrep(".", max(i) - i), sep="")


  mods <- list(...)
  if(length(mods)==1)
    mods <- unlist(mods, recursive = FALSE)

  if(!is.null(newdata)){
    rocs <- list()
    for(i in seq_along(mods))
      rocs[[i]] <-  ROC(predict(mods[[i]], newdata=newdata, type="prob")[, 2],
                        resp = reference, plotit=FALSE)

  } else {
    rocs <- lapply(mods, ROC)
  }


  xname <- names(mods)
  if(is.null(xname))
    xname <- paste("model", seq_along(mods))


  auc <- sapply(rocs, function(x) x$auc)
  bs <- sapply(mods, BrierScore)

  res <- data.frame(auc=sapply(rocs, function(x) x$auc),
                    auc_p=auc/max(auc),
                    auc_rnk=rank(-auc),
                    bs=sapply(mods, BrierScore),
                    bs_p=((1-bs)/max(1-bs)),
                    bs_rnk=rank(bs))
  res$auc_grnk <- graphrank(res$auc_rnk)
  res$bs_grnk <- graphrank(res$bs_rnk)


  ord <- match.arg(ord, choices = c("ensemble", colnames(res)))

  if(ord=="bs")
    res <- Sort(res, ord="bs", decreasing=FALSE)

  else if(ord=="ensemble"){
    id <- order(apply(res[, c("auc_p","bs_p")], 1, mean),
                decreasing = TRUE)
    res <- res[id, ]

  } else {
    res <- Sort(res, ord=ord, decreasing=TRUE)

  }

  res <- list(tab=res, rocs=rocs, xname=xname)

  class(res) <- c("TModC", class(res))
  res
}



print.TModC <- function(x, ...){

  print.data.frame(Format(x$tab, digits=c(3,3,0,3,3,0,0,0)),
                   print.gap=2, ...)

}


plot.TModC <- function(x, col=NULL, args.legend=NULL, ...){

  if(is.null(col))
    cols <- Pal("Tibco")
  else
    cols <- col


  sapply(seq_along(x$rocs),
         function(i) plot(x$rocs[[i]], add=(i!=1), col=cols[i]))

  grid()

  args.legend1 <- list(x = "bottomright", inset = 0, legend = x$xname,
                       fill = cols, bg = "white", cex = 1.0)
  if (!is.null(args.legend)) {
    args.legend1[names(args.legend)] <- args.legend
  }
  add.legend <- TRUE
  if (!is.null(args.legend))
    if (identical(args.legend, NA)) {
      add.legend <- FALSE
    }
  if (add.legend)
    DoCall("legend", args.legend1)


}




Conf.polr <- function(x, ...) {

  # extract response from the model

  Conf(x=predict(x),
       ref=model.extract(model.frame(x), "response") , ... )
}

.drop1.multinom <-  function (object, scope, test = c("Chisq","none"), ...) {

  if (!inherits(object, "multinom"))
    stop("Not a multinom fit")
  if (missing(scope))
    scope <- drop.scope(object)
  else {
    if (!is.character(scope))
      scope <- attr(terms(update.formula(object, scope)),
                    "term.labels")
    if (!all(match(scope, attr(object$terms, "term.labels"),
                   nomatch = FALSE)))
      stop("scope is not a subset of term labels")
  }
  ns <- length(scope)
  ans <- matrix(nrow = ns + 1, ncol = 2, dimnames = list(c("<none>",
                                                           scope), c("Df", "AIC")))
  ans[1, ] <- c(object$edf, object$AIC)
  if (test[1]=="Chisq") ans <- cbind(ans,Chisq=NA, p.value=NA)
  env <- environment(formula(object))
  for(i in seq(ns)) {
    tt <- scope[i]

    nfit <- update(object, as.formula(paste("~ . -", tt)), evaluate = FALSE)

    # this is additionally to automatically handle missing values
    # it is not always clear, what impact missings have, so handle with care
    # maybe place a warning... see help drop1.multinom
    # !!!!
    nfit$data <- str2lang(gettextf("na.omit(%s[, c(%s)])",
                        deparse(nfit$data), paste(sQuote(all.vars(formula(object))), collapse=",")))

    nfit <- eval(nfit, envir=env) # was  eval.parent(nfit)

    if (nfit$edf == object$edf)
      nfit$AIC <- NA
    ans[i+1, ] <- c(nfit$edf, nfit$AIC,
                    if (test[1]=="Chisq") unlist(anova(object, nfit)[2,6:7]))
  }
  as.data.frame(ans)
}


OddsRatio.polr <- function(x, conf.level=NULL, digits=3, ...) {

  if(is.null(conf.level)) conf.level <- 0.95

  # class(x) <- class(x)[class(x)!="regr"]
  r.summary <- summary(x, Wald.ratios = TRUE)

  coe <- SetNames(r.summary$coefficients[,1, drop=FALSE], colnames="or")
  se <- r.summary$coefficients[,2]

  d.res <- r.summary

  d.print <- data.frame(
    "or"= Format(exp(coe[, "or"]), digits=digits),
    "or.lci" = Format(exp(coe[, "or"] + qnorm(0.025) * se), digits=digits),
    "or.uci" = Format(exp(coe[, "or"] - qnorm(0.025) * se), digits=digits),
    "pval" = Format(2*(1-pnorm(q = abs(coe[, "or"]/se), mean=0, sd=1)), fmt="p", digits=3),
    "sig" = Format(2*(1-pnorm(q = abs(coe[, "or"]/se), mean=0, sd=1)), fmt="*"),
    stringsAsFactors = FALSE
  )

  colnames(d.print)[4:5] <- c("Pr(>|z|)","")
  rownames(d.print) <- paste(coe$time, coe$id, sep=":")

  res <- list(or = d.print, call = x$call,
              BrierScore = NA, # BrierScore(x),
              PseudoR2 = PseudoR2(x, which="all"), res=d.res)

  class(res) <- "OddsRatio"
  return(res)

}


.drop1.polr <- function (mod, ...) {

  # based on car:::Anova.II.polr

  term.names <-
    function (model, ...)
    {
      term.names <- labels(terms(model))
      if (any(names(coefficients(model)) == "(Intercept)"))  # (has.intercept(model))
        c("(Intercept)", term.names)
      else term.names
    }


  df.terms.polr <-  function (model, term, ...) {

    if (!missing(term) && 1 == length(term)) {
      assign <- attr(model.matrix(model), "assign")
      which.term <- which(term == labels(terms(model)))
      if (0 == length(which.term))
        stop(paste(term, "is not in the model."))
      sum(assign == which.term)
    }
    else {
      terms <- if (missing(term))
        labels(terms(model))
      else term
      result <- numeric(0)
      for (term in terms) result <- c(result, Recall(model,
                                                     term))
      names(result) <- terms
      result
    }
  }

  relatives <- function (term, names, factors) {
    is.relative <- function(term1, term2) {
      all(!(factors[, term1] & (!factors[, term2])))
    }
    if (length(names) == 1)
      return(NULL)
    which.term <- which(term == names)
    (1:length(names))[-which.term][sapply(names[-which.term],
                                          function(term2) is.relative(term, term2))]
  }

  responseName <- function(model, ...)
    deparse(attr(terms(model), "variables")[[2]])


  if (!requireNamespace("MASS"))
    stop("MASS package is missing")
  which.nms <- function(name) which(asgn == which(names == name))

  fac <- attr(terms(mod), "factors")
  names <- term.names(mod)
  n.terms <- length(names)
  X <- model.matrix(mod)
  y <- model.response(model.frame(mod))
  wt <- model.weights(model.frame(mod))
  asgn <- attr(X, "assign")
  aic <- p <- LR <- rep(0, n.terms)
  df <- df.terms.polr(mod)

  for (term in 1:n.terms) {
    rels <- names[relatives(names[term], names, fac)]
    exclude.1 <- as.vector(unlist(sapply(c(names[term], rels),
                                         which.nms)))
    mod.1 <- if (n.terms > 1)
      MASS::polr(y ~ X[, -c(1, exclude.1)], weights = wt)
    else MASS::polr(y ~ 1, weights = wt)

    dev.1 <- deviance(mod.1)
    mod.2 <- if (length(rels) == 0)
      mod
    else {
      exclude.2 <- as.vector(unlist(sapply(rels, which.nms)))
      MASS::polr(y ~ X[, -c(1, exclude.2)], weights = wt)
    }
    dev.2 <- deviance(mod.2)
    LR[term] <- dev.1 - dev.2
    p[term] <- pchisq(LR[term], df[term], lower.tail = FALSE)
    aic[term] <- AIC(mod.1)
  }

  result <- data.frame(aic, LR, df, p)
  row.names(result) <- names
  names(result) <- c("AIC", "LR Chisq", "Df", "Pr(>Chisq)")
  class(result) <- c("anova", "data.frame")
  attr(result, "heading") <- c("Analysis of Deviance Table (Type II tests)\n",
                               paste("Response:", responseName(mod)))
  result

}



LowVar <- function(x, na.rm=FALSE, uvalp = 0.2, freq = 20, method=c("both", "unique", "freq")) {

  # The functionality is realized in two main steps:
  # 1. Check for near zero variance predictors and flag as near zero if:
  #   (a) the percentage of unique values is less than 20
  #   (b) the ratio of the most frequent to the second most frequent value is greater than 20,
  # 2. Check for susceptibility to multicollinearity
  #   (a) Calculate correlation matrix
  #   (b) Find variables with correlation 0.9 or more and delete them



  if (na.rm) x <- na.omit(x)
  method <- match.arg(method)

  lw1 <- (1 - sum(duplicated(x)) / length(x)) < uvalp

  m1 <- DescTools::Mode(x, na.rm)
  if(identical(m1, structure(NA_real_, freq = 1L)))
    # all values are unique, lw2 = FALSE
    lw2 <- FALSE

  else {
    m2 <- DescTools::Mode(x[x != m1], na.rm)
    lw2 <- (attr(m1, "freq") / attr(m2, "freq")) > freq

  }

  if(method == "unique")
    res <- lw1

  else if(method == "freq")
    res <- lw2

  else if(method == "both")
    res <- lw1 | lw2

  return(res)

}



RefLevel <- function(x){

  refCat <- function(model, var) {
    cs <- attr(model.matrix(model), "contrasts")[[var]]
    if (is.character(cs)) {
      if (cs == "contr.treatment")
        ref <- 1
      else stop("No treatment contrast")
    }
    else {
      zeroes <- !cs
      ones <- cs == 1
      stopifnot(all(zeroes | ones))
      cos <- colSums(ones)
      stopifnot(all(cos == 1))
      ros <- rowSums(ones)
      stopifnot(sum(!ros) == 1 && sum(ros) != ncol(cs))
      ref <- which(!ros)
    }
    return(levels(model$model[[var]])[ref])
  }

  # find factor predvals
  fpred <- names(grep("factor|ordered", attr(x[["terms"]], "dataClasses"), value=TRUE))
  resp <- all.vars(formula(x))[1]
  fpred <- fpred[fpred != resp]

  # list( complicated=
  # sapply(fpred , function(z) refCat(x, z))
  #
  # # or simply?
  # , simple=sapply(x$xlevels, "[", 1))

  sapply(fpred , function(z) refCat(x, z))

}


.InsertRefLevel <- function(ref, coeftable, pval){

  # call: .InsertRefLevel(ref=RefLevel(r.lm)$simple,
  #                       coefficients(r.lm))

  # insert reference levels for factors in a coefficient table
  # rr <- RefLevel(x)$simple

  for(i in seq_along(ref)){
    ff <- names(ref[i])
    rl <- ref[i]
    j <- head(grep(ff, rownames(coeftable)), 1)
    coeftable <- Append(coeftable, NA, after=j-1, rows=TRUE)
    rownames(coeftable)[grep(ff, rownames(coeftable))] <-
      gsub(ff, "  ", rownames(coeftable)[grep(ff, rownames(coeftable))])
    rownames(coeftable)[j] <- gettextf("%s (Ref: %s)", ff, rl)

    if(!is.null(pval))
      coeftable[j, 8] <- Format(pval[grep(ff, rownames(pval)), ], fmt="*")

  }

  as.data.frame(coeftable)

}



InsertRow <- function(x, val, after=nrow(x)) {

  # insert a row in a data.frame

  x[seq(after+1, nrow(x)+1), ] <- x[seq(after, nrow(x)), ]
  x[after, ] <- val

  x

}


OverSample <- function(x, vname){
  p <- sort(prop.table(table(x[, vname])))[1]
  xbig <- x[x[,vname] != names(p),]
  return(rbind(xbig, Sample(x[x[,vname] == names(p),], size=nrow(xbig), replace=TRUE)))
}


UnderSample <- function(x, vname){
  p <- sort(prop.table(table(x[, vname])))[1]
  xsmall <- x[x[,vname] == names(p),]
  return(rbind(xsmall, Sample(x[x[,vname] != names(p),], size=nrow(xsmall), replace=FALSE)))
}




RobSummary <- function(mod, conf.level=0.95, type="HC0"){

  sterr <- sqrt(diag(vcovHC(mod, type=type)))
  alpha <- 1 - (1-conf.level)/2

  if(inherits(mod, "glm")) {
    res <- cbind(est= coef(mod),
                 lci = coef(mod) - qnorm(alpha) * sterr,
                 uci = coef(mod) + qnorm(alpha) * sterr,
                 rse = sterr,
                 zval = coef(mod)/sterr,
                 pval = 2 * pnorm(abs(coef(mod)/sterr), lower.tail=FALSE))

  } else if(inherits(mod, "lm")) {

    res <- cbind(est= coef(mod),
                 lci = coef(mod) - qt(alpha, df = mod$df) * sterr,
                 uci = coef(mod) + qt(alpha, df = mod$df) * sterr,
                 rse = sterr,
                 zval = coef(mod)/sterr,
                 pval = 2 * pt(abs(coef(mod)/sterr), df=mod$df, lower.tail=FALSE)
    )

  }

  return(res)

}




PredictCI <- function(mod, newdata, conf.level=0.95){

  preds <- predict(mod, newdata=newdata, type="link", se.fit=TRUE)
  critval <- -qnorm((1-conf.level)/2)
  res <- cbind(
    fit <- preds$fit
    , lwr <- preds$fit - critval * preds$se.fit
    , upr <- preds$fit + critval * preds$se.fit
  )

  res <- SetNames(mod$family$linkinv(res),
                  colnames=c("prob", "lci", "uci"))

  return(res)
}
AndriSignorell/ModTools documentation built on Jan. 17, 2025, 12:37 a.m.