R/dummyVars_MSqRob.R

Defines functions predict.dummyVars_MSqRob

Documented in predict.dummyVars_MSqRob predict.dummyVars_MSqRob

#' Create A Full Set of Dummy Variables
#'
#' \code{dummyVars_MSqRob} creates a full set of dummy variables (i.e. less than full
#' rank parameterization)
#'
#' Most of the \code{\link[stats]{contrasts}} functions in R produce full rank
#' parameterizations of the predictor data. For example,
#' \code{\link[stats]{contr.treatment}} creates a reference cell in the data
#' and defines dummy variables for all factor levels except those in the
#' reference cell. For example, if a factor with 5 levels is used in a model
#' formula alone, \code{\link[stats]{contr.treatment}} creates columns for the
#' intercept and all the factor levels except the first level of the factor.
#' For the data in the Example section below, this would produce:
#' \preformatted{ (Intercept) dayTue dayWed dayThu dayFri daySat daySun 1 1 1 0
#' 0 0 0 0 2 1 1 0 0 0 0 0 3 1 1 0 0 0 0 0 4 1 0 0 1 0 0 0 5 1 0 0 1 0 0 0 6 1
#' 0 0 0 0 0 0 7 1 0 1 0 0 0 0 8 1 0 1 0 0 0 0 9 1 0 0 0 0 0 0 }
#'
#' In some situations, there may be a need for dummy variables for all the
#' levels of the factor. For the same example: \preformatted{ dayMon dayTue
#' dayWed dayThu dayFri daySat daySun 1 0 1 0 0 0 0 0 2 0 1 0 0 0 0 0 3 0 1 0 0
#' 0 0 0 4 0 0 0 1 0 0 0 5 0 0 0 1 0 0 0 6 1 0 0 0 0 0 0 7 0 0 1 0 0 0 0 8 0 0
#' 1 0 0 0 0 9 1 0 0 0 0 0 0 }
#'
#' Given a formula and initial data set, the class \code{dummyVars_MSqRob} gathers all
#' the information needed to produce a full set of dummy variables for any data
#' set. It uses \code{contr.ltfr_MSqRob} as the base function to do this.
#'
#' \code{class2ind} is most useful for converting a factor outcome vector to a
#' matrix of dummy variables.
#'
#' @aliases dummyVars_MSqRob dummyVars_MSqRob.default predict.dummyVars_MSqRob
#' contr.ltfr_MSqRob
#' @param formula An appropriate R model formula, see References
#' @param data A data frame with the predictors of interest
#' @param sep An optional separator between factor variable names and their
#' levels. Use \code{sep = NULL} for no separator (i.e. normal behavior of
#' \code{\link[stats]{model.matrix}} as shown in the Details section)
#' @param levelsOnly A logical; \code{TRUE} means to completely remove the
#' variable names from the column names
#' @param fullRank A logical; should a full rank or less than full rank
#' parameterization be used? If \code{TRUE}, factors are encoded to be
#' consistent with \code{\link[stats]{model.matrix}} and the resulting there
#' are no linear dependencies induced between the columns.
#' @param object An object of class \code{dummyVars_MSqRob}
#' @param newdata A data frame with the required columns
#' @param na.action A function determining what should be done with missing
#' values in \code{newdata}. The default is to predict \code{NA}.
#' @param n A vector of levels for a factor, or the number of levels.
#' @param contrasts A logical indicating whether contrasts should be computed.
#' @param sparse A logical indicating if the result should be sparse.
#' @param x A factor vector.
#' @param ... additional arguments to be passed to other methods
#' @return The output of \code{dummyVars_MSqRob} is a list of class 'dummyVars_MSqRob' with
#' elements \item{call }{the function call} \item{form }{the model formula}
#' \item{vars }{names of all the variables in the model} \item{facVars }{names
#' of all the factor variables in the model} \item{lvls }{levels of any factor
#' variables} \item{sep }{\code{NULL} or a character separator} \item{terms
#' }{the \code{\link[stats]{terms.formula}} object} \item{levelsOnly }{a
#' logical}
#'
#' The \code{predict} function produces a data frame.
#'
#' \code{contr.ltfr_MSqRob} generates a design matrix.
#' @author \code{contr.ltfr_MSqRob} is a small modification of
#' \code{\link[stats]{contr.treatment}} by Max Kuhn
#' @seealso \code{\link[stats]{model.matrix}}, \code{\link[stats]{contrasts}},
#' \code{\link[stats]{formula}}
#' @references
#' \url{https://cran.r-project.org/doc/manuals/R-intro.html#Formulae-for-statistical-models}
#' @keywords models
#' @examples
#'
#'
#' when <- data.frame(time = c("afternoon", "night", "afternoon",
#'                             "morning", "morning", "morning",
#'                             "morning", "afternoon", "afternoon"),
#'                    day = c("Mon", "Mon", "Mon",
#'                            "Wed", "Wed", "Fri",
#'                            "Sat", "Sat", "Fri"))
#'
#' levels(when$time) <- list(morning="morning",
#'                           afternoon="afternoon",
#'                           night="night")
#' levels(when$day) <- list(Mon="Mon", Tue="Tue", Wed="Wed", Thu="Thu",
#'                          Fri="Fri", Sat="Sat", Sun="Sun")
#'
#' ## Default behavior:
#' model.matrix(~day, when)
#'
#' mainEffects <- dummyVars_MSqRob(~ day + time, data = when)
#' mainEffects
#' predict(mainEffects, when[1:3,])
#'
#' when2 <- when
#' when2[1, 1] <- NA
#' predict(mainEffects, when2[1:3,])
#' predict(mainEffects, when2[1:3,], na.action = na.omit)
#'
#'
#' interactionModel <- dummyVars_MSqRob(~ day + time + day:time,
#'                               data = when,
#'                               sep = ".")
#' predict(interactionModel, when[1:3,])
#'
#' noNames <- dummyVars_MSqRob(~ day + time + day:time,
#'                      data = when,
#'                      levelsOnly = TRUE)
#' predict(noNames, when)
#'
#' @export dummyVars_MSqRob
"dummyVars_MSqRob" <-
  function(formula, ...){
    UseMethod("dummyVars_MSqRob")
  }

#' @rdname dummyVars_MSqRob
#' @method dummyVars_MSqRob default
#' @importFrom stats as.formula model.frame
#' @export
dummyVars_MSqRob.default <- function (formula, data, sep = ".", levelsOnly = FALSE, fullRank = FALSE, ...)
{
  formula <- as.formula(formula)
  if(!is.data.frame(data)) data <- as.data.frame(data)

  vars <- all.vars(formula)
  if(any(vars == "."))
  {
    vars <- vars[vars != "."]
    vars <- unique(c(vars, colnames(data)))
  }
  isFac <- unlist(lapply(data[,vars,drop = FALSE], is.factor))
  if(sum(isFac) > 0)
  {
    facVars <- vars[isFac]
    lvls <- lapply(data[,facVars,drop = FALSE], levels)
    if(levelsOnly)
    {
      tabs <- table(unlist(lvls))
      if(any(tabs > 1))
      {
        stop(paste("You requested `levelsOnly = TRUE` but",
                   "the following levels are not unique",
                   "across predictors:",
                   paste(names(tabs)[tabs > 1], collapse = ", ")))
      }
    }
  } else {
    facVars <- NULL
    lvls <- NULL
  }
  trms <- attr(model.frame(formula, data), "terms")
  out <- list(call = match.call(),
              form = formula,
              vars = vars,
              facVars = facVars,
              lvls = lvls,
              sep = sep,
              terms = trms,
              levelsOnly = levelsOnly,
              fullRank = fullRank)
  class(out) <- "dummyVars_MSqRob"
  out

}


#' @rdname dummyVars_MSqRob
#' @method predict dummyVars_MSqRob
#' @importFrom stats delete.response model.frame model.matrix na.pass
#' @export
predict.dummyVars_MSqRob <- function(object, newdata, na.action = na.pass, ...)
{
  if(is.null(newdata)) stop("newdata must be supplied")
  if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
  if(!all(object$vars %in% names(newdata))) stop(
    paste("Variable(s)",
          paste("'", object$vars[object$vars %in% names(newdata)],
                "'", sep = "",
                collapse = ", "),
          "are not in newdata"))
  Terms <- object$terms
  Terms <- delete.response(Terms)
  if(!object$fullRank)
  {
    oldContr <- options("contrasts")$contrasts
    newContr <- oldContr
    newContr["unordered"] <- "contr.ltfr_MSqRob"
    options(contrasts = newContr)
  }
  m <- model.frame(Terms, newdata, na.action = na.action, xlev = object$lvls)

  x <- model.matrix(Terms, m)
  if(!object$fullRank) options(contrasts = oldContr)

  if(object$levelsOnly) {
    for(i in object$facVars) {
      for(j in object$lvls[[i]]) {
        from_text <- paste0(i, j)
        colnames(x) <- gsub(from_text, j, colnames(x), fixed = TRUE)
      }
    }
  }
  if(!is.null(object$sep) & !object$levelsOnly) {
    for(i in object$facVars[order(-nchar(object$facVars))]) {
      ## the default output form model.matrix is NAMElevel with no separator.
      for(j in object$lvls[[i]]) {
        from_text <- paste0(i, j)
        to_text <- paste(i, j, sep = object$sep)
        colnames(x) <- gsub(from_text, to_text, colnames(x), fixed = TRUE)
      }
    }
  }
  x[, colnames(x) != "(Intercept)", drop = FALSE]
}


#' @rdname dummyVars_MSqRob
#' @export
contr.ltfr_MSqRob <- function (n, contrasts = TRUE, sparse = FALSE)
{
  if (is.numeric(n) && length(n) == 1L) {
    if (n > 1L)
      levels <- as.character(seq_len(n))
    else stop("not enough degrees of freedom to define contrasts")
  }
  else {
    levels <- as.character(n)
    n <- length(n)
  }
  contr <- .RDiag(levels, sparse = sparse)
  if (contrasts) {
    if (n < 2L) stop(gettextf("contrasts not defined for %d degrees of freedom", n - 1L), domain = NA)
  }
  contr
}


#  File src/library/stats/R/contrast.R
#  Part of the R package, http://www.R-project.org
#
#  Copyright (C) 1995-2012 The R Core Team
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  A copy of the GNU General Public License is available at
#  http://www.r-project.org/Licenses/

## a fast version of diag(n, .) / sparse-Diagonal() + dimnames
## Originally .Diag in stats:contrast.R

.RDiag <- function (nms, sparse) {
  n <- as.integer(length(nms))
  d <- c(n, n)
  dn <- list(nms, nms)
  #   if (sparse) {
  #     requireNamespace("Matrix", quietly = TRUE)
  #     new("ddiMatrix", diag = "U", Dim = d, Dimnames = dn)
  #   }
  #   else
  array(c(rep.int(c(1, numeric(n)), n - 1L), 1), d, dn)
}
statOmics/MSqRob documentation built on Dec. 8, 2022, 6 a.m.