R/merData.R

Defines functions REquantile wiggle single_wiggle shuffle superFactor averageObs hasWeights findFormFuns subsetList collapseFrame randomObs draw.merMod draw stripAttributes sanitizeNames

Documented in averageObs collapseFrame draw draw.merMod findFormFuns hasWeights randomObs REquantile sanitizeNames shuffle stripAttributes subsetList superFactor wiggle

#' @title Clean up variable names in data frames
#' @name sanitizeNames
#' @description Strips out transformations from variable names in data frames
#' @param data a data.frame
#' @return a data frame with variable names cleaned to remove factor() construction
sanitizeNames <- function(data){
  badFac <- grep("factor\\(", names(data))
  for(i in badFac){
    names(data)[i] <- gsub("factor\\(", "", names(data)[i])
    names(data)[i] <- gsub("\\)", "", names(data)[i])
  }
  row.names(data) <- NULL
  return(data)
}

#' @title Remove attributes from a data.frame
#' @name stripAttributes
#' @description Strips attributes off of a data frame that come with a merMod model.frame
#' @param data a data.frame
#' @return a data frame with variable names cleaned to remove all attributes except for
#' names, row.names, and class
stripAttributes <- function(data){
  attr <- names(attributes(data))
  good <- c("names", "row.names", "class")
  for(i in attr[!attr %in% good]){
    attr(data, i) <- NULL
  }
  return(data)
}


#' @title Draw a single observation out of an object matching some criteria
#' @name draw
#' @description Draw is used to select a single observation out of an R object.
#' Additional parameters allow the user to control how that observation is
#' chosen in order to manipulate that observation later. This is a generic
#' function with methods for a number of objects.
#' @param object the object to draw from
#' @param type what kind of draw to make. Options include random or average
#' @param varList a list specifying filters to subset the data by when making the
#' draw
#' @param seed numeric, optional argument to set seed for simulations, ignored if type="average"
#' @param ... additional arguments required by certain methods
#' @return a data.frame with a single row representing the desired observation
#' @details In cases of tie, ".", may be substituted for factors.
#' @export draw
#' @rdname draw
draw <- function(object, type = c("random", "average"),
                 varList = NULL, seed = NULL, ...){
  UseMethod("draw")
}

#' @title Draw an observation from a merMod object
#' @rdname draw
#' @method draw merMod
#' @export
#' @import lme4
#' @examples
#' fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
#' # Random case
#' draw(fm1, type = "random")
#' # Average
#' draw(fm1, type = "average")
#' # Subset
#' draw(fm1, type = "average", varList = list("Subject" = "308"))
#'
draw.merMod <- function(object, type = c("random", "average"),
                        varList = NULL, seed = NULL, ...){
  type <- match.arg(type, c("random", "average"), several.ok = FALSE)
  if(type == 'random'){
    out <- randomObs(object, varList, seed)
    return(out)
  } else if(type == 'average'){
    out <- averageObs(object, varList, ...)
    return(out)
  }
}

#' @title Select a random observation from model data
#' @name randomObs
#' @description Select a random observation from the model frame of a merMod
#' @param merMod an object of class merMod
#' @param varList optional, a named list of conditions to subset the data on
#' @param seed numeric, optional argument to set seed for simulations
#' @return a data frame with a single row for a random observation, but with full
#' factor levels. See details for more.
#' @details Each factor variable in the data frame has all factor levels from the
#' full model.frame stored so that the new data is compatible with predict.merMod
#' @export
randomObs <- function(merMod, varList, seed = NULL){
  if(!missing(varList)){
    data <- subsetList(merMod@frame, varList)
  }

  if (!is.null(seed))
    set.seed(seed)
  else if (!exists(".Random.seed", envir = .GlobalEnv))
    runif(1)

  out <- data[sample(1:nrow(data), 1),]
  chars <- !sapply(out, is.numeric)
  for(i in names(out[, chars])){
    out[, i] <- superFactor(out[, i], fullLev = unique(merMod@frame[, i]))
  }
  out <- stripAttributes(out)
  return(out)
}

#' @title Collapse a dataframe to a single average row
#' @name collapseFrame
#' @description Take an entire dataframe and summarize it in one row by using the
#' mean and mode.
#' @param data a data.frame
#' @return a data frame with a single row
#' @details Each character and factor variable in the data.frame is assigned to the
#' modal category and each numeric variable is collapsed to the mean. Currently if
#' mode is a tie, returns a "."
collapseFrame <- function(data){
  chars <- !sapply(data, is.numeric)
  chars <- names(data[, chars, drop = FALSE])
  nums <- sapply(data, is.numeric)
  nums <- names(data[, nums, drop = FALSE])
  numDat <- apply(data[, nums, drop = FALSE], 2, mean)
  statmode <- function(x){
    z <- table(as.vector(x))
    m <- names(z)[z == max(z)]
    if (length(m) == 1) {
      return(m)
    }
    return(".")
  }
  charDat <- apply(data[, chars, drop = FALSE], 2, statmode)
  cfdata <- cbind(as.data.frame(t(numDat)), as.data.frame(t(charDat)))
  cfdata <- cfdata[, names(data)]
  return(cfdata)
}


#' @title Subset a data.frame using a list of conditions
#' @name subsetList
#' @description Split a data.frame by elements in a list
#' @param data a data.frame
#' @param list a named list of splitting conditions
#' @return a data frame with values that match the conditions in the list
subsetList <- function(data, list){
  if("logical" %in% unlist(lapply(list, class))){
    stop("List is improperly formatted. Try using only `=` instead of `==` in subsets")
  }
  for(i in names(list)){
    data <- split(data, data[, i])
    data <- data[[list[[i]]]]
    data <- as.data.frame(data)
  }
  return(data)
}

#' \code{findFormFuns} used by \link[merTools]{averageObs} to calculate proper
#' averages
#'
#' The purpose is to properly derive data for the average observation in the
#' data by being 'aware' of formulas that contain interactions and/or function
#' calls. For example, in the old behavior, if the formula contained a square
#' term specified as \code{I(x^2)}, we were returning the mean of \code{x(^2)} not the
#' square of mean(x).
#'
#' @param merMod the merMod object from which to draw the average observation
#' @param origData (default=NULL) a data frame containing the original,
#'        untransformed data used to call the model. This MUST be specified if
#'        the original variables used in formula function calls are NOT present
#'        as 'main effects'.
#'
#' @return a data frame with a single row for the average observation, but with full
#' factor levels. See details for more.
#'
#' @export
findFormFuns <- function(merMod, origData = NULL) {
  form <- getCall(merMod)$formula
  form.rhs <- delete.response(terms(form))
  modFrame <- model.frame(merMod)
  if (identical(modFrame, origData)) {
    origData = NULL
  }
  modFrame.tt <- terms(modFrame)
  #This part is a bit kludgy but should work
  modFrame.labels <- unique(unlist(strsplit(attr(modFrame.tt, "term.labels"), split = ":", fixed = TRUE)))
  modFrame.resp <- setdiff(rownames(attr(modFrame.tt, "factors")),
                           unique(unlist(strsplit(colnames(attr(modFrame.tt, "factors")), split = ":", fixed = TRUE))))

  merMod.weights <- hasWeights(merMod)
  if (merMod.weights) {
    modFrame <- modFrame[, c(modFrame.resp, modFrame.labels, "(weights)")]
  } else {
    modFrame <- modFrame[, c(modFrame.resp, modFrame.labels)]
  }

  #Scan RHS of formula labels for parens -> exit if clean
  paren_terms <- grepl("[()]", c(modFrame.resp, modFrame.labels))

  if (!any(paren_terms)) {
    if(is.null(origData)){
      out <- collapseFrame(modFrame)
    } else{
      out <- collapseFrame(origData)
    }
    return(out)
  } else {
    rhs.vars <- all.vars(form.rhs)
    # if (merMod.weights) {
    #   rhs.vars <- c(rhs.vars, c("(weights)"))
    # }
    #Warning if functions are detected but neither MAIN EFFECTS NOR DATA are supplied
    if (is.null(origData))  {
      if (!all(rhs.vars %in% modFrame.labels)) {
        warning(paste("\n\n  Functions detected in formula without user supplied data",
                      "  or main effects of affected variables so returning means of",
                      "  transformed variables.\n",
                      "  Make sure that this is appropriate or supply untransformed",
                      "  data using the 'origData' argument. See ?merTools::findFormFuns", sep = "\n"))
        out <- collapseFrame(modFrame)
        return(out)
      } else {
        #Functions Detected and Main Effects Present
        out <- collapseFrame(modFrame)
        for (i in which(paren_terms)) {
          out[1,i] <- eval(parse(text = colnames(out)[i]), envir = out[, rhs.vars])
        }
        return(out)
      }
    } else {
      #Functions Detected and Not All Main Effects Present ... but Data supplied
      out <- collapseFrame(modFrame)
      outData <- collapseFrame(origData)
      for (i in which(paren_terms)) {
        out[1,i] <- eval(parse(text = colnames(out)[i]), envir = outData)
      }
      return(out)
    }
  }
}

#' Identify if a merMod has weights
#'
#' @param merMod the merMod object to test for weights
#'
#' @return TRUE or FALSE for whether the model has weights
hasWeights <- function(merMod) {
  if (all(merMod@resp$weights == 1)) {
    FALSE
  } else {
    TRUE
  }
}


#' @title Find the average observation for a merMod object
#' @name averageObs
#' @description Extract a data frame of a single row that represents the
#' average observation in a merMod object. This function also allows the
#' user to pass a series of conditioning argument to calculate the average
#' observation conditional on other characteristics.
#' @param merMod a merMod object
#' @param varList optional, a named list of conditions to subset the data on
#' @param origData (default=NULL) a data frame containing the original,
#'        untransformed data used to call the model. This MUST be specified if
#'        the original variables used in formula function calls are NOT present
#'        as 'main effects'.
#' @param ... not used currently
#' @return a data frame with a single row for the average observation, but with full
#' factor levels. See details for more.
#' @details Each character and factor variable in the data.frame is assigned to the
#' modal category and each numeric variable is collapsed to the mean. Currently if
#' mode is a tie, returns a "." Uses the collapseFrame function.
#' @export
averageObs <- function(merMod, varList = NULL, origData = NULL, ...){
  if(!missing(varList)){
    if (is.null(origData)) {
      data <- subsetList(merMod@frame, varList)
    } else {
      data <- subsetList(origData, varList)
    }
    if(nrow(data) < 20 & nrow(data) > 2){
      warning("Subset has less than 20 rows, averages may be problematic.")
    }
    if(nrow(data) <3){
      warning("Subset has fewer than 3 rows, computing global average instead.")
      if (is.null(origData)) {
        data <- merMod@frame
      } else {
        data <- origData
      }
    }
  } else{
    if (is.null(origData)) {
      data <- merMod@frame
    } else {
      data <- origData
    }
  }
  out <- findFormFuns(merMod, origData = data)
  reTerms <- names(ngrps(merMod))
  if(any(reTerms %in% names(varList))){
    reTerms <- reTerms[!reTerms %in% names(varList)]
  }
  if(length(reTerms) > 0){
    for(i in 1:length(reTerms)){
      out[, reTerms[i]] <- REquantile(merMod = merMod,
                                      quantile = 0.5, groupFctr = reTerms[[i]])
      out[, reTerms[i]] <- as.character(out[, reTerms[i]])
    }
  }
  chars <- !sapply(out, is.numeric)
  for(i in names(out[, chars, drop = FALSE])){ # drop = FALSE
    out[, i] <- try(superFactor(out[, i], fullLev = unique(merMod@frame[, i])), silent = TRUE)
  }
  out <- stripAttributes(out)
  out <- out[, names(merMod@frame)]
  return(out)
}


#' @title Create a factor with unobserved levels
#' @name superFactor
#' @description Create a factor variable and include unobserved levels
#' for compatibility with model prediction functions
#' @param x a vector to be converted to a factor
#' @param fullLev a vector of factor levels to be assigned to x
#' @return a factor variable with all observed levels of x and all levels
#' of x in fullLev
#' @export
#' @examples
#' \donttest{
#' regularFactor <- c("A", "B", "C")
#' regularFactor <- factor(regularFactor)
#' levels(regularFactor)
#' # Now make it super
#' newLevs <- c("D", "E", "F")
#' regularFactor <- superFactor(regularFactor, fullLev = newLevs)
#' levels(regularFactor) # now super
#' }
superFactor <- function(x, fullLev){
  x <- as.character(x)
  if("factor" %in% class(fullLev)){
    fullLev <- unique(levels(fullLev))
  }
  unobsLev <- unique(x)[!unique(x) %in% fullLev]
  x <- factor(x, levels = c(fullLev, unobsLev),
              labels = c(fullLev, unobsLev))
  return(x)
}

#' @title Randomly reorder a dataframe
#' @name shuffle
#' @description Randomly reorder a dataframe by row
#' @param data a data frame
#' @return a data frame of the same dimensions with the rows reordered
#' randomly
shuffle <- function(data){
  return(data[sample(nrow(data)),])
}


# wiggle for a single variable (var) and single set of changing values (values)
single_wiggle <- function(data, var, values) {
  tmp.data <- data
  data <- do.call("rbind", replicate(length(values), data, simplify= FALSE))
  data[, var] <- rep(values, each = nrow(tmp.data))
  if(any(class(tmp.data[, var]) %in% c("factor", "ordered"))){
    data[, var] <- superFactor(data[, var],
                               fullLev = levels(tmp.data[, var]))

  }
  return(data)
}

#' @title Assign an observation to different values
#' @name wiggle
#' @description Creates a new data.frame with copies of the original observation,
#' each assigned to a different user-specified value of a variable. Allows the
#' user to look at the effect on predicted values of changing either a single variable
#' or multiple variables.
#' @param data a data frame with one or more observations to be reassigned
#' @param varlist a character vector specifying the name(s) of the variable to adjust
#' @param valueslist a list of vectors with the values to assign to var
#' @return a \code{data.frame} with each row assigned to the one of the new variable combinations.
#' All variable combinations are returned, eg wiggling two variables with 3 and 4 variables
#' respectively will return a new dataset with \code{3 * 4 = 12} observations.
#' @details If the variable specified is a factor, then wiggle will return it
#' as a character.
#' @export
#' @examples
#' data(iris)
#' wiggle(iris[3,], varlist = "Sepal.Width", valueslist = list(c(1, 2, 3, 5)))
#' wiggle(iris[3:5,], "Sepal.Width", valueslist = list(c(1, 2, 3, 5)))
#' wiggle(iris[3,], c("Sepal.Width", "Petal.Length"), list(c(1,2,3,5), c(3,5,6)))
#' wiggle(iris[3:5,], c("Sepal.Width", "Petal.Length"), list(c(1,2,3,5), c(3,5,6)))
wiggle <- function(data, varlist, valueslist) {
  if (length(varlist) != length(valueslist)) stop("varlist and valueslist must be equi-length.")
  n_var <- length(varlist)
  if (n_var == 1) {
    return(single_wiggle(data, varlist[[1]], valueslist[[1]]))
  } else {
    temp <- single_wiggle(data, varlist[[1]], valueslist[[1]])
    temp <- split(temp, f= varlist[[1]])
    varlist <- varlist[-1]; valueslist <- valueslist[-1]
    return(do.call("rbind", lapply(temp, wiggle, varlist= varlist, valueslist= valueslist)))
  }
}

#' @title Identify group level associated with RE quantile
#' @name REquantile
#' @description For a user specified quantile (or quantiles) of the random effect
#' terms in a merMod object. This allows the user to easily identify the observation
#' associated with the nth percentile effect.
#' @param merMod a merMod object with one or more random effect levels
#' @param quantile a numeric vector with values between 0 and 100 for quantiles
#' @param groupFctr a character of the name of the random effect grouping factor to extract
#' quantiles from
#' @param term a character of the random effect to extract for the grouping factor
#' specified. Default is the intercept.
#' @return a vector of the level of the random effect grouping term that corresponds
#' to each quantile
#' @export
#' @examples
#' \donttest{
#' fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
#' REquantile(fm1, quantile = 0.25, groupFctr = "Subject")
#' REquantile(fm1, quantile = 0.25, groupFctr = "Subject", term = "Days")
#' }
REquantile <- function(merMod, quantile, groupFctr, term = "(Intercept)"){
  if(any(quantile > 1 | quantile < 0)){
    stop("Quantiles must be specified on the range 0-1")
  }
  myRE <- ranef(merMod)[[groupFctr]]
  if(is.null(myRE)){
    stop("Random effect group name not found. Please respecify grouping factor.")
  }
  myRE.tmp <- try(myRE[order(myRE[, term]), ,drop = FALSE], silent = TRUE)
  if(!is(myRE.tmp, "data.frame")){
    term1 <- names(myRE)[1]
    myRE.tmp <- try(myRE[order(myRE[, term1]), ,drop = FALSE], silent = TRUE)
    warning(paste0(term, " not found in random effect terms. Returning first term, ",
          term1,", for grouping factor, ", groupFctr, ", instead."))
  }
  myRE <- myRE.tmp; myRE.tmp <- NULL
  nobs <- nrow(myRE)
  if(nobs < 20){
    message("Number of observations < 20, random effect quantiles may not be well-defined.")
  }
  obsnum <- floor(quantile * nobs)
  return(rownames(myRE)[obsnum])
}
jknowles/merTools documentation built on Feb. 11, 2024, 5:07 a.m.