R/up.R

Defines functions varLevel constant.data.frame constant.default constant

Documented in constant constant.data.frame constant.default varLevel

##
##  up + constant: to check whether something is constant
##
#' Generic test for constant
#'
#' @param x vector or data frame
#' @param na.rm (default FALSE) logical value indicating whether `NA` values should be ignored 
#' @param \dots other arguments
#' @export
constant <- function(x,...) UseMethod("constant")

#' @describeIn constant Default methods
#' @export
constant.default <- function(x, na.rm = FALSE,...) {
  if (na.rm) x <- na.omit(x)
  length(unique(x)) <= 1
}
#' @param id vector or list of vectors used to define clusters within which
#'           to check whether `x` is constant
#' @param all if TRUE, report whether constant overall instead of checking
#'        within subgroups created by `id`
#' @describeIn constant method for class 'data.frame'
#' @export
constant.data.frame <- function( x, id  , all = FALSE , ...) {
  ## Description:    (G. Monette, June 13, 2005)
  ## check if values of variables are constant within levels of id
  ## if no id, check each variable
  ## if all == TRUE, report overall instead of within levels of id
  ## Possible improvements:
  ##    allow nested formulas: ~id1/id2 for id and report level of
  ##    each variable
  ##    [see varLevel()]

  ## note that the following code allows id to be given as a name or as
  ## a formula

  if (missing(id)) ret <- sapply(x, constant,...)
  else {
    id <- eval(substitute(id), x, parent.frame())
    if( inherits(id,"formula") ) id <- c( model.frame(id,x) )
    ret <- sapply(x, function(xx) tapply(xx, id, constant, ...))
    if ( all ) ret <- apply( ret, 2, all)
  }
  ret
}

###
### varLevel
###


#' Identify level of aggregation at which a variable in invariant
#'
#' Shows levels of each variable with respect to grouping formula
#' of form ~id or nested ids ~id1/id2
#'
#' @return Level 0 is a constant for the whole data frame,
#' Level <= 1 implies the variable is constant within levels of id1,
#' Level <= 2 implies the variable is constant within levels of id2,
#' ... etc.
#'
#' NOTE: NA counts as a distinct value
#' @param x data frame
#' @param form formula (evaluated in `x`) to identify clusters
#' @param \dots extra arguments to \code{\link{constant}} function
#' @export
varLevel <- function(x, form, ...) {
  ## Description:    (G. Monette, June 13, 2005)
  sel <- model.frame( form, x )
  z <- list()
  idx <- rep('',nrow(x))
  z[[1]] <- constant(x,...)
  for ( ii in 1:ncol(sel)) {
    idx <- paste(idx, as.character(sel[[ii]]), sep = ";")
    z[[ii+1]] <- constant( x, idx, all = TRUE,...)
  }
  # print(z)
  ret <- do.call("rbind", z)
  # print(ret)
  ret <- length(z) - apply( ret*1, 2 , sum)
  ret
}

###
### up
###

#' Create a data frame at a higher level of aggregation
#'
#' Produce a higher level data set with one row per cluster. The data set can
#' contain only variables that are invariant in each cluster or it can also
#' include summaries (mean or modes) of variables that vary by cluster. Adapted
#' from \code{gsummary} in the \code{nlme} package.
#'
#' \code{up} was created from \code{nlme::gsummary} and modified to make it
#' easier to use and to make an equivalent of \code{gsummary} available when
#' using \code{lme4}.
#'
#' @param object a data frame to be aggregated.
#' @param form a one-sided formula identifying the variable(s) in \code{object}
#'        that identifies clusters. e.g. ~ school/Sex to get a summary within each Sex
#'        of each school.
#' @param agg (NEW: Aug 2016) a one-sided formula identifying variables to be aggregated,
#'        i.e. variables that vary within clusters and that need to be aggregated 
#'        (within-cluster mean for numeric variables and within-cluster incidence
#'        proportions for factors). Default: NULL 
#' @param sep.agg (NEW: Aug 2016) separator between factor names and factor
#'        level for within-cluster incidence proportions. Default: '_'        
#' @param freq (NEW: Nov 2018) a one-sided formula identifying character variables to
#'        be represented according the frequencies of their levels,
#'        i.e. variables that vary withing cluster and that need to be aggregated 
#'        (within-cluster sum for numeric variables and within-cluster frequencies
#'        for factors). Default: NULL 
#' @param sep.freq (NEW: Nove 2018) separator between factor names and factor
#'        level for within-cluster incidence frequencies. Default: '_'        
#' @param all if TRUE, include summaries of variables that vary within
#'        clusters, otherwise keep only cluster-invariant variables and variables
#'        listed in 'agg'
#' @param sep separator to form cluster names combining more than one
#' clustering variables.  If the separator leads to the same name for distinct
#' clusters (e.g. if var1 has levels 'a' and 'a/b' and var2 has levels 'b/c'
#' and 'c') the function produces an error and a different separator should be
#' used.
#' @param FUN function to be used for summaries.
#' @param omitGroupingFactor kept for compatibility with \code{gsummary}
#' @param groups kept for compatibility with \code{gsummary}
#' @param invariantsOnly kept for compatibility with \code{gsummary}
#' @param \dots additional arguments to \code{tapply} when summarizing
#' numerical variables. e.g. \code{na.rm = TRUE}
#' @return a data frame with one row per combination of values of the variable(s) in \code{form}.
#'         The number of rows for each combination is retuned in a variable 'Freq'.
#'         Frequencies (proportions) of values for each variable specified
#'         by '~freq' ('~agg') are also included. 
#' @examples
#'     data(hs)
#'     dim( hs )
#'     hsu <- up( hs, ~ school )
#'     dim( hsu )
#'
#'     # to also get cluster means of cluster-varying numeric variables and modes of factors:
#'
#'     hsa <- up( hs, ~ school , all = TRUE )
#'
#'     # to get summary proportions of cluster varying factors:
#'
#'     up( cbind( hs, model.matrix( ~ Sex -1 , hs)), ~ school, all = T)
#'
#'     # Similar using 'agg'
#'     
#'     up(hs, ~school, agg = ~ Sex)
#'     
#'     ## To plot a summary between-cluster panel along with within-cluster panels:
#'
#'     hsu <- up( hs, ~ school, all = TRUE)
#'     hsu$school <- ' between'  # space to make it come lexicographically before cluster names
#'
#'     require( lattice )
#'     xyplot( mathach ~ ses | school, rbind(hs,hsu),
#'         panel = function( x, y, ...) {
#'             panel.xyplot( x, y, ...)
#'             panel.lmline( x, y, ...)
#'         } )
#'         
#'     ## To create a data frame grouped by predictors with frequency variables for each
#'     ## level of a response variable for analysis with a binomial glm with goodness of fit
#'     ## based on the deviance
#'     
#'     hsa <- up( hs, ~school, freq = ~ Sex)
#'     head(hsa)
#'     fit <- glm(cbind(Sex_Female, Sex_Male) ~ Sector, hsa, family = binomial)
#'     summary(fit) # the residual deviance provides a goodness of fit test
#'     
#' @author adapted by G. Monette from gsummary in 'nlme' by Bates & Pinheiro
#' @export
up <- function ( object, form = formula(object),
           agg = NULL, sep.agg = "_",
           freq = NULL, sep.freq = "_",
           all = FALSE, sep = "/",
           na.rm = TRUE,
           FUN = function(x) mean(x, na.rm = na.rm),
           omitGroupingFactor = FALSE,
           groups, invariantsOnly = !all , ...)
{
  if (!inherits(object, "data.frame")) {
    stop("Object must inherit from data.frame")
  }
  sel.mf <- model.frame( form , object , na.action = na.include )
  narows <- apply(sel.mf,1,function(x) any(is.na(x)))
  hasFreq <- !is.null(object[['Freq']])
  if(hasFreq & all) warning("variable 'Freq' is ignored in computing aggregate values")
  if(any(narows)) {
    warning("Rows with NAs in grouping variable(s) are omitted")
    sel.mf <- droplevels(sel.mf[!narows,,drop=FALSE])
    object <- object[!narows,,drop=FALSE]
  }
  if ( ncol(sel.mf) > 1) {
    sel <- apply( sel.mf, 1 , paste, collapse = sep)
    groups <- as.factor(sel)
    # Check if sep works to create unique group combinations
    sel2 <- apply(sel.mf,1, paste, collapse = as.character(sample(1000:9999,1)))
    if ( length( unique(sel)) != length( unique(sel2))) {
      stop( 'distinct grouping combinations have the same name: change the "sep" argument')
    }
  } else {
    groups <- as.factor(sel.mf[[1]])
  }

  # Create Freq variable
  if(!hasFreq) object$Freq <- 1
  object$Freq <- capply(object$Freq, groups, sum)

    if(!is.null(agg)) {
    if(hasFreq) warning("Frequencies in 'Freq' are ignored when using the argument 'agg'")
    agg.mf <- model.frame(agg, object, na.action = na.include)
    
    #ret <- object
    for (i in seq_along(agg.mf)) {
      x <- agg.mf[[i]]
      if(is.character(x)) x <- factor(x)
      if(is.factor(x)) {
        mat <- cvar(x, sel.mf, all = T, na.rm = na.rm)
        colnames(mat) <- paste0(names(agg.mf[i]),sep.agg,colnames(mat))
      }
      else {
        mat <- cvar(x, sel.mf, na.rm = na.rm, ...)
        mat <- data.frame(x=mat)
        names(mat) <- names(agg.mf[i])
      }
      object <- cbind(object, mat)
    }
  }
  if(!is.null(freq)) {
    if(hasFreq) warning("Frequencies in 'Freq' are ignored when using the argument 'freq'")
    freq.mf <- model.frame(freq, object, na.action = na.include)
    for (i in seq_along(freq.mf)) {
      x <- freq.mf[[i]]
      if(is.character(x)) x <- factor(x)
      # GM 2020_01_04 x <- freq.mf[[i]]
      if(is.factor(x)) {
        mat <- cvar(x, sel.mf, all = T, FUN = sum, na.rm = na.rm)
        colnames(mat) <- paste0(names(freq.mf[i]),sep.freq,colnames(mat))
      }
      else {
        mat <- cvar(x, sel.mf, FUN = sum, na.rm = na.rm, ...)
        mat <- data.frame(x=mat)
        names(mat) <- names(agg.mf[i])
      }
      object <- cbind(object, mat)
    }
  }
  gunique <- unique(groups)
  firstInGroup <- match(gunique, groups)
  asFirst <- firstInGroup[match(groups, gunique)]
  value <- as.data.frame(object[firstInGroup, , drop = FALSE])
  row.names(value) <- as.character(gunique)
  value <- value[as.character(sort(gunique)), , drop = FALSE]
  varying <- unlist(lapply(object, function(column, frst) {
    aux <- column
    if( is.matrix( aux))aux[] <- as.character( aux )
    else aux <- as.character(aux)
    if ( is.matrix( aux )) any(!identical( aux, aux[frst,]))
    else any(!identical(aux, aux[frst]))
  }, frst = asFirst))
  if (any(varying) && (!invariantsOnly)) {
    Mode <- function(x) {
      aux <- table(x)
      names(aux)[match(max(aux), aux)]
    }
    if (data.class(FUN) == "function") {
      FUN <- list(numeric = FUN, ordered = Mode, factor = Mode, logical = FUN)
    }
    else {
      if (!(is.list(FUN) && all(sapply(FUN, data.class) ==
                                "function"))) {
        stop("FUN can only be a function or a list of functions")
      }
      auxFUN <- list(numeric = mean, ordered = Mode, factor = Mode, logical = mean)
      aux <- names(auxFUN)[is.na(match(names(auxFUN), names(FUN)))]
      if (length(aux) > 0)
        FUN[aux] <- auxFUN[aux]
    }
    for (nm in names(object)[varying]) {
      dClass <- if (is.ordered(object[[nm]]))
        "ordered"
      else if (is.factor(object[[nm]]))
        "factor"
      else mode(object[[nm]])
      if (dClass == "numeric") {
        if( is.matrix ( object[[nm]])){
          zmat <- object[[nm]]
          ret <- list()
          for ( jj in seq_len(ncol(zmat))) {
            ret[[jj]] <- as.vector( tapply( zmat[,jj],
                                            groups, FUN[['numeric']],...))
          }
          value[[nm]] <- do.call(cbind, ret)
          
        } else {
          value[[nm]] <- as.vector(tapply(object[[nm]],
                                          groups, FUN[["numeric"]], ...))
        }
      }
      else {
        value[[nm]] <- as.vector(tapply(object[[nm]],
                                        groups, FUN[[dClass]]))
      }
    }
  }
  else {
    value <- value[, !varying, drop = FALSE]
  }
  if (omitGroupingFactor) {
    if (is.null(form)) {
      stop("Cannot omit grouping factor without \"form\"")
    }
    grpForm <- getGroupsFormula(form, asList = TRUE)
    if (missing(level))
      level <- length(grpForm)
    grpNames <- names(grpForm)[level]
    whichKeep <- is.na(match(names(value), grpNames))
    if (any(whichKeep)) {
      value <- value[, whichKeep, drop = FALSE]
    }
    else {
      return(NULL)
    }
  }
  value
}

#' na.include action
#'
#' From the Hmisc package, author: Frank Harrell
#'
#' @param obj a data.frame whose factors will be redefined so exclude = NULL
#' @export
na.include <-
function (obj)
{
  if (inherits(obj, "data.frame"))
    for (i in seq(along = obj)) obj[[i]] <- na.include(obj[[i]])
    else {
      if (length(levels(obj)) && any(is.na(obj)))
        obj <- factor(obj, exclude = NULL)
    }
    obj
}
#' Apply a function to clusters of rows in a data frame
#' 
#' Apply a function to clusters of rows in a data frame and return
#' the result so it is conformable with the data frame created by
#' 'up' applied to the same data frame and same clustering formula
#' 
#' @param object a data frame as source for an aggregated result
#' @param form a one-sided formula identifying the variable(s) in \code{object}
#' that identifies clusters. e.g. ~ school/Sex to get a summary within each Sex
#' of each school
#' @param FUN a function to be applied to each data frame consisting of a 
#'        cluster of rows of 'object'. The most common choice is \code{\link{with}} so that
#'        '...' can be an expression using variable names in 'object'. 
#' @param ... other arguments to FUN, frequently when FUN is 'with', an expression 
#'        using variable names in 'object'        
#' @examples
#' zd <- data.frame(a=c('a','a','b','b','c','c','c'),
#'       b = c("B","B","A","B","C","D","D"), x = 1:7, y = 11:17)
#' zd$n <- capply(zd$x, zd[c('a','b')], length)
#' zdu <- up(zd, ~a, agg = ~b)
#' zdu
#' zdu$p <- up_apply(zd, ~ a, with, sum(x)/sum(y))
#' zdu
#' @export
up_apply <-
  function ( object, form, FUN , ...,  sep = '/') {
    sel.mf <- model.frame( form , object , na.action = na.include )
    narows <- apply(sel.mf,1,function(x) any(is.na(x)))
    if(any(narows)) {
      warning("Rows with NAs in grouping variable(s) are omitted")
      sel.mf <- droplevels(sel.mf[!narows,,drop=FALSE])
      object <- object[!narows,,drop=FALSE]
    }
    if ( ncol(sel.mf) > 1) {
      sel <- apply( sel.mf, 1 , paste, collapse = sep)
      groups <- as.factor(sel)
      # Check if sep works to create unique group combinations
      sel2 <- apply(sel.mf,1, paste, collapse = as.character(sample(1000:9999,1)))
      if ( length( unique(sel)) != length( unique(sel2))) {
        stop( 'distinct grouping combinations have the same name: change the "sep" argument')
      }
    } else {
      groups <- as.factor(sel.mf[[1]])
    }
    
    FUN <- match.fun(FUN)
    # if (inherits(by,'formula')) by <- model.frame( by , x , na.action = na.include)
    # if (is.character(by)) by <- factor(by)
    # if (is.factor(by)) by <- as.numeric(by)
    ret <- sapply ( split ( object , groups ), FUN, ...)
    if(is.null(dim(ret))) ret else t(ret)
  }
gmonette/spida2 documentation built on Aug. 20, 2023, 7:21 p.m.