R/paired.R
In arsenal: An Arsenal of 'R' Functions for Large-Scale Statistical Summaries

Documented in paired

#' Summary Statistics of a Set of Independent Variables Paired Across Two Timepoints
#'
#' Summarize one or more variables (x) by a paired time variable (y). Variables
#'   on the right side of the formula, i.e. independent variables, are summarized by the
#'   two time points on the left of the formula. Optionally, an appropriate test is performed to test the
#'   distribution of the independent variables across the time points.
#' @param formula an object of class \code{\link{formula}} of the form \code{time ~ var1 + ...}.
#'   See "Details" for more information.
#' @inheritParams tableby
#' @param id The vector giving IDs to match up data for the same subject across two timepoints.
#' @param na.action a function which indicates what should happen when the data contain \code{NA}s.
#'   The default is \code{na.paired("in.both")}. See \code{\link{na.paired}} for more details
#' @param control control parameters to handle optional settings within \code{paired}.
#'   Two aspects of \code{paired} are controlled with these: test options of RHS variables and x variable summaries.
#'   Arguments for \code{paired.control} can be passed to \code{paired} via the \code{...} argument,
#'   but if a control object and \code{...} arguments are both supplied,
#'   the latter are used. See \code{\link{paired.control}} for more details.
#' @param ... additional arguments to be passed to internal \code{paired} functions or \code{\link{paired.control}}.
#' @return An object with class \code{c("paired", "tableby", "arsenal_table")}
#' @details
#' Do note that this function piggybacks off of \code{\link{tableby}} quite heavily, so there is no
#' \code{summary.paired} function (for instance).
#'
#' These tests are accepted:
#' \itemize{
#'   \item{
#'     \code{paired.t}: a paired \code{\link[stats:t.test]{t-test}}.
#'   }
#'   \item{
#'     \code{mcnemar}: \link[stats:mcnemar.test]{McNemar's test}.
#'   }
#'   \item{
#'     \code{signed.rank}: a \link[stats:wilcox.test]{signed rank test}.
#'   }
#'   \item{
#'     \code{sign.test}: a sign test.
#'   }
#'   \item{
#'     \code{notest}: no test is performed.
#'   }
#' }
#' @seealso \code{\link{arsenal_table}}, \code{\link{paired.control}}, \code{\link{tableby}}, \code{\link{formulize}}, \code{\link{selectall}}
#' @author Jason Sinnwell, Beth Atkinson, Ryan Lennon, and Ethan Heinzen
#' @name paired
NULL
#> NULL

#' @rdname paired
#' @export
paired <- function(formula, data, id, na.action, subset=NULL, strata, control = NULL, ...) {
  control <- c(list(...), control)
  control <- do.call("paired.control", control[!duplicated(names(control))])

  Call <- match.call()

  ## Tell user if they passed an argument that was not expected, either here or in control
  expectArgs <- c("formula", "data", "na.action", "subset", "strata", "control", names(control), "id")
  match.idx <- match(names(Call)[-1], expectArgs)
  if(anyNA(match.idx)) warning("unused arguments: ", paste(names(Call)[1+which(is.na(match.idx))], collapse=", "), "\n")

  indx <- match(c("formula", "data", "subset", "na.action", "id", "strata"), names(Call), nomatch = 0)
  if(indx[1] == 0) stop("A formula argument is required")
  if(length(formula) != 3) stop("'formula' must be two-sided.")
  if(indx[5] == 0) stop("An id argument is required")

  special <- c("paired.t", "mcnemar", "signed.rank", "sign.test", "notest")

  out.tables <- list()
  formula.list <- as_list_formula(formula)
  for(FORM in formula.list)
  {
    temp.call <- Call[c(1, indx)]
    temp.call[[1]] <- as.name("model.frame")

    if(is.null(temp.call$na.action)) temp.call$na.action <- na.paired("in.both")

    if (missing(data)) {
      temp.call$formula <- stats::terms(FORM, special)
    } else {
      # instead of call("keep.labels", ...), which breaks when arsenal isn't loaded (Can't find "keep.labels")
      temp.call$data <- as.call(list(keep.labels, temp.call$data))
      temp.call$formula <- stats::terms(FORM, special, data = keep.labels(data))
    }

    ##  set up new environment for
    ## if specials, assign dummy versions of those functions
    tabenv <- new.env(parent = environment(formula))
    for(sp in special)
    {
      if(!is.null(attr(temp.call$formula, "specials")[[sp]])) assign(sp, inline_tableby_stat_test, envir = tabenv)
    }

    ## set tabenv as environment in which to evalulate formula
    environment(temp.call$formula) <- tabenv

    ## evaluate the formula with env set for it
    modeldf <- loosen.labels(eval.parent(temp.call))
    if(nrow(modeldf) == 0) stop("No (non-missing) observations")

    Terms <- stats::terms(modeldf)

    ###### Check for strata ######

    if(hasStrata <- "(strata)" %in% colnames(modeldf))
    {
      strata.col <- modeldf[["(strata)"]]
      strataTerm <- deparse(Call$strata)
      if(is.null(strataLabel <- attr(strata.col, "label"))) strataLabel <- strataTerm
      if(is.factor(strata.col))
      {
        strata.col <- droplevels(strata.col)
        strata.levels <- levels(strata.col)
      } else strata.levels <- sort(unique(strata.col))

      modeldf[["(strata)"]] <- NULL
    } else
    {
      strata.col <- rep("", nrow(modeldf))
      strataTerm <- strataLabel <- strata.levels <- ""
    }

    ###### Check for by-variable ######

    if(attributes(Terms)$response != 0)
    {
      by.col <- modeldf[[1]]
      termBy <- names(modeldf)[1]
      if(is.null(labelBy <- attr(by.col, "label"))) labelBy <- termBy
      if(is.factor(by.col)) {
        by.col <- droplevels(by.col)
        by.levels <- levels(by.col)
      } else by.levels <- sort(unique(by.col))
      by.col <- as.character(by.col)
      by.levels <- as.character(by.levels)

      if(any(by.levels == ""))
      {
        warning('Empty string detected in by-variable is not allowed; converting to " ".')
        by.col[by.col == ""] <- " "
        by.levels <- unique(replace(by.levels, by.levels == "", " "))
      }
      modeldf[[1]] <- NULL
    }

    if(length(by.levels) != 2) stop("Please specify exactly 2 time points")
    ids <- modeldf$`(id)`
    tab <- table(ids, by.col)
    if(sum(tab > 1) > 0)
      stop("At least one person has multiple observations for at least one time point")
    if(sum(rowSums(tab) == 2) == 0)
      stop("No one appears to have data on both time points")

    ids.both <- intersect(ids[by.col == by.levels[1]], ids[by.col == by.levels[2]])

    strata.col1 <- strata.col[by.col == by.levels[1]]
    TP1 <- modeldf[by.col == by.levels[1], , drop = FALSE]
    strata.col2 <- strata.col[by.col == by.levels[2]]
    TP2 <- modeldf[by.col == by.levels[2], , drop = FALSE]

    cn <- colnames(modeldf)
    cn <- cn[cn != "(id)"]

    idx1 <- match(ids.both, TP1$`(id)`, nomatch = 0)
    idx2 <- match(ids.both, TP2$`(id)`, nomatch = 0)
    strata.col1 <- strata.col1[idx1]
    strata.col2 <- strata.col2[idx2]
    TP1 <- TP1[idx1, cn, drop = FALSE]
    TP2 <- TP2[idx2, cn, drop = FALSE]
    modeldf[["(id)"]] <- NULL

    if(is.null(difflab <- control$stats.labels$difference)) difflab <- "Difference"
    ystats <- c(table(factor(by.col, levels=by.levels), exclude=NA), Difference=length(ids.both))
    names(ystats)[names(ystats) == "Difference"] <- difflab
    yList <- list(stats=ystats, label=labelBy, term=termBy)


    ## find which columnss of modeldf have specials assigned to known specials
    specialIndices <- unlist(attr(Terms, "specials")) - attributes(Terms)$response
    specialTests <- rep("", ncol(modeldf))
    ## If a special shows up multiple times, the unlist assigned a number at the end. Strip it off.
    ## This disallows functions with a number at the end
    specialTests[specialIndices] <- gsub("\\d+$", "", names(specialIndices))

    xTerms <- Map(modeldf, names(modeldf), f = function(col, nm) {
      if(is.null(nameEff <- attr(col, "name"))) nameEff <- nm
      if(is.null(labelEff <- attr(col, "label"))) labelEff <- nameEff
      if(is.null(termEff <- attr(col, "term"))) termEff <- nm
      list(variable=nameEff, label=labelEff, term=termEff, term.orig = nm)
    })
    names(xTerms) <- vapply(xTerms, "[[", NA_character_, "variable")

    control.list <- lapply(modeldf, attr, "control.list")
    names(control.list) <- names(xTerms)

    strataList <- vector("list", length(strata.levels))
    if(hasStrata) names(strataList) <- paste0("(", strataTerm, ") == ", strata.levels)

    for(strat in strata.levels)
    {
      ## list of x variables
      xList <- vector("list", ncol(modeldf))
      names(xList) <- names(xTerms)

      bycol <- by.col[strata.col == strat]

      for(eff in seq_along(modeldf)) {

        currcol <- modeldf[[eff]]
        TP1.eff <- TP1[[eff]]
        TP2.eff <- TP2[[eff]]

        ############################################################

        if(is.ordered(currcol) || is.logical(currcol) || is.factor(currcol) || is.character(currcol)) {
          ######## ordinal or categorical variable (character or factor) ###############

          ## convert logicals and characters to factor
          if(is.character(currcol))
          {
            lvl <- sort(unique(currcol[!is.na(currcol)]))
            currcol <- factor(currcol, levels = lvl)
            TP1.eff <- factor(TP1.eff, levels = lvl)
            TP2.eff <- factor(TP2.eff, levels = lvl)
          } else if(is.logical(currcol))
          {
            lvl <- c(FALSE, TRUE)
            currcol <- factor(currcol, levels=lvl)
            TP1.eff <- factor(TP1.eff, levels = lvl)
            TP2.eff <- factor(TP2.eff, levels = lvl)
          }


          ## to make sure all levels of cat variable are counted, need to pass values along
          xlevels <- levels(currcol)

          if(length(xlevels) == 0) stop(paste0("Zero-length levels found for ", names(xTerms)[eff]))

          ## get stats funs from either formula  or control
          if(is.ordered(currcol))
          {
            currstats <-  control$ordered.stats
            currtest <- control$ordered.test
            vartype <- "ordinal"
          } else
          {
            currstats <- control$cat.stats
            currtest <- control$cat.test
            vartype <- "categorical"
          }

        } else if(is.Date(currcol)) {
          ######## Date variable ###############
          xlevels <- sort(unique(currcol))

          ## get stats funs from either formula  or control
          currstats <- control$date.stats
          currtest <- control$date.test
          vartype <- "Date"

        } else if(is.selectall(currcol)) {
          xlevels <- colnames(currcol)

          currstats <- control$selectall.stats
          currtest <- control$selectall.test
          vartype <- "selectall"

        } else if(inherits(currcol, "Surv")) {
          ##### Survival (time to event) #######
          stop("Sorry, survival objects don't work in this function.")

        } else if(is.numeric(currcol) || inherits(currcol, "difftime")) {
          ######## Continuous variable (numeric) ###############

          ## for difftime, convert to numeric
          if(inherits(currcol, "difftime")) currcol <- as.numeric(currcol)
          xlevels <- sort(unique(currcol))

          ## if no missings, and control says not to show missings,
          ## remove Nmiss stat fun
          currstats <- control$numeric.stats
          currtest <- control$numeric.test
          vartype <- "numeric"
        } else stop("Variable ", names(xTerms), " has unknown class(es): ", paste0(class(currcol)[-1], collapse = ", "))
        ############################################################

        ## if no missings, and control says not to show missings,
        ## remove Nmiss stat fun
        if(!is.null(attrstats <- attr(modeldf[[eff]], "stats"))) currstats <- attrstats

        # now finally subset
        currcol <- currcol[strata.col == strat]
        TP1.eff <- TP1.eff[strata.col1 == strat]
        TP2.eff <- TP2.eff[strata.col2 == strat]
        if(!anyNA(currcol) && "Nmiss" %in% currstats) currstats <- currstats[currstats != "Nmiss"]
        statList <- list()
        for(statfun2 in currstats) {
          statfun <- get_stat_function(statfun2)
          tmp <- get0(statfun, mode = "function")
          statfun <- if(is.null(tmp)) get(statfun, parent.frame(), mode = "function") else tmp

          bystatlist <- list()
          if(statfun2 %in% c("countrowpct", "countcellpct", "rowbinomCI", "Npct"))
          {
            bystatlist <- do.call(statfun, list(currcol, levels = xlevels,
                                                by = by.col, by.levels = by.levels, na.rm = TRUE))
            bystatlist$Total <- NULL
          } else if(statfun2 == "Nsigntest") {
            bystatlist <- as.countpct(NA_real_)
          } else
          {
            for(bylev in by.levels) {
              idx <- bycol == bylev
              bystatlist[[bylev]] <- do.call(statfun, list(currcol[idx], levels=xlevels, na.rm=TRUE, conf.level = control$conf.level))
            }
          }

          if(is.selectall(currcol))
          {
            tmp <- as.selectall(TP1.eff != TP2.eff)
            bystatlist[[difflab]] <- do.call(statfun, list(tmp, levels=xlevels, na.rm=TRUE, conf.level = control$conf.level))
          } else if(statfun2 %in% c("countpct", "countrowpct", "countcellpct"))
          {
            # countrowpct to get the right percentages
            bystatlist[[difflab]] <- countrowpct(TP1.eff, levels = xlevels, by = TP1.eff == TP2.eff,
                                                 by.levels = c(TRUE, FALSE), na.rm = TRUE)[[2]]
          } else if(statfun2 == "count")
          {
            # this doesn't have percentages
            bystatlist[[difflab]] <- count(replace(TP1.eff, TP1.eff == TP2.eff, NA), levels = xlevels, na.rm = TRUE)
          } else if(statfun2 %in% c("binomCI", "rowbinomCI"))
          {
            bystatlist[[difflab]] <- rowbinomCI(TP1.eff, levels = xlevels, by = TP1.eff == TP2.eff,
                                                by.levels = c(TRUE, FALSE), na.rm = TRUE, conf.level = control$conf.level)[[2]]
          } else if(statfun2 == "Npct")
          {
            # get the right percentages
            bystatlist[[difflab]] <- Npct(TP1.eff, levels = xlevels, by = TP1.eff == TP2.eff, by.levels = c(TRUE, FALSE), na.rm = TRUE)[[2]]
          } else
          {
            bystatlist[[difflab]] <- do.call(statfun, list(as.numeric(TP2.eff) - as.numeric(TP1.eff),
                                                           levels=xlevels, na.rm=TRUE, conf.level = control$conf.level))
          }
          statList[[statfun2]] <- bystatlist
        }
        if(length(statList) == 0) stop(paste0("Nothing to show for variable '", names(xTerms)[eff], "'"))

        currtest <- if(nchar(specialTests[eff]) > 0) specialTests[eff] else currtest
        testout <- if(control$test) {
          eval(call(currtest, TP1.eff, TP2.eff, mcnemar.correct=control$mcnemar.correct,
                    signed.rank.exact = control$signed.rank.exact, signed.rank.correct = control$signed.rank.correct,
                    test.always=control$test.always))
        } else notest()

        xList[[eff]] <- list(stats=statList, test=testout, type=vartype)
      }

      strataList[[if(!hasStrata) 1 else paste0("(", strataTerm, ") == ", strat)]] <- xList
    }
    out.tables[[termBy]] <- list(formula = FORM, y = yList, strata = list(term = strataTerm, values = strata.levels, label = strataLabel, hasStrata = hasStrata),
                                 x = xTerms, tables = strataList, control.list = control.list, hasWeights = FALSE)
  }
  structure(list(Call = Call, control = control, tables = out.tables), class = c("paired", "tableby", "arsenal_table"))
}