R/tabl.R
In labelr: Label Data Frames, Variables, and Values

Documented in tabl

#' Construct Value Label-Friendly Frequency Tables
#'
#' @description
#' `tabl` calculates raw or weighted frequency counts (or proportions) over
#' arbitrary categorical values (including integer values), which may be
#' expressed in terms of raw variable values or labelr label values.
#'
#' @details
#' This function creates a labelr-friendly data.frame representation of
#' multi-variable tabular data, where either value labels or values can be
#' displayed (use of value labels is the default), and where various convenience
#' options are provided, such as using frequency weights, using proportions
#' instead of counts, rounding those percentages, automatically expressing
#' many-valued, non-value-labeled numerical variables in terms of quantile
#' category groups, or pivoting / casting one of the categorical variables'
#' levels (labels) to serve as columns in a cross-tab-like table.
#'
#' @param data a data.frame.
#' @param vars a quoted character vector of variable names of variables you wish
#' to include in defining category groups to tabulate over in the table. If NULL
#' `tabl` will attempt to construct a table over all combinations of all
#' non-decimal-having variables in the data.frame that do not exceed the
#' max.unique.vals threshold. Additionally, note the effects of the qtiles
#' argument.
#' @param labs.on if TRUE (the default), then value labels -- rather than the
#' raw variable values -- will be displayed in the returned table for any
#' value-labeled variables. Variables need not be value-labeled: This command
#' (with this option set to TRUE or FALSE) will work even when no variables are
#' value-labeled.
#' @param qtiles if not NULL, must be a 1L integer between 2 and 100 indicating
#' the number of quantile categories to employ in temporarily (for purposes of
#' tabulation) auto-value-labeling numeric columns that exceed the max.unique.vals
#' threshold. If NULL, no such auto-value-labeling will take place. Note: When
#' labs.on = TRUE, any pre-existing variable value labels will be used in favor
#' of the quantile value labels generated by this argument. By default,
#' qtiles = 4, and the automatically generated quantile category levels will be
#' labeled as "q025" (i.e., first quartile), "q050", "q075", and "q100".
#' @param prop.digits if non-NULL, cell percentages (proportions) will be
#' returned instead of frequency counts, and these will be rounded to the digit
#' specified (e.g., prop.digits = 3 means a value of 0.157 would be returned for
#' a cell that accounted for 8 observations if the total number of observations
#' were 51). If NULL (the default), frequency counts will be returned.
#' @param wt an optional vector that includes cell counts or some other
#' idiosyncratic "importance" weight. If NULL, no weighting will be employed.
#' @param div.by Divide the returned counts by a constant for scaling purposes.
#' This may be a number (e.g., div.by = 10 to divide by 10) or a character that
#' follows the convention "number followed by 'K', 'M', or 'B'", where, e.g.,
#' "10K" is translated as 10000, "1B" is translated as 1000000000, etc.
#' @param max.unique.vals Integer to specify the maximum number of unique values
#' of a variable that may be observed for that variable to be included in
#' tabulations. Note that labelr sets a hard ceiling of 5000 on the total number
#' of unique value labels that any variable is permitted to have under any
#' circumstance, as labelr is primarily intended for interactive use with
#' moderately-sized data.frames. See the qtiles argument for an approach to
#' incorporating many-valued numeric variables that exceed the max.unique.vals
#' threshold.
#' @param sort.freq By default, returned table rows are sorted in descending
#' order of cell frequency (most frequent categories/combinations first). If set
#' to FALSE, table rows will be sorted by the the distinct values of the vars
#' (in the order vars are specified).
#' @param zero.rm If TRUE, zero-frequency vars categories/combinations (i.e.,
#' those not observed in the data.frame) will be filtered from the table. For
#' tables that would produce more than 10000 rows, this is done automatically.
#' @param irreg.rm If TRUE, tabulations exclude cases where any applicable
#' variable (see vars argument) features any of the following "irregular"
#' values: NA, NaN, Inf, -Inf, or any non-case-sensitive variation on "NA",
#' "NAN", "INF", or "-INF." If FALSE, all "irregular" values (as just defined)
#' are assigned to a "catch-all" category of NA that is featured in the
#' returned table (if/where present).
#' @param wide.col If non-NULL, this is the quoted name of a single column / var
#' of supplied data.frame whose distinct values (category levels) you wish to be
#' columns of the returned table. For example, if you are interested in a
#' cross-tab of "edu" (highest level of education) and "race" (a race/ethnicity
#' variable), you could supply vars= c("edu") and wide.col = "race", and the
#' different racial-ethnic group categories would appear as distinct columns,
#' with "edu" category levels appearing as distinct rows, and cell values
#' representing the cross-tabbed cell "edu" level frequencies for the respective
#' "race" groups (see examples). You may supply one wide.col at most.
#'
#' @return a data.frame.
#' @importFrom stats reshape aggregate
#' @export
#'
#' @examples
#' # assign mtcars to new data.frame df
#' df <- mtcars
#'
#' # add na values to make things interesting
#' df[1, 1:11] <- NA
#' rownames(df)[1] <- "Missing Car"
#'
#' # add value labels
#' df <- add_val_labs(
#'   data = df,
#'   vars = "am",
#'   vals = c(0, 1),
#'   labs = c("automatic", "manual")
#' )
#'
#' df <- add_val_labs(
#'   data = df,
#'   vars = "carb",
#'   vals = c(1, 2, 3, 4, 6, 8),
#'   labs = c(
#'     "1-carb", "2-carbs",
#'     "3-carbs", "4-carbs",
#'     "6-carbs", "8-carbs"
#'   )
#' )
#'
#' # var arg can be unquoted if using add_val1()
#' # note that this is not add_val_labs(); add_val1() has "var" arg instead of "vars
#' df <- add_val1(
#'   data = df,
#'   var = cyl, # note, "var," not "vars" arg
#'   vals = c(4, 6, 8),
#'   labs = c(
#'     "four-cyl",
#'     "six-cyl",
#'     "eight-cyl"
#'   )
#' )
#'
#' df <- add_val_labs(
#'   data = df,
#'   vars = "gear",
#'   vals = 3:5,
#'   labs = c(
#'     "3-speed",
#'     "4-speed",
#'     "5-speed"
#'   )
#' )
#'
#'
#' # lookup mapping
#' get_val_labs(df)
#'
#' # introduce other "irregular" values
#' df$am[1] <- NA
#'
#' df[2, "am"] <- NaN
#' df[3, "am"] <- -Inf
#' df[5, "cyl"] <- "NAN"
#'
#' # take a look
#' head(df)
#'
#' # demonstrate tabl() frequency tabulation function
#'
#' # this is the "first call" that will be referenced repeatedly below
#' # labels on, sort by variable values, suppress/exclude NA/irregular values
#' # ...return counts
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   labs.on = TRUE, # use variable value labels
#'   sort.freq = FALSE, # sort by vars values (not frequencies)
#'   irreg.rm = TRUE, # NAs and the like are suppressed
#'   prop.digits = NULL
#' ) # return counts, not proportions
#'
#' # same as "first call", except now value labels are off
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   labs.on = FALSE, # use variable values
#'   sort.freq = FALSE, # sort by vars values (not frequencies)
#'   irreg.rm = TRUE, # NAs and the like are suppressed
#'   prop.digits = NULL
#' ) # return counts, not proportions
#'
#' # same as "first call," except now proportions instead of counts
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   labs.on = TRUE, # use variable value labels
#'   sort.freq = FALSE, # sort by vars values (not frequencies)
#'   irreg.rm = TRUE, # NAs and the like are suppressed
#'   prop.digits = 3
#' ) # return proportions, rounded to 3rd decimal
#'
#' # same as "first call," except now sort by frequency counts
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   labs.on = TRUE, # use variable value labels
#'   sort.freq = TRUE, # sort in order of descending frequency
#'   irreg.rm = TRUE, # NAs and the like are suppressed
#'   prop.digits = NULL
#' ) # return proportions, rounded to 3rd decimal
#'
#' # same as "first call," except now use weights
#' set.seed(2944) # for reproducibility
#' df$freqwt <- sample(10:50, nrow(df), replace = TRUE) # create (fake) freq wts
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   wt = "freqwt", # use frequency weights
#'   labs.on = TRUE, # use variable value labels
#'   sort.freq = FALSE, # sort by vars values (not frequencies)
#'   irreg.rm = FALSE, # NAs and the like are included/shown
#'   prop.digits = NULL
#' ) # return counts, not proportions
#'
#' df$freqwt <- NULL # we don't need this anymore
#'
#' # now, with extremely large weights to illustrate div.by
#' set.seed(428441) # for reproducibility
#' df$freqwt <- sample(1000000:10000000, nrow(df), replace = TRUE) # large freq wts
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   wt = "freqwt", # use frequency weights
#'   labs.on = TRUE, # use variable value labels
#'   sort.freq = FALSE, # sort by vars values (not frequencies)
#'   irreg.rm = FALSE, # NAs and the like are included/shown
#'   prop.digits = NULL
#' ) # return counts, not proportions
#'
#' # show div by - Millions
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   wt = "freqwt", # use frequency weights
#'   labs.on = TRUE, # use variable value labels
#'   sort.freq = FALSE, # sort by vars values (not frequencies)
#'   irreg.rm = FALSE, # NAs and the like are included/shown
#'   prop.digits = NULL, # return counts, not proportions
#'   div.by = "1M"
#' ) # one million
#'
#' # show div by - Tens of millions
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   wt = "freqwt", # use frequency weights
#'   labs.on = TRUE, # use variable value labels
#'   sort.freq = FALSE, # sort by vars values (not frequencies)
#'   irreg.rm = FALSE, # NAs and the like are included/shown
#'   prop.digits = NULL, # return counts, not proportions
#'   div.by = "10M"
#' ) # ten million
#'
#' # show div by - 10000
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   wt = "freqwt", # use frequency weights
#'   labs.on = TRUE, # use variable value labels
#'   sort.freq = FALSE, # sort by vars values (not frequencies)
#'   irreg.rm = FALSE, # NAs and the like are included/shown
#'   prop.digits = NULL, # return counts, not proportions
#'   div.by = 10000
#' ) # ten thousand; could've used div.by = "10K"
#'
#' # show div by - 10000, but different syntax
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   wt = "freqwt", # use frequency weights
#'   labs.on = TRUE, # use variable value labels
#'   sort.freq = FALSE, # sort by vars values (not frequencies)
#'   irreg.rm = FALSE, # NAs and the like are included/shown
#'   prop.digits = NULL, # return counts, not proportions
#'   div.by = "10K"
#' ) # ten thousand; could've used div.by = 10000
#'
#' df$freqwt <- NULL # we don't need this anymore
#'
#' # turn labels off, to make this more compact
#' # do not show zero values (zero.rm)
#' # do not show NA values (irreg.rm)
#' # many-valued numeric variables will be converted to quantile categories by
#' # ...qtiles argument
#' tabl(df,
#'   vars = c("am", "gear", "carb", "mpg"),
#'   qtiles = 4, # many-valued numerics converted to quantile
#'   labs.on = FALSE, # use values, not variable value labels
#'   sort.freq = FALSE, # sort by vars values (not frequencies)
#'   irreg.rm = TRUE, # NAs and the like are suppressed
#'   zero.rm = TRUE, # variable combinations that never occur are suppressed
#'   prop.digits = NULL, # return counts, not proportions
#'   max.unique.vals = 10
#' ) # drop from table any var with >10 distinct values
#'
#' # same as above, but include NA/irregular category values,
#' # zero.rm is TRUE; include unobserved (zero-count) category combinations
#' tabl(df,
#'   vars = c("am", "gear", "carb", "mpg"),
#'   qtiles = 4,
#'   labs.on = FALSE, # use values, not variable value labels
#'   sort.freq = TRUE, # sort by frequency
#'   irreg.rm = FALSE, # preserve/include NAs and irregular values
#'   zero.rm = FALSE, # include non-observed combinations
#'   prop.digits = NULL, # return counts, not proportions
#'   max.unique.vals = 10
#' ) # drop from table any var with >10 distinct values
#'
#' # show cross-tab view with wide.col arg
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   labs.on = TRUE, # use variable value labels
#'   sort.freq = TRUE, # sort by vars values (not frequencies)
#'   irreg.rm = TRUE, # NAs and the like are suppressed
#'   prop.digits = NULL, # return counts, not proportions
#'   wide.col = "am"
#' ) # use "am" as a column variable in a cross-tab view
#'
#' tabl(df,
#'   vars = c("cyl", "am"),
#'   labs.on = TRUE, # use variable value labels
#'   sort.freq = TRUE, # sort by vars values (not frequencies)
#'   irreg.rm = TRUE, # NAs and the like are suppressed
#'   prop.digits = NULL, # return counts, not proportions
#'   wide.col = "cyl"
#' ) # use "cyl" as a column variable in a cross-tab view
#'
#' # verify select counts using base::subset()
#' nrow(subset(df, am == 0 & cyl == 4))
#' nrow(subset(df, am == 0 & cyl == 8))
#' nrow(subset(df, am == 1 & cyl == 8))
#' nrow(subset(df, am == 0 & cyl == 6))
#' nrow(subset(df, am == 1 & cyl == 6))
#'
#' # will work on an un-labeled data.frame
#' tabl(mtcars, vars = c("am", "gear", "carb", "mpg"))
tabl <- function(data,
                 vars = NULL,
                 labs.on = TRUE,
                 qtiles = 4,
                 prop.digits = NULL,
                 wt = NULL,
                 div.by = NULL,
                 max.unique.vals = 10,
                 sort.freq = TRUE,
                 zero.rm = FALSE,
                 irreg.rm = FALSE,
                 wide.col = NULL) {
  ############################################################################
  # tabl_df - internal function to calculate compact tables and weighted tables
  ############################################################################
  tabl_df <- function(data, vars = NULL, group.id.name = "gid", wts = NULL) {
    # safe_char_val() - search for presence of character stub in x
    # ...and identify the first variant of stub that is not already in x
    # "variant of stub" means the leading stub characters with numbers affixed
    # ..afterward
    safe_char_val <- function(x, stub) {
      if (is.data.frame(x)) {
        x <- as.data.frame(x)
        x <- names(x)
      }
      if (!stub %in% x) {
        the_name <- stub
      } else {
        the_name <- NULL
        found_it <- FALSE
        count <- 0
        while (!found_it) {
          count <- count + 1
          the_name <- paste0(stub, "_", count)
          if (!the_name %in% x) found_it <- TRUE
        }
      }

      return(the_name)
    }
    # end safe_char_val

    group_id_nm <- safe_char_val(names(data), group.id.name)

    if (is.null(vars)) vars <- names(data)
    if (!is.null(wts)) vars <- base::setdiff(vars, wts)

    data_l <- data.frame(table(data[vars]))
    data_l <- data.frame(col1 = rownames(data_l), data_l)
    names(data_l)[1] <- group_id_nm
    names(data_l)[ncol(data_l)] <- "n"

    i <- sapply(data_l, is.factor)
    data_l[i] <- lapply(data_l[i], as.character)

    if (!is.null(wts)) {
      data <- merge(data_l, data, all = TRUE)
      data_z <- tapply(data[[wts]], data[["gid"]], sum)
      data_z <- data.frame(col1 = names(data_z), "n.wtd" = unname(data_z))
      data_z[is.na(data_z$n.wtd), "n.wtd"] <- 0
      names(data_z)[1] <- group_id_nm
      i <- sapply(data_z, is.character)
      data_z[i] <- lapply(data_z[i], as.numeric)
      data_l <- merge(data_l, data_z, by = group_id_nm)
    }

    data_l[[1]] <- NULL
    return(data_l)
  }
  ############################################################################
  # end tabl_df() - begin main tabl() function code
  ############################################################################

  # make this a Base R data.frame
  data <- as_base_data_frame(data)

  # get nrow
  if (nrow(data) > 300000) {
    warning("
Note: tabl() is not optimized for data.frames this large.\n")
  }

  # grab weights value if present
  if (!is.null(wt)) {
    if (!wt %in% names(data)) {
      stop("
wt arg must be a colname of supplied data.frame. No such column found.")
    }

    wts <- data[[wt]]

    # check weights variable (numeric)
    if (!is.numeric(wts)) {
      stop("
wt argument must be a numeric variable.")
    }

    # check weights variable (irregular)
    if (any(unname(check_irregular(wts)))) {
      stop("
wt variable may not include NA or other irregular (e.g., NaN) values.")
    }

    # check weights values (all >=0)
    if (any(wts < 0)) {
      stop("
All weights must be numeric values >=0.")
    }
  }

  # make sure only one wide.col supplied
  if (!is.null(wide.col)) {
    if (length(wide.col) != 1) {
      stop("
You may not specify more than one wide.col.")
    }
  }

  # safely drop out of scope columns
  if (!is.null(vars) && !is.null(wide.col)) {
    vars <- unique(c(vars, wide.col))
  } else if (!is.null(vars) && is.null(wide.col)) {
    vars <- vars
  } else {
    vars <- names(data)
  }

  # make sure all selected vars are found in data.frame
  if (any(!vars %in% names(data))) {
    stop("
At least one colname arg to vars or wide.col not found in supplied data.frame.")
  } else {
    data <- sbrac(data, , vars) # subset, preserving labels
  }

  # check max vals - 5000 unique value labels for a variable is a hard cap:
  # Under no circumstances can a variable with 5000 distinct values receive value
  # ...labels
  if (max.unique.vals > 5000) {
    stop("
    \n max.unique.vals may not exceed 5000.")
  }

  # turn on value labels, if specified
  if (labs.on) {
    # add quantile labels if specified
    if (!is.null(qtiles)) {
      data <- suppressWarnings(all_quant_labs(data,
        qtiles = qtiles,
        unique.vals.thresh = max.unique.vals
      ))
    }

    data <- suppressWarnings(use_val_labs(data))
  } else {
    data <- strip_labs(data)

    # add quantile labels if specified
    if (!is.null(qtiles)) {
      data <- suppressWarnings(all_quant_labs(data,
        qtiles = qtiles,
        unique.vals.thresh = max.unique.vals
      ))
    }

    data <- suppressWarnings(use_val_labs(data))
  }
  # drop vars with decimal points or too many unique values
  num_vars_to_drop <- sapply(
    data,
    function(x) {
      length(unique(x)) > max.unique.vals |
        has_decv(x)
    }
  )

  if (any(num_vars_to_drop)) {
    names_to_drop <- names(num_vars_to_drop)[unname(num_vars_to_drop)]
    for (i in seq_along(names_to_drop)) {
      this_name <- names_to_drop[i]
      warning(sprintf("
Excluding variable --%s-- (includes decimals or exceeds max.unique.vals).\n", this_name))
    }
    data <- data[!num_vars_to_drop]
    data <- as.data.frame(data)
    vars <- names(data)
  }

  # combinations
  combos <- prod(sapply(data, function(x) length(unique(x, na.rm = TRUE))))

  # zero.rm
  if (combos > 100000 && !zero.rm) {
    zero.rm <- TRUE
    warning("
Requested table would be >100000 rows. Excluding zero-frequency (unobserved) combinations")
  }

  # find a safe name to use (one not already in vars)
  if (!"vars" %in% vars) {
    the_name <- "vars"
  } else {
    the_name <- NULL
    found_it <- FALSE
    count <- 0
    while (!found_it) {
      count <- count + 1
      the_name <- paste0("vars", "_", count)
      if (!the_name %in% vars) found_it <- TRUE
    }
  }

  # convert factors to character
  data <- as.data.frame(data)
  i <- sapply(data, is.factor)
  data[i] <- lapply(data[i], as.character)

  # convert irregular values to "NA" function over all remaining variables
  data <- lapply(data, irregular2v, to = "NA")
  data <- do.call("cbind", data)
  data <- as.data.frame(data)

  # do weighted counts if wt arg is not NULL (see sapply() call below w/ sum())
  if (!is.null(wt)) {
    last_col_name <- "n.wtd"
    data <- cbind(data, wts) # restore weights
    data <- as.data.frame(data)
    names(data)[ncol(data)] <- wt
    data2 <- tabl_df(data, vars = vars, wts = wt)
    data2[["n"]] <- NULL

    # do unweighted counts if wt arg is NULL (see sapply() call below w/ nrow())
  } else {
    last_col_name <- "n"
    data2 <- tabl_df(data, vars = vars)
  }

  # remove irregular values, as requested
  if (irreg.rm) {
    irreg_rows <- unname(which(apply(data2, 1, function(x) any(x == "NA"))))

    if (length(irreg_rows) != 0) data2 <- data2[-c(irreg_rows), ]
  }
  # remove rows with zero counts, as requested
  if (zero.rm) {
    zero_rows <- unname(which(data2[[last_col_name]] == 0))

    if (length(zero_rows) != 0) data2 <- data2[-c(zero_rows), ]
  }

  # sort by frequency counts if that option is TRUE
  # else by var values
  # sort table results
  # sort by frequency counts, then by vars for var combinations that
  # share the same frequency count
  if (sort.freq) {
    sort_vars <- c(last_col_name, vars)
    desc_args <- c(TRUE, rep(FALSE, length(vars)))
    data2 <- suppressWarnings(
      suppressMessages(
        ssort(data2, sort_vars, desc_args)
      )
    )

    # else, just sort by vars
  } else {
    data2 <- suppressWarnings(
      suppressMessages(
        ssort(data2, vars)
      )
    )
  }

  # use percents (proportions) instead of counts if prop.digits is not NULL
  if (!is.null(prop.digits)) {
    name_x <- names(data2)[ncol(data2)]
    data2[[name_x]] <- data2[[name_x]] / sum(data2[[name_x]])
    data2[[name_x]] <- round(data2[[name_x]], digits = prop.digits)
  }

  # allow for dividing totals by some constant, if one is specified
  # and if prop.digits argument is NULL
  if (!is.null(div.by) && is.null(prop.digits)) {
    if (is.character(div.by)) {
      div.by <- sub("B", "000000000", toupper(div.by))
      div.by <- sub("M", "000000", toupper(div.by))
      div.by <- sub("K", "000", toupper(div.by))
      div.by <- as.integer(div.by)
    }

    data2[[ncol(data2)]] <- data2[[ncol(data2)]] / div.by
  }

  rownames(data2) <- 1:nrow(data2)

  # "cast" / "pivot wider" by wide.col, if we've specified one
  if (!is.null(wide.col)) {
    vals.var <- names(data2)[ncol(data2)]
    other.vars <- names(data2)[!names(data2) %in% c(wide.col, vals.var)]

    data2 <- as_base_data_frame(data2)

    data2 <- stats::reshape(data2,
      timevar = wide.col,
      idvar = other.vars,
      direction = "wide"
    )
    vals.var <- paste0(vals.var, "\\.")
    names(data2) <- gsub(vals.var, "", names(data2))
    data2 <- as.data.frame(data2)
    data2 <- data2[names(data2)]
    data2 <- as.data.frame(data2)

    # convert NA to 0 (counts) in new pivoted-wider cols
    orig_vars <- base::setdiff(vars, wide.col)
    new_vars <- base::setdiff(names(data2), orig_vars)
    for (i in new_vars) {
      data2[[i]] <- as_numv(data2[[i]])
      data2[is.na(data2[[i]]), i] <- 0
    }
  }

  # restore numeric status to any variables for which this makes sense
  data2 <- as_num(data2)

  # convert irregular values to NA values
  data2 <- lapply(data2, irregular2v, to = NA)
  data2 <- do.call("cbind", data2)
  data2 <- as.data.frame(data2)
  data2[[ncol(data2)]] <- as.numeric(data2[[ncol(data2)]])

  return(data2)
}