tjmisc: TJ's Miscellany

Documented in compare_pairs sample_n_of seq_along_rows

# Dataframe related helpers

#' Create a sequence along the rows of a dataframe
#' @param data a dataframe
#' @return a sequence of integers along the rows of a dataframe
#' @export
seq_along_rows <- function(data) {
  seq_len(nrow(data))
}


#' Randomly sample data from n sub-groups of data
#'
#' @param data a dataframe
#' @param size number of groups to sample
#' @param ... variables to group by
#' @return the data from subgroups
#' @export
#' @examples
#' sample_data <- tibble::tibble(
#'   letter = rep(letters, 5),
#'   color = rep(c("red", "green", "yellow", "orange", "blue"), 26),
#'   value = rnorm(26 * 5)
#' )
#'
#' # data from two letters
#' sample_data %>%
#'   sample_n_of(2, letter)
#'
#' # data from two colors
#' sample_data %>%
#'   sample_n_of(2, color)
#'
#' # data from 10 letter-colors pairs
#' sample_data %>%
#'   sample_n_of(10, letter, color)
sample_n_of <- function(data, size, ...) {
  dots <- quos(...)
  rows <- tibble::tibble(row = seq_len(nrow(data)))

  # Default to sampling rows if no grouping variables set
  if (length(dots) == 0) {
    dots <- list(.rowid = rows$row)
  }

  rows[, "group"] <- data %>%
    group_by(!!! dots) %>%
    group_indices()

  n_groups <- max(rows$group)

  if (n_groups < size) {
    w <- glue::glue(
      "Sample size ({size}) is larger than number of groups ({n_groups}). ",
      "Using size = {n_groups}.")
    rlang::warn(w)
    size <- n_groups
  }

  subset <- rows %>%
    filter(.data$group %in% sample(unique(.data$group), size)) %>%
    pull(.data$row)

  data[subset, ]
}


#' Compare pairs of categorical variables
#' @param data a dataframe
#' @param levels a column with a categorical variable. All pairs of values in
#'   `levels` will be compared.
#' @param values a column with values to compare.
#' @param f comparison function to apply to values in each pair. Defaults to `-`
#'   to compute the pairwise differences.
#' @return a dataframe with pairwise comparisons
#' @export
#' @examples
#' to_compare <- nlme::Machines %>%
#'   dplyr::group_by(Worker) %>%
#'   dplyr::summarise(avg_score = mean(score)) %>%
#'   print()
#'
#' to_compare %>%
#'   compare_pairs(Worker, avg_score) %>%
#'   dplyr::rename(difference = value) %>%
#'   dplyr::mutate_if(is.numeric, round, 1)
compare_pairs <- function(data, levels, values, f = `-`) {
  levels <- enquo(levels)
  values <- enquo(values)

  pairs <- data %>%
    pull(!! levels) %>%
    create_pairs()

  wide <- data %>%
    tidyr::spread(key = !! levels, value = !! values)

  for (row_i in seq_along_rows(pairs)) {
    pair_i <- pairs[row_i, ]
    wide[, pair_i$name] <- f(wide[[pair_i$x1]], wide[[pair_i$x2]])
  }

  wide %>%
    select(-one_of(c(pairs$x1), c(pairs$x2))) %>%
    tidyr::gather("pair", "value", one_of(c(pairs$name))) %>%
    mutate(pair = factor(.data$pair, levels = pairs$name))
}


create_pairs <- function(xs) {
  if (!is.factor(xs)) xs <- ordered(xs)
  xs %>%
    levels() %>%
    rev() %>%
    utils::combn(2) %>%
    t() %>%
    as.data.frame() %>%
    rlang::set_names("x1", "x2") %>%
    mutate(name = paste0(.data$x1, "-", .data$x2)) %>%
    mutate_all(as.character) %>%
    arrange(.data$x1, desc(.data$x2))
}