R/enhead.R

Defines functions check_distinct check_header corner_pos match_function side_join `right-ish` `down-ish` `left-ish` `up-ish` corner_join `down-left` `down-right` `up-right` `up-left` right up enhead.data.frame enhead

Documented in enhead

#' Join data cells to headers
#'
#' @description
#' Data cells in a table are associated with header cells by proximity.
#' [enhead()] joins a data frame of data cells to a data frame of header cells,
#' choosing the nearest header cells in the given direction.  See `?direction`.
#'
#' @param data_cells Data frame of data cells with at least the columns 'row'
#'   and 'column', which are `numeric` or `integer`.
#' @param header_cells Data frame of header cells with at least the columns
#'   'row' and 'column', which are numeric/integer vectors.
#' @param direction The direction between a data cell and its header, one of
#' `"up"`, `"right"`, `"down"`, `"left"`, `"up-left"`, `"up-right"`,
#' `"right-up"`, `"right-down"`, `"down-right"`, `"down-left"`, `"left-down"`,
#' `"left-up"`, `"up-ish"`, `"down-ish"`, `"left-ish"` and `"right-ish"`. See
#' `?direction`.
#' @param drop Logical vector length 1. Whether data cells that can't be
#'   associated with a header should be dropped.  Default: `TRUE`.
#'
#' @name enhead
#' @export
#' @examples
#' library(dplyr)
#' # Load some pivoted data
#' (x <- purpose$`up-left left-up`)
#' # Make a tidy representation
#' cells <- as_cells(x)
#' cells <- cells[!is.na(cells$chr), ]
#' head(cells)
#' # Select the cells containing the values
#' data_cells <-
#'   filter(cells, row >= 3, col >= 3) %>%
#'   transmute(row, col, count = as.integer(chr))
#' head(data_cells)
#' # Select the headers
#' qualification <-
#'   filter(cells, col == 1) %>%
#'   select(row, col, qualification = chr)
#' age <-
#'   filter(cells, col == 2) %>%
#'   select(row, col, age = chr)
#' gender <-
#'   filter(cells, row == 1) %>%
#'   select(row, col, gender = chr)
#' satisfaction <-
#'   filter(cells, row == 2) %>%
#'   select(row, col, satisfaction = chr)
#' # From each data cell, search for the nearest one of each of the headers
#' data_cells %>%
#'   enhead(gender, "up-left") %>%
#'   enhead(satisfaction, "up") %>%
#'   enhead(qualification, "left-up") %>%
#'   enhead(age, "left") %>%
#'   select(-row, -col)
#'
#' # The `drop` argument controls what happens when for some cells there is no
#' # header in the given direction. When `drop = TRUE` (the default), cells that
#' # can't be joined to a header are dropped.  Otherwise they are kept.
#' enhead(data_cells, gender, "up")
#' enhead(data_cells, gender, "up", drop = FALSE)
enhead <- function(data_cells, header_cells, direction, drop = TRUE) {
  UseMethod("enhead")
}

#' @export
enhead.data.frame <- function(data_cells, header_cells, direction,
                              drop = TRUE) {
  check_header(header_cells)
  direction <- standardise_direction(direction)
  check_distinct(data_cells)
  check_distinct(header_cells)
  if (direction %in% c("up-ish", "right-ish", "down-ish", "left-ish")) {
    do.call(direction, list(data_cells, header_cells))
  } else if (direction %in% c(
    "up", "right", "down", "left",
    "up-left", "up-right",
    "right-up", "right-down",
    "down-right", "down-left",
    "left-down", "left-up"
  )) {
    do.call(direction, list(data_cells, header_cells, drop))
  }
}

up <- function(data_cells, header_cells, drop = TRUE) {
  check_header(header_cells)
  join <- ifelse(drop, dplyr::inner_join, dplyr::left_join)
  out <- join(data_cells, dplyr::select(header_cells, -row),
    by = "col",
    suffix = c(".data", ".header")
  )
  tibble::as_tibble(out)
}

right <- function(data_cells, header_cells, drop = TRUE) {
  check_header(header_cells)
  join <- ifelse(drop, dplyr::inner_join, dplyr::left_join)
  out <- join(data_cells, dplyr::select(header_cells, -col),
    by = "row",
    suffix = c(".data", ".header")
  )
  tibble::as_tibble(out)
}

down <- up
left <- right

`up-left` <- function(data_cells, header_cells, drop = TRUE) {
  corner_join(data_cells, header_cells, "top_left", drop)
}

`up-right` <- function(data_cells, header_cells, drop = TRUE) {
  corner_join(data_cells, header_cells, "top_right", drop)
}

`down-right` <- function(data_cells, header_cells, drop = TRUE) {
  corner_join(data_cells, header_cells, "bottom_right", drop)
}

`down-left` <- function(data_cells, header_cells, drop = TRUE) {
  corner_join(data_cells, header_cells, "bottom_left", drop)
}

`left-up` <- `up-left`
`right-up` <- `up-right`
`right-down` <- `down-right`
`left-down` <- `down-left`

corner_join <- function(data_cells, header_cells, corner, drop = TRUE) {
  check_header(header_cells)
  headers <-
    header_cells %>%
    partition(dplyr::distinct(header_cells, row, col),
      corner,
      nest = FALSE
    ) %>%
    dplyr::select(-row, -col)
  datas <- partition(data_cells,
    dplyr::distinct(header_cells, row, col),
    corner,
    nest = FALSE,
    strict = FALSE
  )
  out <-
    dplyr::inner_join(datas, headers,
      by = c("corner_row", "corner_col"),
      suffix = c("", ".y")
    ) %>%
    dplyr::select(-corner_row, -corner_col)
  if (!drop) {
    remainder <- dplyr::anti_join(data_cells, out, by = c("row", "col"))
    out <- dplyr::bind_rows(out, remainder)
  }
  out
}

`up-ish` <- function(data_cells, header_cells, drop = TRUE) {
  side_join(data_cells, header_cells, "up-left", drop)
}

`left-ish` <- function(data_cells, header_cells, drop = TRUE) {
  side_join(data_cells, header_cells, "left-up", drop)
}

`down-ish` <- function(data_cells, header_cells, drop = TRUE) {
  side_join(data_cells, header_cells, "down-left", drop)
}

`right-ish` <- function(data_cells, header_cells, drop = TRUE) {
  side_join(data_cells, header_cells, "right-up", drop)
}

side_join <- function(data_cells, header_cells, corner, drop = TRUE) {
  check_header(header_cells)
  if (corner %in% c("up-left", "up-right", "down-left", "down-right")) {
    pos <- rlang::sym("col")
  } else {
    pos <- rlang::sym("row")
  }
  # The domain of each header is up to (but not including) half-way between it
  # and the previous header
  header_cells <- dplyr::mutate(
    header_cells,
    !!pos := corner_pos(!!pos, corner)
  )
  match_function(corner)(data_cells, header_cells, drop = drop)
}

match_function <- function(x, env = parent.frame()) {
  rlang::as_function(x, env = env)
}

corner_pos <- function(cells, corner) {
  corner_names <-
    c("up-left", "up-right", "right-up", "right-down",
      "down-right", "down-left", "left-down", "left-up")
  corner_poss <- rep(c("col", "col", "row", "row"), 2L)
  corner_looks <- c(
    rep(list(dplyr::lag, dplyr::lead), 2L),
    rep(list(dplyr::lead, dplyr::lag), 2L)
  )
  corner_defaults <- c(1L, 16384L, 1L, 1048576L, 16384L, 1L, 1048576L, 1L)
  corner_coefs <- c(2L, -2L, 2L, -2L, -2L, 2L, -2L, 2L)
  corner_extremes <- c(
    rep(c(floor, ceiling), 2L),
    rep(c(ceiling, floor), 2L)
  )
  corner_i <- match(corner, corner_names)
  pos <- rlang::sym(corner_poss[corner_i])
  look <- corner_looks[[corner_i]]
  default <- corner_defaults[corner_i]
  extreme <- corner_extremes[[corner_i]]
  coef <- corner_coefs[corner_i]
  out <- extreme((cells + look(cells) + coef) / 2)
  out[is.na(out)] <- default
  out
}

check_header <- function(header_cells) {
  if (length(unique(header_cells$row)) > 1 & length(unique(header_cells$col)) > 1) {
    stop(
      "Multiple lines of headers are not supported in this way.",
      "\n  Perhaps you meant to concatenate them together first,",
      "\n  Or look at ?partition"
    )
  }
}

check_distinct <- function(cells) {
  if (dplyr::n_distinct(dplyr::select(cells, row, col)) != nrow(cells)) {
    stop("Row and column numbers must be distinct.",
      "\n  Perhaps you meant to use a single sheet.",
      call. = FALSE
    )
  }
}

Try the unpivotr package in your browser

Any scripts or data that you put into this service are public.

unpivotr documentation built on Jan. 23, 2023, 5:40 p.m.