ivs: Interval Vectors

Documented in iv_identify_splits iv_locate_splits iv_splits

#' Splits
#'
#' @description
#' This family of functions revolves around splitting an iv on its endpoints,
#' which results in a new iv that is entirely disjoint (i.e. non-overlapping).
#' The intervals in the resulting iv are known as "splits".
#'
#' - `iv_splits()` computes the disjoint splits for `x`.
#'
#' - `iv_identify_splits()` identifies the splits that correspond to each
#' interval in `x`. It replaces `x` with a list of the same size where each
#' element of the list contains the splits that the corresponding interval in
#' `x` overlaps. This is particularly useful alongside [tidyr::unnest()].
#'
#' - `iv_locate_splits()` returns a two column data frame with a `key` column
#' containing the result of `iv_splits()` and a `loc` list-column containing
#' integer vectors that map each interval in `x` to the splits that it overlaps.
#'
#' @section Graphical Representation:
#'
#' Graphically, generating splits looks like:
#'
#' ![](splits.png)
#'
#' @inheritParams rlang::args_dots_empty
#'
#' @param x `[iv]`
#'
#'   An interval vector.
#'
#' @param on `[vector / NULL]`
#'
#'   An optional vector of additional values to split on.
#'
#'   This should have the same type as `iv_start(x)`.
#'
#' @return
#' - For `iv_splits()`, an iv with the same type as `x`.
#'
#' - For `iv_identify_splits()`, a list-of containing ivs with the same size as
#' `x`.
#'
#' - For `iv_locate_splits()`, a two column data frame with a `key` column
#' of the same type as `x` and `loc` list-column containing integer vectors.
#'
#' @name iv-splits
#'
#' @examples
#' library(tidyr)
#' library(dplyr)
#'
#' # Guests to a party and their arrival/departure times
#' guests <- tibble(
#'   arrive = as.POSIXct(
#'     c("2008-05-20 19:30:00", "2008-05-20 20:10:00", "2008-05-20 22:15:00"),
#'     tz = "UTC"
#'   ),
#'   depart = as.POSIXct(
#'     c("2008-05-20 23:00:00", "2008-05-21 00:00:00", "2008-05-21 00:30:00"),
#'     tz = "UTC"
#'   ),
#'   name = list(
#'     c("Mary", "Harry"),
#'     c("Diana", "Susan"),
#'     "Peter"
#'   )
#' )
#'
#' guests <- unnest(guests, name) %>%
#'   mutate(iv = iv(arrive, depart), .keep = "unused")
#'
#' guests
#'
#' # You can determine the disjoint intervals at which people
#' # arrived/departed with `iv_splits()`
#' iv_splits(guests$iv)
#'
#' # Say you'd like to determine who was at the party at any given time
#' # throughout the night
#' guests <- mutate(guests, splits = iv_identify_splits(iv))
#' guests
#'
#' # Unnest the splits to generate disjoint intervals for each guest
#' guests <- guests %>%
#'   unnest(splits) %>%
#'   select(name, splits)
#'
#' guests
#'
#' # Tabulate who was there at any given time
#' guests %>%
#'   summarise(n = n(), who = list(name), .by = splits)
#'
#' # ---------------------------------------------------------------------------
#'
#' x <- iv_pairs(c(1, 5), c(4, 9), c(12, 15))
#' x
#'
#' # You can provide additional singular values to split on with `on`
#' iv_splits(x, on = c(2, 13))
NULL

#' @rdname iv-splits
#' @export
iv_splits <- function(x, ..., on = NULL) {
  check_dots_empty0(...)

  proxy <- iv_proxy(x)
  check_iv(proxy, arg = "x")

  start <- field_start(proxy)
  end <- field_end(proxy)

  args <- iv_split_candidates(start, end, on = on)
  candidate_start <- args$start
  candidate_end <- args$end

  needles <- data_frame(start = candidate_start, end = candidate_end)
  haystack <- data_frame(start = end, end = start)

  # Find actual overlaps among all possible candidates
  loc <- vec_locate_matches(
    needles,
    haystack,
    condition = c("<", ">"),
    no_match = "drop",
    multiple = "any",
    incomplete = "match",
    error_call = current_env()
  )

  out <- vec_slice(needles, loc$needles)
  out <- new_bare_iv_from_fields(out)
  out <- iv_restore(out, x)

  out
}

#' @rdname iv-splits
#' @export
iv_identify_splits <- function(x, ..., on = NULL) {
  check_dots_empty0(...)

  proxy <- iv_proxy(x)
  check_iv(proxy, arg = "x")

  start <- field_start(proxy)
  end <- field_end(proxy)

  args <- iv_split_candidates(start, end, on = on)
  candidate_start <- args$start
  candidate_end <- args$end

  needles <- data_frame(start = start, end = end)
  haystack <- data_frame(start = candidate_end, end = candidate_start)

  loc <- vec_locate_matches(
    needles,
    haystack,
    condition = c("<", ">"),
    no_match = "error",
    incomplete = NA_integer_,
    error_call = current_env()
  )

  sizes <- vec_run_sizes(loc$needles)
  loc <- vec_chop(loc$haystack, sizes = sizes)

  ptype <- vec_ptype(x)
  ptype <- vec_ptype_finalise(ptype)

  candidates <- new_bare_iv(candidate_start, candidate_end)
  candidates <- iv_restore(candidates, x)

  out <- vec_chop(candidates, indices = loc)
  out <- new_list_of(out, ptype = ptype)

  out
}

#' @rdname iv-splits
#' @export
iv_locate_splits <- function(x, ..., on = NULL) {
  check_dots_empty0(...)

  proxy <- iv_proxy(x)
  check_iv(proxy, arg = "x")

  start <- field_start(proxy)
  end <- field_end(proxy)

  args <- iv_split_candidates(start, end, on = on)
  candidate_start <- args$start
  candidate_end <- args$end

  needles <- data_frame(start = candidate_start, end = candidate_end)
  haystack <- data_frame(start = end, end = start)

  # Find actual overlaps among all possible candidates
  loc <- vec_locate_matches(
    needles,
    haystack,
    condition = c("<", ">"),
    no_match = "drop",
    incomplete = "match",
    error_call = current_env()
  )

  sizes <- vec_run_sizes(loc$needles)

  starts <- vec_run_sizes_to_starts(sizes)
  starts <- vec_slice(loc$needles, starts)

  loc <- vec_chop(loc$haystack, sizes = sizes)

  key <- vec_slice(needles, starts)
  key <- new_bare_iv_from_fields(key)
  key <- iv_restore(key, x)

  out <- data_frame(key = key, loc = loc)

  out
}

iv_split_candidates <- function(start,
                                end,
                                ...,
                                on = NULL,
                                error_call = caller_env()) {
  check_dots_empty0(...)

  on <- vec_cast(on, start, x_arg = "on", to_arg = "iv_start(x)", call = error_call)

  # Candidates are built from all sorted unique values
  points <- vec_sort(vec_unique(vec_c(start, end, on)))
  size_points <- vec_size(points)

  # If a missing interval is present, it is at the very end.
  # We remove it before proceeding, because this "candidates" computation
  # revolves around computing `start` and `end` locations, where `start < end`
  # is a requirement. In the case of a single missing interval, we end up with
  # 1 unique point (an `NA`), but we still want to keep `[NA, NA)` as an
  # interval candidate so we remove it now and add it back at the end.
  last <- vec_slice(points, size_points)
  any_missing <- any(vec_detect_missing(last))
  if (any_missing) {
    points <- vec_slice(points, -size_points)
    size_points <- size_points - 1L
  }

  slice_start <- 1L
  slice_end <- size_points

  loc_start <- seq2(slice_start, slice_end - 1L)
  loc_end <- seq2(slice_start + 1L, slice_end)

  candidate_start <- vec_slice(points, loc_start)
  candidate_end <- vec_slice(points, loc_end)

  if (any_missing) {
    candidate_start <- vec_c(candidate_start, vec_init(candidate_start))
    candidate_end <- vec_c(candidate_end, vec_init(candidate_end))
  }

  list(start = candidate_start, end = candidate_end)
}