R/seq-predicates.R
In scrutiny: Error Detection in Science

Documented in is_seq_ascending is_seq_descending is_seq_dispersed is_seq_linear

# Helpers for `function_map_seq()` as well as its assorted `reverse_*()` and
# `summarize_*()` functions:

# `index_seq()` takes a vector that's (1) numeric or string coercible to
# numeric, and (2) that is either a continuous sequence of numbers (step size:
# 1) or that would be such a sequence if not for exactly one single missing case
# -- not in the sense of `NA`, but a sequence of two numbers where the second is
# the first plus 2, so that one would expect an intermediate number in the
# middle. This number can be identified as the "index case" in an original
# sequence that dropped it at some point.

# The function returns a sequence of `1` values for a continuous sequence, and
# such a sequence with a single `2` value strewn in for a sequence with a single
# missing case. The `2`, if present, has the same index as the last value before
# the index case in `x`.
index_seq <- function(x) {
  if (!is.numeric(x)) {
    x <- as.numeric(x)
  }
  x_seq <- seq_along(x)
  x_seq <- x[x_seq] - x[x_seq + 1L]
  abs(x_seq[!is.na(x_seq)])
}


is_seq_linear_basic <- function(x) {
  if (length(x) < 3L) {
    return(TRUE)
  }
  # As the difference between each successive pair of values must be equal for
  # `x` to be a linear sequence, we can take the first pairwise difference and
  # test each other difference for equality with it. If any comparison turns out
  # unequal, `x` is not a linear sequence.
  diff_first <- x[2L] - x[1L]
  for (i in 3L:length(x)) {
    if (x[i] - x[i - 1L] != diff_first) {
      return(FALSE)
    }
  }
  TRUE
}

is_seq_ascending_basic <- function(x) {
  for (i in 1L:(length(x) - 1L)) {
    if (x[i + 1L] <= x[i]) {
      return(FALSE)
    }
  }
  TRUE
}


is_seq_descending_basic <- function(x) {
  for (i in 1L:(length(x) - 1L)) {
    if (x[i + 1L] >= x[i]) {
      return(FALSE)
    }
  }
  TRUE
}


# # Test any of the sequence functions interactively:
# x <- c(1, 2, NA, 4)
# tolerance <- .Machine$double.eps^0.5
# test_linear <- TRUE
# test_special <- NULL
# min_length <- NULL
# args_other <- NULL


# Non-exported workhorse API of all the sequence predicates:
is_seq_basic <- function(x, tolerance = .Machine$double.eps^0.5,
                         test_linear = TRUE, test_special = NULL,
                         min_length = NULL, args_other = NULL) {

  if (!is.null(test_special) && test_special == "dispersed") {
    # Without the `force()` call, the function may return `FALSE` early, even if
    # `from` was not supplied:
    force(args_other$from)

    # A dispersed sequence requires one central value, so the number of elements
    # in `x` must be odd:
    if (is_even(length(x))) {
      return(FALSE)
    }
  }

  if (!is_numeric_like(x)) {
    return(FALSE)
  }

  if (!is.null(min_length) && length(x) < min_length) {
    return(FALSE)
  }

  if (length(x) == 1L) {
    return(TRUE)
  }

  x_has_na <- anyNA(x)

  if (x_has_na) {

    # Save the unmodified `x` for a test that is conducted if `x` contains one
    # or more `NA` elements:
    x_orig <- x
    n_x_orig <- length(x)

    not_na <- which(!is.na(x))

    # Need at least three known values:
    if (length(not_na) < 3L) {
      return(NA)
    }

    # # Indices of `NA`s at the start and end of `x`:
    # n_na_start <- seq_len(not_na[1L] - 1L)
    # n_na_end   <- (not_na[length(not_na)] + 1L):n_x_orig

    n_na_start <- match(FALSE,     is.na(x_orig))  - 1L
    n_na_end   <- match(FALSE, rev(is.na(x_orig))) - 1L

    # Remove all `NA` values from the start and the end of `x` because `NA`s at
    # these particular locations cannot disprove that `x` is the kind of
    # sequence of interest. (They do mean that it cannot be proven, so the
    # function will return either `NA` or `FALSE`, depending on other factors.)
    x <- x[not_na[1L]:not_na[length(not_na)]]

    if (test_linear && !anyNA(x) && !is_seq_linear_basic(x)) {
      return(FALSE)
    }

    # If the removal of leading and / or trailing `NA` elements in `x` caused
    # the central value to shift, or if only one side from among left and right
    # had any `NA` values, the original `x` might not have been symmetrically
    # grouped around that value, and hence not a dispersed sequence. It depends
    # on whether it could potentially be dispersed, assuming values consistent
    # with this idea that are imputed for the `NA`s here.
    if (
      !is.null(test_special) &&
      test_special == "dispersed" &&
      n_na_start != n_na_end
    ) {
      unit <- x[2] - x[1]

      # Impute sequences of regularly spaces values that the `NA`s might stand
      # for, so that the input could potentially be dispersed around
      # `args_other$from`...
      seq_na_start <- seq(x[1],         length.out = n_na_start, by = -unit) - 1
      seq_na_end   <- seq(x[length(x)], length.out = n_na_end,   by =  unit) + 1

      seq_imputed <- c(rev(seq_na_start), x, seq_na_end)

      # ...assuming the overall (partly imputed) sequence is still dispersed
      # around that value:
      if (is_seq_dispersed(seq_imputed, from = args_other$from)) {
        return(NA)
      }
      return(FALSE)
    }

    # Used within the for loop below to check whether the step size must be
    # negative:
    x_is_descending_basic <- is_seq_descending_basic(x[!is.na(x)])

    for (i in seq_along(x)) {
      if (is.na(x[i])) {
        index_lower <- 1L
        index_upper <- 1L
        while (is.na(x[i - index_lower])) {
          index_lower <- index_lower - 1L
        }
        while (is.na(x[i + index_upper])) {
          index_upper <- index_upper + 1L
        }
        seq_start <- x[i - index_lower]
        seq_end   <- x[i + index_upper]
        step <- step_size(c(seq_start, seq_end))

        # Descending sequences require a negative step size:
        if (x_is_descending_basic || seq_start > seq_end) {
          step <- -step
        }

        # Look here for `seq.default()` errors:
        seq_replacement <- seq(from = seq_start, to = seq_end, by = step)

        # Remove the first and the last element because these correspond to the
        # two next surrounding non-`NA` numbers rather than to the `NA`
        # subsequence, and therefore should not replace any `NA`s:
        seq_replacement <- seq_replacement[-1L]
        seq_replacement <- seq_replacement[-length(seq_replacement)]

        if (test_linear) {
          # In the first of these two cases, the replacement sequence is too
          # short to bridge the `NA` subsequence. In the second case, the
          # replacement sequence is longer than the subsequence of `NA`
          # elements, which invariably means that the numbers surrounding the
          # `NA`s are too far spaced out for there to be a linear sequence. In
          # either case...
          seq_replacement_has_wrong_length <-
            length(seq_replacement) == 0L ||
            length(seq_replacement) > length(index_lower:index_upper)

          # ...an error is thrown:
          if (seq_replacement_has_wrong_length) {
            return(FALSE)
          }
        }

        # Substitute the replacement sequence for `NA` elements. Warnings are
        # suppressed because the lengths will only differ in an unproblematic
        # case -- `x` is non-linear and `test_linear` is `FALSE`, i.e., the user
        # only wants one of the special tests but not the test for linearity:
        suppressWarnings(
          x[i + ((index_lower:index_upper) - 1L)] <- seq_replacement
        )

      } # End of the `is.na(x[i])` condition
    }   # End of the for loop
  }     # End of the `x_has_na` condition

  # If desired, test `x` -- as passed to the function or as partly reconstructed
  # in the for loop above -- for linearity:
  if (test_linear) {
    x_seq <- index_seq(x)
    pass_test_linear <- all(dplyr::near(x_seq, min(x_seq), tol = tolerance))
    if (!pass_test_linear) {
      return(FALSE)
    }
  }

  # Interface for the special variant functions:
  if (!is.null(test_special)) {
    pass_test_special <- switch(
      test_special,
      "ascending"  = is_seq_ascending_basic(x),
      "descending" = is_seq_descending_basic(x),
      "dispersed"  = is_seq_dispersed_basic(x, args_other$from, tolerance)
      # TODO: MAKE THIS "DISPERSED" TEST ABLE TO ASSUME THAT `NA`S IN `x_orig`
      # ARE ACTUALLY DISPERSED VALUES! MAYBE USE `index_central()`'S INTERNAL
      # LOGIC AND BUILD UP ON IT.
    )
    if (!pass_test_special) {
      return(FALSE)
    }
  }

  if (x_has_na) {
    NA
  } else {
    TRUE
  }

}



#' Is a vector a certain kind of sequence?
#'
#' @description Predicate functions that test whether `x` is a numeric vector
#'   (or coercible to numeric) with some special properties:

#'   - `is_seq_linear()` tests whether every two consecutive elements of `x`
#'   differ by some constant amount.

#'   - `is_seq_ascending()` and `is_seq_descending()` test whether the
#'   difference between every two consecutive values is positive or negative,
#'   respectively. `is_seq_dispersed()` tests whether `x` values are grouped
#'   around a specific central value, `from`, with the same distance to both
#'   sides per value pair. By default (`test_linear = TRUE`), these functions
#'   also test for linearity, like `is_seq_linear()`.
#'
#' `NA` elements of `x` are handled in a nuanced way. See *Value* section below
#' and the examples in `vignette("devtools")`, section *NA handling*.

#' @param x Numeric or coercible to numeric, as determined by
#'   `is_numeric_like()`. Vector to be tested.
#' @param from Numeric or coercible to numeric. Only in `is_seq_dispersed()`. It
#'   will test whether `from` is at the center of `x`, and if every pair of
#'   other values is equidistant to it.
#' @param test_linear Logical. In functions other than `is_seq_linear()`, should
#'   `x` also be tested for linearity? Default is `TRUE`.
#' @param tolerance Numeric. Tolerance of comparison between numbers when
#'   testing. Default is circa 0.000000015 (1.490116e-08), as in
#'   `dplyr::near()`.

#' @return A single logical value. If `x` contains at least one `NA` element,
#'   the functions return either `NA` or `FALSE`:
#'   - If all elements of `x` are `NA`, the functions return `NA`.
#'   - If some but not all elements are `NA`, they check if `x` *might* be a
#'   sequence of the kind in question: Is it a linear (and / or ascending, etc.)
#'   sequence after the `NA`s were replaced by appropriate values? If so, they
#'   return `NA`; otherwise, they return `FALSE`.

#' @seealso `validate::is_linear_sequence()`, which is much like
#'   `is_seq_linear()` but more permissive with `NA` values. It comes with some
#'   additional features, such as support for date-times.

#' @export
#'
#' @name seq-predicates

#' @examples
#' # These are linear sequences...
#' is_seq_linear(x = 3:7)
#' is_seq_linear(x = c(3:7, 8))
#'
#' # ...but these aren't:
#' is_seq_linear(x = c(3:7, 9))
#' is_seq_linear(x = c(10, 3:7))
#'
#' # All other `is_seq_*()` functions
#' # also test for linearity by default:
#' is_seq_ascending(x = c(2, 7, 9))
#' is_seq_ascending(x = c(2, 7, 9), test_linear = FALSE)
#'
#' is_seq_descending(x = c(9, 7, 2))
#' is_seq_descending(x = c(9, 7, 2), test_linear = FALSE)
#'
#' is_seq_dispersed(x = c(2, 3, 5, 7, 8), from = 5)
#' is_seq_dispersed(x = c(2, 3, 5, 7, 8), from = 5, test_linear = FALSE)
#'
#' # These fail their respective
#' # individual test even
#' # without linearity testing:
#' is_seq_ascending(x = c(1, 7, 4), test_linear = FALSE)
#' is_seq_descending(x = c(9, 15, 3), test_linear = FALSE)
#' is_seq_dispersed(1:10, from = 5, test_linear = FALSE)



#' @rdname seq-predicates
#' @export

is_seq_linear <- function(x, tolerance = .Machine$double.eps^0.5) {
  is_seq_basic(x, tolerance, test_linear = TRUE)
}



#' @rdname seq-predicates
#' @export

is_seq_ascending <- function(x, test_linear = TRUE,
                             tolerance = .Machine$double.eps^0.5) {
  is_seq_basic(
    x, tolerance, test_linear, test_special = "ascending", min_length = 2L
  )
}



#' @rdname seq-predicates
#' @export

is_seq_descending <- function(x, test_linear = TRUE,
                              tolerance = .Machine$double.eps^0.5) {
  is_seq_basic(
    x, tolerance, test_linear, test_special = "descending", min_length = 2L
  )
}



#' @rdname seq-predicates
#' @export

is_seq_dispersed <- function(x, from, test_linear = TRUE,
                             tolerance = .Machine$double.eps^0.5) {
  is_seq_basic(
    x, tolerance, test_linear, test_special = "dispersed", min_length = 3L,
    args_other = list(from = from)
  )
}



# Helper, not exported:
is_seq_dispersed_basic <- function(x, from,
                                   tolerance = .Machine$double.eps^0.5) {

  if (is_even(length(x))) {
    return(FALSE)
  }

  if (!is.numeric(x)) {
    if (is_numeric_like(x)) {
      x <- as.numeric(x)
    } else {
      return(FALSE)
    }
  }

  if (!is.numeric(from)) {
    if (is_numeric_like(from)) {
      x <- as.numeric(from)
    } else {
      return(FALSE)
    }
  }

  index_central_x <- index_central(x)

  if (!dplyr::near(x[index_central_x], from, tolerance)) {
    return(FALSE)
  }

  dispersion_minus <- from - x[1L:(index_central_x - 1L)]
  dispersion_plus  <- from + x[(index_central_x + 1L):length(x)]

  from_reconstructed <- (dispersion_plus - rev(dispersion_minus)) / 2

  all(dplyr::near(from, from_reconstructed, tolerance))
}