
Defines functions: find_different_from_previous_vec_, check_differs_from_previous_always, check_differs_from_previous_once, run_differs_from_previous_, differs_from_previous

Documented in differs_from_previous

## differs_from_previous
#' @title Find values in a vector that differ from the previous value
#' @description
#'  \Sexpr[results=rd, stage=render]{lifecycle::badge("maturing")}
#'  Finds values, or indices of values, that differ from the previous value by some threshold(s).
#'  Operates with both a positive and a negative threshold.
#'  Depending on \code{`direction`}, it checks if the difference to the previous value is:
#'  \itemize{
#'    \item greater than or equal to the positive threshold.
#'    \item less than or equal to the negative threshold.
#'  }
#' @author Ludvig Renbo Olsen, \email{r-pkgs@@ludvigolsen.dk}
#' @export
#' @param data \code{data.frame} or \code{vector}.
#'  \strong{N.B.} If checking a \code{factor}, it is converted to a \code{character vector}.
#'  This means that factors can only be used when \code{`threshold`} is \code{NULL}.
#'  Conversion will generate a warning, which can be turned off by setting \code{`factor_conversion_warning`} to \code{FALSE}.
#'  \strong{N.B.} If \code{`data`} is a \emph{grouped} \code{data.frame},
#'  the function is applied group-wise and the output is a \code{list} of \code{vector}s.
#'  The names are based on the group indices
#'  (see \code{\link[dplyr:group_data]{dplyr::group_indices()}}).
#' @param threshold Threshold to check difference to previous value to.
#'  \code{NULL}, \emph{numeric scalar} or \emph{numeric vector with length \code{2}}.
#'  \subsection{NULL}{
#'  Checks if the value is different from the previous value.
#'  Ignores \code{`direction`}.
#'  N.B. Works for both numeric and character vectors.
#'  }
#'  \subsection{Numeric scalar}{
#'  Positive number.
#'  Negative threshold is the negated number.
#'  N.B. Only works for numeric vectors.
#'  }
#'  \subsection{Numeric vector with length 2}{
#'  Given as \code{c(negative threshold, positive threshold)}.
#'  Negative threshold must be a negative number and positive threshold must be a positive number.
#'  N.B. Only works for numeric vectors.
#'  }
#' @param direction
#'  \code{both}, \code{positive} or \code{negative}. (character)
#'  \subsection{both}{
#'  Checks whether the difference to the previous value is
#'    \itemize{
#'      \item greater than or equal to the positive threshold.
#'      \item less than or equal to the negative threshold.
#'    }
#'  }
#'  \subsection{positive}{
#'  Checks whether the difference to the previous value is
#'    \itemize{
#'      \item greater than or equal to the positive threshold.
#'    }
#'  }
#'  \subsection{negative}{
#'  Checks whether the difference to the previous value is
#'    \itemize{
#'      \item less than or equal to the negative threshold.
#'    }
#'  }
#' @param return_index Return indices of values that differ. (Logical)
#' @param col Name of column to find values that differ in. Used when \code{`data`} is
#'  \code{data.frame}. (Character)
#' @param include_first Whether to include the first element of the vector in the output. (Logical)
#' @param handle_na How to handle \code{NA}s in the column.
#'  \subsection{"ignore"}{
#'  Removes the \code{NA}s before finding the differing values, ensuring
#'  that the first value after an \code{NA} will be correctly identified as new,
#'  if it differs from the value before the \code{NA}(s).
#'  }
#'  \subsection{"as_element"}{
#'  Treats all \code{NA}s as the string \code{"NA"}.
#'  This means that \code{`threshold`} must be \code{NULL} when using this method.
#'  }
#'  \subsection{Numeric scalar}{
#'  A numeric value to replace \code{NA}s with.
#'  }
#' @param factor_conversion_warning Whether to throw a warning when converting a \code{factor} to a \code{character}. (Logical)
#' @return \code{vector} with either the differing values or the indices of the differing values.
#'  \strong{N.B.} If \code{`data`} is a \emph{grouped} \code{data.frame},
#'  the output is a \code{list} of \code{vector}s
#'  with the differing values. The names are based on the group indices
#'  (see \code{\link[dplyr:group_data]{dplyr::group_indices()}}).
#' @aliases not_previous
#' @family l_starts tools
#' @examples
#' # Attach packages
#' library(groupdata2)
#' # Create a data frame
#' df <- data.frame(
#'   "a" = factor(c("a", "a", "b", "b", "c", "c")),
#'   "n" = c(1, 3, 6, 2, 2, 4)
#' )
#' # Get differing values in column 'a' with no threshold.
#' # This simply checks whether each value differs from the previous value.
#' differs_from_previous(df, col = "a")
#' # Get indices of differing values in column 'a' with no threshold.
#' differs_from_previous(df, col = "a", return_index = TRUE)
#' # Get values that are 2 or more greater than the previous value
#' differs_from_previous(df, col = "n", threshold = 2, direction = "positive")
#' # Get values that are 4 or more less than the previous value
#' differs_from_previous(df, col = "n", threshold = 4, direction = "negative")
#' # Get values that are either 2 or more greater than the previous value
#' # or 4 or more less than the previous value
#' differs_from_previous(df, col = "n", threshold = c(-4, 2), direction = "both")
differs_from_previous <- function(data,
                                  col = NULL,
                                  threshold = NULL,
                                  direction = "both",
                                  return_index = FALSE,
                                  include_first = FALSE,
                                  handle_na = "ignore",
                                  factor_conversion_warning = TRUE) {
  # Public entry point for both vectors and data frames.
  # Validates the arguments once, swaps `data` for the version returned by
  # the checker, and then passes everything on with
  # run_differs_from_previous_ as `.fn`.
  # NOTE(review): this copy of the function is incomplete - several lines
  # (closing parentheses/braces and at least one call name) are not visible.

  # Check inputs
  # The checker returns a list; its "data" element is used from here on.
  checks <- check_differs_from_previous_once(
    data = data,
    col = col,
    threshold = threshold,
    direction = direction,
    return_index = return_index,
    include_first = include_first,
    handle_na = handle_na,
    factor_conversion_warning = factor_conversion_warning
  # NOTE(review): the closing `)` of the call above is missing here.

  data <- checks[["data"]]

  # Grouped data frames get special handling (the original comment calls it
  # "Apply by group (recursion)").
  # NOTE(review): the body of this `if` and its closing brace are not
  # visible, so what happens for grouped input cannot be confirmed here.
  if (dplyr::is_grouped_df(data)) {

    # NOTE(review): the arguments below belong to a call whose function name
    # is missing. It receives the data, the per-group worker as `.fn`, and
    # all remaining arguments unchanged.
    data = data,
    .fn = run_differs_from_previous_,
    col = col,
    threshold = threshold,
    direction = direction,
    return_index = return_index,
    include_first = include_first,
    handle_na = handle_na,
    factor_conversion_warning = factor_conversion_warning


# Worker that is passed as `.fn` by differs_from_previous().
# Re-checks `data`, extracts the vector to inspect (column `col` of a data
# frame, or `data` itself otherwise) and forwards it with the remaining
# settings to a call whose name is not visible here.
# NOTE(review): the signature below shows only `data` and
# `factor_conversion_warning`, but the body also uses `col`, `threshold`,
# `direction`, `return_index`, `include_first` and `handle_na`. The
# parameters in between appear to be missing from this copy.
run_differs_from_previous_ <- function(data,
                                       factor_conversion_warning) {

  # Checks that must hold on every invocation
  check_differs_from_previous_always(data = data)

  # Pick the vector to work on
  if (is.data.frame(data)) {
    v <- data[[col]]
  } else {
    v <- data
  # NOTE(review): the closing brace of the `else` branch is missing here.

  # Create and return start values or indices of values that differ from the previous value
  # NOTE(review): the head of the call (function name and first argument) is
  # missing; presumably find_different_from_previous_vec_(v, ...) - confirm
  # against the package source.

    threshold = threshold,
    direction = direction,
    return_index = return_index,
    include_first = include_first,
    handle_na = handle_na


# One-time argument validation for differs_from_previous().
# Collects problems in a checkmate assert collection, converts factors to
# character (optionally with a warning) and ends with
# list("data" = data), which the caller reads as checks[["data"]].
# NOTE(review): this copy is incomplete. The signature shows only `data` and
# `factor_conversion_warning` although the body uses `col`, `threshold`,
# `direction`, `return_index`, `include_first` and `handle_na`. Many closing
# braces/parentheses and the heads of several assertion calls are missing,
# so only the visible fragments are described below.
check_differs_from_previous_once <- function(data,
                                             factor_conversion_warning) {
  # Check arguments ####
  assert_collection <- checkmate::makeAssertCollection()
  # `data` must be neither NULL nor a single NA
  if (is.null(data)) {
    assert_collection$push("'data' cannot be 'NULL'")
  if (!is.data.frame(data) && length(data) == 1 && is.na(data)) {
    assert_collection$push("'data' cannot be 'NA'.")
  # The three logical switches must each be a single TRUE/FALSE
  checkmate::assert_flag(x = return_index, add = assert_collection)
  checkmate::assert_flag(x = include_first, add = assert_collection)
  checkmate::assert_flag(x = factor_conversion_warning, add = assert_collection)
  # NOTE(review): head of the assertion on `threshold` is missing. The
  # visible arguments allow NULL or a length of 1 to 2.
    x = threshold,
    min.len = 1,
    max.len = 2,
    null.ok = TRUE,
    add = assert_collection
  checkmate::assert_string(x = col, null.ok = TRUE, add = assert_collection)
  checkmate::assert_string(x = direction, add = assert_collection)
  # NOTE(review): head of the combined check on `handle_na` is missing. The
  # visible alternatives are a string or a finite number.
    checkmate::check_string(x = handle_na),
    checkmate::check_number(x = handle_na, finite = TRUE),
    .var.name = "handle_na"
  # A length-2 threshold must be c(negative, positive)
  if (length(threshold) == 2) {
    if (threshold[[1]] >= 0) {
      assert_collection$push("when 'threshold' has length 2, 'threshold[[1]]' must be a negative number.")
    if (threshold[[2]] <= 0) {
      assert_collection$push("'threshold[[2]]' must be a positive number.")
  # NOTE(review): fragments of a combined check on `data` follow. The heads
  # of the individual checks are missing; the visible arguments require at
  # least one column and row, or a minimum length of 1, or a non-empty factor.
      x = data,
      min.cols = 1,
      min.rows = 1
      x = data,
      min.len = 1,
      strict = TRUE
    checkmate::check_factor(x = data, min.len = 1),
    .var.name = "data"
  # NOTE(review): head of the assertion restricting `direction` to the three
  # allowed values is missing.
    x = direction,
    subset.of = c("both", "positive", "negative"),
    add = assert_collection
  # A string `handle_na` must be one of the two named methods
  if (checkmate::test_string(x = handle_na)) {
      x = handle_na,
      subset.of = c("ignore", "as_element"),
      add = assert_collection


  # If data is a data frame
  if (is.data.frame(data)) {
    # `col` is required and must name an existing column
    if (is.null(col)) {
      # If not, raise error
      assert_collection$push("'col' must be specified when 'data' is data.frame.")
    # `%ni%` is defined elsewhere; presumably "not in" - confirm
    if (col %ni% colnames(data)) {
      assert_collection$push("'col' was not found in 'data'.")

    # If col is a factor
    if (is.factor(data[[col]])) {
      # A factor column cannot be combined with a threshold
      # NOTE(review): the call that receives the message below is missing.
      if (!is.null(threshold)) {
          "'col' is factor. 'threshold' must be 'NULL'. Alternatively, convert factor to numeric vector."
      if (isTRUE(factor_conversion_warning)) {
        warning("'col' is factor. Using as character.")
      # Convert col to character
      data[[col]] <- as.character(data[[col]])
  } else {
    # `col` has no meaning for vector input
    if (!is.null(col)){
      warning("'col' is ignored when 'data' is not a data.frame.")
    # If data is a factor
    if (is.factor(data)) {
      if (isTRUE(factor_conversion_warning)) {
        warning("'data' is factor. Using as character.")
      # Convert data to character
      data <- as.character(data)
  # End of argument checks ####

  # Hand the (possibly converted) data back to the caller
  list("data" = data)


# Validation of `data` that run_differs_from_previous_() performs on every
# call.
# NOTE(review): only argument fragments of the assertion are visible in this
# copy; the call heads, the reporting of the collection and the closing
# brace are missing. The fragments match the `data` check in
# check_differs_from_previous_once(): at least one column and row, or a
# minimum length of 1, or a non-empty factor.
check_differs_from_previous_always <- function(data) {
  # Check arguments ####
  assert_collection <- checkmate::makeAssertCollection()

      x = data,
      min.cols = 1,
      min.rows = 1
      x = data,
      min.len = 1,
      strict = TRUE
    checkmate::check_factor(x = data, min.len = 1),
    .var.name = "data"

  # End of argument checks ####

# Core vector routine: flags the elements of `v` that differ from the
# preceding element, optionally by at least a threshold, and collects the
# indices of the flagged elements.
# NOTE(review): this copy is incomplete. Many closing braces/parentheses and
# the final return expressions are missing, so the comments describe only
# the visible statements.
find_different_from_previous_vec_ <- function(v,
                                              threshold = NULL,
                                              direction = "both",
                                              return_index = FALSE,
                                              handle_na = "ignore",
                                              include_first = FALSE) {
  # Find values or index at which
  # value changes in vector
  # E.g. vector c(1,1,2,2,2,3,3) would return
  # values c(1,2,3) or indices c(1,3,6)
  # (the first element is only flagged when include_first is TRUE, because
  # it is compared against itself below)
  # Uses a kind of rolling windows to determine
  # if a value is the same as the previous or new

  # Threshold can be a numeric scalar, a numeric vector of length 2, or NULL.
  # Threshold is inclusive. I.e. if threshold is 2 and the difference is 2, it's a match.
  # If threshold is NULL
  # .. TRUE if value is different than previous value.
  # .. .. Works for both numeric and character vectors.
  # If threshold is numeric scalar
  # .. Depending on direction, the difference between the value
  # .. and the previous value is checked against the threshold.
  # If threshold is a numeric vector of length 2
  # .. The first element is the negative threshold (and must be a negative number).
  # .. .. Returns TRUE if the difference from the previous value is negative and below or equal to this threshold.
  # .. The second element is the positive threshold (and must be a positive number).
  # .. .. Returns TRUE if the difference from the previous value is positive and above or equal to this threshold.

  # Direction is used when threshold is not NULL.
  # If direction is 'both'
  # .. Check whether the difference to the previous value is
  # .. .. greater than or equal to the positive threshold
  # .. .. less than or equal to the negative threshold
  # If direction is 'positive'
  # .. Check whether the difference to the previous value is
  # .. .. greater than or equal to the positive threshold
  # If direction is 'negative'
  # .. Check whether the difference to the previous value is
  # .. .. less than or equal to the negative threshold

  # Keep a copy of the input before NAs are removed or replaced
  v_orig <- v

  contains_na <- anyNA(v)
  # `handle_na` must be a single value
  if (length(handle_na) > 1) {
    stop("'handle_na' had length > 1.")
  # Treating NA as the string "NA" cannot be combined with a numeric threshold
  if (handle_na == "as_element" &&
      !is.null(threshold)) {
    stop("when 'handle_na' is 'as_element', 'threshold' must be NULL.")

  ## Handle NAs
  if (isTRUE(contains_na)) {
    if (handle_na == "ignore") {
      # Drop the NAs but remember the original positions of the kept
      # elements, so indices can be mapped back further down
      not_na_indices <- which(!is.na(v))
      v <- v[!is.na(v)]
    } else if (handle_na == "as_element") {
      v[is.na(v)] <- "NA"
    } else if (is.numeric(handle_na)) {
      # Replace NAs with the given number; the copy is refreshed as well
      v[is.na(v)] <- handle_na
      v_orig <- v
    } else {
      # NOTE(review): the call that receives this message is missing. The
      # message names a method 'convert', while the branches above test for
      # "as_element" - the wording looks outdated.
        "'handle_na' must be either a method ('ignore' or 'convert') or a value to replace NAs with."

  # Adds vector to data frame
  # Creates a new column shifted down one row.
  # Checks if the current value is the same as the previous.

  # Shift / offset v one row down
  # Insert v[1] at beginning and remove last element of v
  # to get same length as v
  # (seq_along(v) - 1 starts at index 0, which selects nothing in R, so the
  # second part is v without its last element)
  v2 <- c(v[1], v[seq_along(v) - 1])

  # Create data frame with v, v2 and
  # a logical column stating whether
  # v is new or not.
  if (!is.null(threshold)) {
    if (!is.numeric(threshold)) {
      stop("'threshold' must be numeric scalar, a numeric vector of length 2, or NULL.")
    if (length(threshold) == 2) {
      # Two thresholds: c(negative, positive), only valid with 'both'
      if (threshold[1] >= 0) {
        stop("When 'threshold' is a vector of length 2, the first element must be negative.")
      if (threshold[2] <= 0) {
        stop("When 'threshold' is a vector of length 2, the second element must be positive.")
      if (direction != "both") {
        stop("When 'threshold' is a vector of length 2, 'direction' must be 'both'.")

      # Split into the negative bound and the positive bound; from here on
      # `threshold` holds the positive bound only
      neg_threshold <- threshold[1]
      threshold <- threshold[2]
    } else if (length(threshold) == 1) {
      if (threshold <= 0) {
        stop("When 'threshold' is a scalar it must be a positive number.")

      # A scalar threshold is mirrored to get the negative bound
      neg_threshold <- -threshold
    } else {
      stop("'threshold' must be numeric scalar, a numeric vector of length 2, or NULL.")

    if (direction == "both") {
      # Flag differences that are NOT between the two bounds.
      # is_between_ is defined elsewhere; presumably an exclusive range
      # test, which would make both thresholds inclusive - confirm.
      # NOTE(review): the closing `)` of this data.frame() call is missing.
      df <- data.frame(
        new = !is_between_(v - v2, neg_threshold, threshold),
        stringsAsFactors = FALSE
    } else if (direction == "positive") {
      # Flag increases of at least `threshold`
      df <- data.frame(v,
                       new = v - v2 >= threshold,
                       stringsAsFactors = FALSE)
    } else if (direction == "negative") {
      # Flag decreases of at least the magnitude of `neg_threshold`
      df <- data.frame(v,
                       new = v - v2 <= neg_threshold,
                       stringsAsFactors = FALSE)
    } else {
      stop("'direction' must be one of 'both', 'negative', and 'positive'.")
  } else {
    # No threshold: any inequality with the previous value counts
    df <- data.frame(v, v2,
                     new = v != v2,
                     stringsAsFactors = FALSE)

  if (isTRUE(include_first)) {
    # Set first value to TRUE
    df[["new"]][1] <- TRUE

  # Add back NA rows
  if (isTRUE(contains_na) && handle_na == "ignore") {
    # Map the flags back to positions in the original (NA-containing) vector
    df[["orig_indices"]] <- not_na_indices
    # Get indices where v contains a new value
    new_indices <- df[["orig_indices"]][df[["new"]]]
  } else {
    # Get indices where v contains a new value
    new_indices <- which(df[["new"]])

  # If return_index is TRUE
  # NOTE(review): both return expressions are missing from this copy;
  # presumably `new_indices` in the first branch and the values of `v_orig`
  # at those indices in the second - confirm against the package source.
  if (isTRUE(return_index)) {
    # Return only the indices where
    # v contains a new value
  } else {
    # Return values at the indices

Try the groupdata2 package in your browser

Any scripts or data that you put into this service are public.

groupdata2 documentation built on July 9, 2023, 6:46 p.m.