R/is_group_subset.R

Defines functions .unique_value .is_group_subset is_group_subset

Documented in is_group_subset

#' Test whether a data.frame is a proper subset of any defined groups within a
#' target data.frame.
#'
#' @param x Focal data.frame, reflecting a single 'group' of interest
#' @param target Target data.frame, containing one or more defined groups
#' @param group Name of the column within \code{target} defining groups
#' @param by Character vector giving the column(s) by which \code{x} and
#'   \code{target} should be matched and compared.
#'
#' @return
#' Logical indicating whether \code{x} is a proper subset of any groups within
#' \code{target}
#'
#' @examples
#' target <- data.frame(
#'   g =  c("1", "1", "1",  "2", "2",  "3", "3",  "4", "4",  "5", "5"),
#'   x1 = c("a", "a", "a",  "b", "b",  "a", "a",  "b", "c",  "a", "b"),
#'   x2 = c("a", "a", "a",  "b", "b",  "a", "a",  "b", "c",  "a", "b"),
#'   stringsAsFactors = FALSE
#' )
#'
#' x1 <- data.frame(
#'   g =  c("6", "6"),
#'   x1 = c("a", "a"),
#'   x2 = c("a", "a"),
#'   stringsAsFactors = FALSE
#' )
#'
#' is_group_subset(x1, target, group = "g", by = c("x1", "x2"))
#'
#' x2 <- data.frame(
#'   g =  c("7", "7"),
#'   x1 = c("a", "d"),
#'   x2 = c("a", "d"),
#'   stringsAsFactors = FALSE
#' )
#'
#' is_group_subset(x2, target, group = "g", by = c("x1", "x2"))
#'
#' @importFrom dplyr semi_join left_join
#' @export is_group_subset
is_group_subset <- function(x, target, by, group) {

  if (!(group %in% names(target))) {
    stop("Argument 'group' must be a name of a column within 'target'")
  }
  if (!all(by %in% names(x)) | !all(by %in% names(target))) {
    stop("Both 'x' and 'target' must contain all columns specified in 'by'")
  }

  # check whether relevant rows of x[,by] and target[,by] are duplicated
  # 'relevant' means has matching row in x/target
  has_dups_x <- any(duplicated(semi_join(x[,by], target, by = by)))
  has_dups_target <- any(duplicated(semi_join(target[,by], x, by = by)))

  if (!has_dups_x & !has_dups_target) {
    # if relevant rows of x and target all unique
    x_join <- left_join(x[,by], target, by = by)
    out <- .unique_value(x_join[[group]])

  } else {
    # else, join will result in duplicates so must use alternate method
    target_split <- split(target, target[[group]])
    out <- any(vapply(target_split, .is_group_subset, x = x, by = by, FALSE))
  }

  out
}


#' @noRd
.is_group_subset <- function(x, y, by) {
  # utility to test whether 'x' is a subset of a single group within 'target'

  if (nrow(x) > nrow(y)) {
    # if x has more rows than y, then x is not a subset of y
    is_subset <- FALSE
  } else {
    # else..

    y$row <- 1:nrow(y)   # add row id to y
    i <- 1L              # initiate counter for focal row of x
    m <- 0L              # initiate tracker for matching row from y

    # loop through rows of x, looking for matching rows in y
    while (!is.na(m) & i <= nrow(x)) {
      m <- left_join(x[i,], y, by = by)$row[1]  # find first matching row
      if (!is.na(m)) y <- y[y$row != m,]        # remove matching row from y
      i <- i + 1L                               # increment i
    }
    is_subset <- ifelse(!is.na(m), TRUE, FALSE)
  }
  is_subset
}


#' @noRd
.unique_value <- function(x) {
  !any(is.na(x)) & length(unique(x)) == 1L
}
epicentre-msf/llutils documentation built on Nov. 9, 2020, 8:24 p.m.