R/remove_group_subsets.R

Defines functions remove_group_subsets

Documented in remove_group_subsets

#' Remove groups of rows within a data.frame that are proper subsets of other
#' defined groups, to limit group duplication
#'
#' @param x Focal data.frame containing one or more defined groups
#' @param target Optional target data.frame containing one or more defined
#'   groups. Defaults to \code{NULL}, in which case the target is \code{x}
#'   itself.
#' @param group Name of the column within \code{x} (and \code{target}, if
#'   applicable) defining groups
#' @param by Character vector giving the column(s) by which \code{x} and
#'   \code{target} should be matched and compared.
#' @param invert Logical indicating whether to invert the results, and thus
#'   retain groups that are subsets rather than removing them.
#'
#' @return
#' The focal data.frame \code{x}, excluding groups representing subsets of other
#' groups.
#'
#' @examples
#' x <- data.frame(
#'   x1 = c("a", "a", "a",  "b", "b",  "a", "a",  "b", "c",  "c", "b"),
#'   x2 = c("a", "a", "a",  "b", "b",  "a", "a",  "b", "c",  "c", "b"),
#'   g =  c("1", "1", "1",  "2", "2",  "3", "3",  "4", "4",  "5", "5"),
#'   stringsAsFactors = FALSE
#' )
#'
#' remove_group_subsets(x, group = "g", by = c("x1", "x2"))
#'
#' @importFrom dplyr bind_rows
#' @export remove_group_subsets
remove_group_subsets <- function(x, target = NULL, group, by, invert = FALSE) {

  if (!(group %in% names(x))) {
    stop("Argument 'group' must be a name of a column within 'x'")
  }
  if (!is.null(target) && !(group %in% names(target))) {
    stop("Argument 'group' must be a name of a column within 'target'")
  }
  if (!all(by %in% names(x))) {
    stop("Argument 'x' must contain all columns specified in 'by'")
  }
  if (!is.null(target) && !all(by %in% names(x))) {
    stop("Argument 'target' must contain all columns specified in 'by'")
  }

  if (nrow(x) > 0) {
    # split x by group and arrange by nrow
    x_split <- split(x, x[[group]])
    x_split_n <- vapply(x_split, nrow, 0)
    x_split <- x_split[order(x_split_n)]

    if (is.null(target)) {
      # if no target provided, 'x' is itself the target

      if (length(x_split) == 1L) {
        # if single group in 'x', not a subset of other groups in 'x'
        is_subset <- FALSE
      } else {
        # else, if multiple groups in 'x'
        is_subset <- vapply(1:(length(x_split)-1), function(ii) {
          is_group_subset(x = x_split[[ii]],
                          target = bind_rows(x_split[(ii+1):length(x_split)]),
                          group = group,
                          by = by)
        }, FALSE)

        # add FALSE b/c last element never subset
        is_subset <- c(is_subset, FALSE)
      }

    } else {
      # else, a 'target' is provided
      is_subset <- vapply(seq_along(x_split), function(ii) {
        is_group_subset(x = x_split[[ii]],
                        target = target,
                        group = group,
                        by = by)
      }, FALSE)
    }

    rows <- if (invert) is_subset else !is_subset
    out <- dplyr::bind_rows(x_split[rows])
  } else {
    out <- x
  }

  # hack to ensure retention of original cols if nrow(out) == 0
  if (nrow(out) == 0) {
    out <- x[0, , drop = FALSE]
  }

  return(out)
}
epicentre-msf/llutils documentation built on Nov. 9, 2020, 8:24 p.m.