#' Remove groups of rows within a data.frame that are proper subsets of other
#' defined groups, to limit group duplication
#'
#' @param x Focal data.frame containing one or more defined groups
#' @param target Optional target data.frame containing one or more defined
#' groups. Defaults to \code{NULL}, in which case the target is \code{x}
#' itself.
#' @param group Name of the column within \code{x} (and \code{target}, if
#' applicable) defining groups
#' @param by Character vector giving the column(s) by which \code{x} and
#' \code{target} should be matched and compared.
#' @param invert Logical indicating whether to invert the results, and thus
#' retain groups that are subsets rather than removing them.
#'
#' @return
#' The focal data.frame \code{x}, excluding groups representing subsets of other
#' groups.
#'
#' @examples
#' x <- data.frame(
#' x1 = c("a", "a", "a", "b", "b", "a", "a", "b", "c", "c", "b"),
#' x2 = c("a", "a", "a", "b", "b", "a", "a", "b", "c", "c", "b"),
#' g = c("1", "1", "1", "2", "2", "3", "3", "4", "4", "5", "5"),
#' stringsAsFactors = FALSE
#' )
#'
#' remove_group_subsets(x, group = "g", by = c("x1", "x2"))
#'
#' @importFrom dplyr bind_rows
#' @export remove_group_subsets
remove_group_subsets <- function(x, target = NULL, group, by, invert = FALSE) {
if (!(group %in% names(x))) {
stop("Argument 'group' must be a name of a column within 'x'")
}
if (!is.null(target) && !(group %in% names(target))) {
stop("Argument 'group' must be a name of a column within 'target'")
}
if (!all(by %in% names(x))) {
stop("Argument 'x' must contain all columns specified in 'by'")
}
if (!is.null(target) && !all(by %in% names(x))) {
stop("Argument 'target' must contain all columns specified in 'by'")
}
if (nrow(x) > 0) {
# split x by group and arrange by nrow
x_split <- split(x, x[[group]])
x_split_n <- vapply(x_split, nrow, 0)
x_split <- x_split[order(x_split_n)]
if (is.null(target)) {
# if no target provided, 'x' is itself the target
if (length(x_split) == 1L) {
# if single group in 'x', not a subset of other groups in 'x'
is_subset <- FALSE
} else {
# else, if multiple groups in 'x'
is_subset <- vapply(1:(length(x_split)-1), function(ii) {
is_group_subset(x = x_split[[ii]],
target = bind_rows(x_split[(ii+1):length(x_split)]),
group = group,
by = by)
}, FALSE)
# add FALSE b/c last element never subset
is_subset <- c(is_subset, FALSE)
}
} else {
# else, a 'target' is provided
is_subset <- vapply(seq_along(x_split), function(ii) {
is_group_subset(x = x_split[[ii]],
target = target,
group = group,
by = by)
}, FALSE)
}
rows <- if (invert) is_subset else !is_subset
out <- dplyr::bind_rows(x_split[rows])
} else {
out <- x
}
# hack to ensure retention of original cols if nrow(out) == 0
if (nrow(out) == 0) {
out <- x[0, , drop = FALSE]
}
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.