#' Compare data frames columns before merging
#'
#' Generate a comparison of data.frames (or similar objects) that indicates if
#' they will successfully bind together by rows.
#'
#' @details Due to the returned "column_name" column, no input data.frame may be
#' named "column_name".
#'
#' The `strict_description` argument is most typically used to understand
#' if factor levels match or are bindable. Factors are typically bindable,
#' but the behavior of what happens when they bind differs based on the
#' binding method ("bind_rows" or "rbind"). Even when
#' `strict_description` is `FALSE`, data.frames may still bind
#' because some classes (like factors and characters) can bind even if they
#' appear to differ.
#'
#' @param ... A combination of data.frames, tibbles, and lists of
#' data.frames/tibbles. The values may optionally be named arguments; if
#' named, the output column will be the name; if not named, the output column
#' will be the data.frame name (see examples section).
#' @param return Should a summary of "all" columns be returned, only return
#' "match"ing columns, or only "mismatch"ing columns?
#' @param bind_method What method of binding should be used to determine
#' matches? With "bind_rows", columns missing from a data.frame would be
#' considered a match (as in `dplyr::bind_rows()`; with "rbind", columns
#' missing from a data.frame would be considered a mismatch (as in
#' `base::rbind()`.
#' @param strict_description Passed to `describe_class`. Also, see the
#' Details section.
#' @return A data.frame with a column named "column_name" with a value named
#' after the input data.frames' column names, and then one column per
#' data.frame (named after the input data.frame). If more than one input has
#' the same column name, the column naming will have suffixes defined by
#' sequential use of `base::merge()` and may differ from expected naming.
#' The rows within the data.frame-named columns are descriptions of the
#' classes of the data within the columns (generated by
#' `describe_class`).
#' @examples
#' compare_df_cols(data.frame(A = 1), data.frame(B = 2))
#' # user-defined names
#' compare_df_cols(dfA = data.frame(A = 1), dfB = data.frame(B = 2))
#' # a combination of list and data.frame input
#' compare_df_cols(listA = list(dfA = data.frame(A = 1), dfB = data.frame(B = 2)), data.frame(A = 3))
#' @family data frame type comparison
#' @export
compare_df_cols <- function(..., return = c("all", "match", "mismatch"), bind_method = c("bind_rows", "rbind"), strict_description = FALSE) {
# Input checking
return <- match.arg(return)
bind_method <- match.arg(bind_method)
args <- list(...)
mask_input_data_frame <- sapply(X = args, FUN = is.data.frame)
mask_input_list <- sapply(X = args, FUN = is.list) & !mask_input_data_frame
mask_input_other <- !(mask_input_data_frame | mask_input_list)
if (any(mask_input_other)) {
stop(
"Input given with `...` must be either a data.frame or a list of data.frames. Argument ",
# the `collapse` argument is required for msg1 to prevent an ngettext
# error; the input must be scalar.
ngettext(
sum(mask_input_other),
msg1 = paste("number", which(mask_input_other), "is not.", collapse = "\n"),
msg2 = paste("numbers", paste(which(mask_input_other), collapse = ", "), "are not.")
)
)
}
bad_list_inputs <- numeric(0)
for (idx in which(mask_input_list)) {
bad_list_inputs <-
c(
bad_list_inputs,
if (!all(sapply(X = args[[idx]], FUN = is.data.frame))) {
idx
} else {
numeric(0)
}
)
}
if (length(bad_list_inputs)) {
stop(
"List inputs must be lists of data.frames. List input ",
if (length(bad_list_inputs) == 1) {
paste("number", bad_list_inputs, "is not a list of data.frames.")
} else if (length(bad_list_inputs) < 6) {
paste("numbers", paste(bad_list_inputs, collapse = ", "), "are not lists of data.frames.")
} else {
paste("numbers", paste(c(bad_list_inputs[1:5], "..."), collapse = ", "), "are not lists of data.frames.")
}
)
}
# Generate and check column names
direct_names <- names(args)
indirect_names <- as.character(match.call(expand.dots = TRUE))
indirect_names <- indirect_names[!(indirect_names %in% as.character(match.call(expand.dots = FALSE)))]
if (is.null(direct_names)) {
final_names <- indirect_names
} else {
final_names <- direct_names
mask_replace <- final_names %in% ""
final_names[mask_replace] <- indirect_names[mask_replace]
}
final_names <- as.list(final_names)
for (idx in which(mask_input_list)) {
current_list_names <- names(args[[idx]])
final_names[[idx]] <-
if (is.null(current_list_names)) {
paste(final_names[[idx]], seq_along(args[[idx]]), sep = "_")
} else if (any(mask_unnamed_list <- current_list_names %in% "")) {
current_list_names[mask_unnamed_list] <-
paste(
final_names[[idx]][mask_unnamed_list],
seq_len(sum(mask_unnamed_list)),
sep = "_"
)
current_list_names
} else {
current_list_names
}
}
if (any(unlist(final_names) %in% "column_name")) {
stop("None of the input ... argument names or list names may be `column_name`.")
}
ret <- compare_df_cols_df_maker(args, class_colname = final_names, strict_description = strict_description)
if (return == "all" | ncol(ret) == 2) {
if (return != "all") {
warning("Only one data.frame provided, so all its classes are provided.")
}
rownames(ret) <- NULL
ret
} else {
# Choose which way to test if the rows are bindable (NA matches or not).
bind_method_fun <-
list(
rbind = function(idx) {
all(unlist(ret[idx, 3:ncol(ret)]) %in% ret[idx, 2])
},
bind_rows = function(idx) {
all(
unlist(ret[idx, 3:ncol(ret)]) %in%
c(
NA_character_,
na.omit(unlist(ret[idx, 2:ncol(ret)]))[1]
)
)
}
)
mask_match <-
sapply(
X = seq_len(nrow(ret)),
FUN = bind_method_fun[[bind_method]]
)
ret <-
if (return == "match") {
ret[mask_match, ]
} else if (return == "mismatch") {
ret[!mask_match, ]
}
rownames(ret) <- NULL
ret
}
}
#' This is the workhorse for making a data.frame description used by
#' compare_df_cols
#' @param x The data.frame or list of data.frames
#' @param class_colname The name for the column-name-defining column
#' @param strict_description Passed to `describe_class`
#' @return A 2-column data.frame with the first column naming all the columns of
#' `x` and the second column (named after the value in
#' `class_colname`) defining the classes using
#' `describe_class()`.
#' @noRd
compare_df_cols_df_maker <- function(x, class_colname = "class", strict_description) {
UseMethod("compare_df_cols_df_maker")
}
#' @exportS3Method NULL
compare_df_cols_df_maker.data.frame <- function(x, class_colname = "class", strict_description) {
if (class_colname == "column_name") {
stop('`class_colname` cannot be "column_name"')
}
if (ncol(x) == 0) {
warning(class_colname, " has zero columns and will not appear in output.")
ret <- data.frame(column_name = character(0), stringsAsFactors = FALSE)
} else {
ret <-
data.frame(
column_name = names(x),
X = sapply(X = x, FUN = describe_class, strict_description = strict_description),
stringsAsFactors = FALSE
)
names(ret)[2] <- class_colname
}
ret
}
#' @exportS3Method NULL
compare_df_cols_df_maker.list <- function(x, class_colname = "class", strict_description = strict_description) {
if (length(class_colname) != length(x)) {
stop("`x` and `class_colname` must be the same length.")
} else if (any(class_colname == "column_name")) {
stop('`class_colname` cannot be "column_name"')
}
ret <-
lapply(
X = seq_along(x),
FUN = function(idx) {
compare_df_cols_df_maker(
x = x[[idx]],
class_colname = class_colname[[idx]],
strict_description = strict_description
)
}
)
Reduce(f = function(x, y) {
merge(x, y, by = "column_name", all = TRUE)
}, x = ret)
}
#' Do the the data.frames have the same columns & types?
#'
#' Check whether a set of data.frames are row-bindable. Calls `compare_df_cols()`
#' and returns `TRUE` if there are no mis-matching rows.
#'
#' @inheritParams compare_df_cols
#' @param verbose Print the mismatching columns if binding will fail.
#' @return `TRUE` if row binding will succeed or `FALSE` if it will fail.
#' @family data frame type comparison
#' @examples
#' compare_df_cols_same(data.frame(A = 1), data.frame(A = 2))
#' compare_df_cols_same(data.frame(A = 1), data.frame(B = 2))
#' compare_df_cols_same(data.frame(A = 1), data.frame(B = 2), verbose = FALSE)
#' compare_df_cols_same(data.frame(A = 1), data.frame(B = 2), bind_method = "rbind")
#' @export
compare_df_cols_same <- function(..., bind_method = c("bind_rows", "rbind"), verbose = TRUE) {
bind_method <- match.arg(bind_method)
ret <- compare_df_cols(..., return = "mismatch", bind_method = bind_method)
if (nrow(ret) & verbose) {
print(ret)
}
nrow(ret) == 0
}
#' Describe the class(es) of an object
#'
#' @details For package developers, an S3 generic method can be written for
#' `describe_class()` for custom classes that may need more definition
#' than the default method. This function is called by [compare_df_cols()].
#'
#' @param x The object to describe
#' @param strict_description Should differing factor levels be treated
#' as differences for the purposes of identifying mismatches?
#' `strict_description = TRUE` is stricter and factors with different
#' levels will be treated as different classes. `FALSE` is more
#' lenient: for class comparison purposes, the variable is just a "factor".
#' @return A character scalar describing the class(es) of an object where if the
#' scalar will match, columns in a data.frame (or similar object) should bind
#' together without issue.
#' @family data frame type comparison
#' @examples
#' describe_class(1)
#' describe_class(factor("A"))
#' describe_class(ordered(c("A", "B")))
#' describe_class(ordered(c("A", "B")), strict_description = FALSE)
#' @export
describe_class <- function(x, strict_description = TRUE) {
UseMethod("describe_class")
}
#' @describeIn describe_class Describe factors with their levels
#' and if they are ordered.
#' @export
describe_class.factor <- function(x, strict_description = TRUE) {
if (strict_description) {
all_classes <- class(x)
all_levels <- levels(x)
level_text <- sprintf("levels=c(%s)", paste('"', levels(x), '"', sep = "", collapse = ", "))
factor_text <- sprintf("factor(%s)", level_text)
mask_factor <- class(x) == "factor"
all_classes[mask_factor] <- factor_text
paste(all_classes, collapse = ", ")
} else {
all_classes <- setdiff(class(x), "ordered")
paste(all_classes, collapse = ", ")
}
}
#' @describeIn describe_class List all classes of an object.
#' @export
describe_class.default <- function(x, strict_description = TRUE) {
all_classes <- class(x)
paste(all_classes, collapse = ", ")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.