R/compare_df_cols.R

Defines functions describe_class.default describe_class.factor describe_class compare_df_cols_same compare_df_cols_df_maker.list compare_df_cols_df_maker.data.frame compare_df_cols_df_maker compare_df_cols

Documented in compare_df_cols compare_df_cols_same describe_class describe_class.default describe_class.factor

#' Generate a comparison of data.frames (or similar objects) that indicates if
#' they will successfully bind together by rows.
#'
#' @details Due to the returned "column_name" column, no input data.frame may be
#'   named "column_name".
#'
#'   The \code{strict_description} argument is most typically used to understand
#'   if factor levels match or are bindable.  Factors are typically bindable,
#'   but the behavior of what happens when they bind differs based on the
#'   binding method ("bind_rows" or "rbind").  Even when
#'   \code{strict_description} is \code{FALSE}, data.frames may still bind
#'   because some classes (like factors and characters) can bind even if they
#'   appear to differ.
#'
#' @param ... A combination of data.frames, tibbles, and lists of
#'   data.frames/tibbles.  The values may optionally be named arguments; if
#'   named, the output column will be the name; if not named, the output column
#'   will be the data.frame name (see examples section).
#' @param return Should a summary of "all" columns be returned, only return
#'   "match"ing columns, or only "mismatch"ing columns?
#' @param bind_method What method of binding should be used to determine
#'   matches? With "bind_rows", columns missing from a data.frame would be
#'   considered a match (as in \code{dplyr::bind_rows()}; with "rbind", columns
#'   missing from a data.frame would be considered a mismatch (as in
#'   \code{base::rbind()}.
#' @param strict_description Passed to \code{describe_class}.  Also, see the
#'   Details section.
#' @return A data.frame with a column named "column_name" with a value named
#'   after the input data.frames' column names, and then one column per
#'   data.frame (named after the input data.frame).  If more than one input has
#'   the same column name, the column naming will have suffixes defined by
#'   sequential use of \code{base::merge()} and may differ from expected naming.
#'   The rows within the data.frame-named columns are descriptions of the
#'   classes of the data within the columns (generated by
#'   \code{describe_class}).
#' @examples
#' compare_df_cols(data.frame(A=1), data.frame(B=2))
#' # user-defined names
#' compare_df_cols(dfA=data.frame(A=1), dfB=data.frame(B=2))
#' # a combination of list and data.frame input
#' compare_df_cols(listA=list(dfA=data.frame(A=1), dfB=data.frame(B=2)), data.frame(A=3))
#' @family Data frame type comparison
#' @export
compare_df_cols <- function(..., return=c("all", "match", "mismatch"), bind_method=c("bind_rows", "rbind"), strict_description=FALSE) {
  # Input checking
  return <- match.arg(return)
  bind_method <- match.arg(bind_method)
  args <- list(...)
  mask_input_data_frame <- sapply(X=args, FUN=is.data.frame)
  mask_input_list <- sapply(X=args, FUN=is.list) & !mask_input_data_frame
  mask_input_other <- !(mask_input_data_frame | mask_input_list)
  if (any(mask_input_other)) {
    stop(
      "Input given with `...` must be either a data.frame or a list of data.frames. Argument ",
      # the `collapse` argument is required for msg1 to prevent an ngettext
      # error; the input must be scalar.
      ngettext(
        sum(mask_input_other),
        msg1=paste("number", which(mask_input_other), "is not.", collapse="\n"),
        msg2=paste("numbers", paste(which(mask_input_other), collapse=", "), "are not.")
      )
    )
  }
  bad_list_inputs <- numeric(0)
  for (idx in which(mask_input_list)) {
    bad_list_inputs <-
      c(
        bad_list_inputs,
        if (!all(sapply(X=args[[idx]], FUN=is.data.frame))) {
          idx
        } else {
          numeric(0)
        }
      )
  }
  if (length(bad_list_inputs)) {
    stop(
      "List inputs must be lists of data.frames.  List input ",
      if (length(bad_list_inputs) == 1) {
        paste("number", bad_list_inputs, "is not a list of data.frames.")
      } else if (length(bad_list_inputs) < 6) {
        paste("numbers", paste(bad_list_inputs, collapse=", "), "are not lists of data.frames.")
      } else {
        paste("numbers", paste(c(bad_list_inputs[1:5], "..."), collapse=", "), "are not lists of data.frames.")
      }
    )
  }

  # Generate and check column names
  direct_names <- names(args)
  indirect_names <- as.character(match.call(expand.dots=TRUE))
  indirect_names <- indirect_names[!(indirect_names %in% as.character(match.call(expand.dots=FALSE)))]
  if (is.null(direct_names)) {
    final_names <- indirect_names
  } else {
    final_names <- direct_names
    mask_replace <- final_names %in% ""
    final_names[mask_replace] <- indirect_names[mask_replace]
  }
  final_names <- as.list(final_names)
  for (idx in which(mask_input_list)) {
    current_list_names <- names(args[[idx]])
    final_names[[idx]] <-
      if (is.null(current_list_names)) {
        paste(final_names[[idx]], seq_along(args[[idx]]), sep="_")
      } else if (any(mask_unnamed_list <- current_list_names %in% "")) {
        current_list_names[mask_unnamed_list] <-
          paste(
            final_names[[idx]][mask_unnamed_list],
            seq_len(sum(mask_unnamed_list)),
            sep="_"
          )
        current_list_names
      } else {
        current_list_names
      }
  }
  if (any(unlist(final_names) %in% "column_name")) {
    stop("None of the input ... argument names or list names may be `column_name`.")
  }
  ret <- compare_df_cols_df_maker(args, class_colname=final_names, strict_description=strict_description)
  if (return == "all" | ncol(ret) == 2) {
    if (return != "all") {
      warning("Only one data.frame provided, so all its classes are provided.")
    }
    rownames(ret) <- NULL
    ret
  } else {
    # Choose which way to test if the rows are bindable (NA matches or not).
    bind_method_fun <-
      list(
        rbind=function(idx) {
          all(unlist(ret[idx,3:ncol(ret)]) %in% ret[idx,2])
        },
        bind_rows=function(idx) {
          all(
            unlist(ret[idx,3:ncol(ret)]) %in%
              c(NA_character_,
                na.omit(unlist(ret[idx,2:ncol(ret)]))[1])
          )
        }
      )
    mask_match <-
      sapply(
        X=seq_len(nrow(ret)),
        FUN=bind_method_fun[[bind_method]]
      )
    ret <-
      if (return == "match") {
        ret[mask_match,]
      } else if (return == "mismatch") {
        ret[!mask_match,]
      }
    rownames(ret) <- NULL
    ret
  }
}

#' This is the workhorse for making a data.frame description used by
#' compare_df_cols
#' @param x The data.frame or list of data.frames
#' @param class_colname The name for the column-name-defining column
#' @param strict_description Passed to \code{describe_class}
#' @return A 2-column data.frame with the first column naming all the columns of
#'   \code{x} and the second column (named after the value in
#'   \code{class_colname}) defining the classes using
#'   \code{describe_class()}.
#' @noRd
compare_df_cols_df_maker <- function(x, class_colname="class", strict_description)
  UseMethod("compare_df_cols_df_maker")

compare_df_cols_df_maker.data.frame <- function(x, class_colname="class", strict_description) {
  if (class_colname == "column_name") {
    stop('`class_colname` cannot be "column_name"')
  }
  if (ncol(x) == 0) {
    warning(class_colname, " has zero columns and will not appear in output.")
    ret <- data.frame(column_name=character(0), stringsAsFactors=FALSE)
  } else {
    ret <-
      data.frame(
        column_name=names(x),
        X=sapply(X=x, FUN=describe_class, strict_description=strict_description),
        stringsAsFactors=FALSE
      )
    names(ret)[2] <- class_colname
  }
  ret
}

compare_df_cols_df_maker.list <- function(x, class_colname="class", strict_description=strict_description) {
  if (length(class_colname) != length(x)) {
    stop("`x` and `class_colname` must be the same length.")
  } else if (any(class_colname == "column_name")) {
    stop('`class_colname` cannot be "column_name"')
  }
  ret <-
    lapply(
      X=seq_along(x),
      FUN=function(idx) {
        compare_df_cols_df_maker(x=x[[idx]],
                                 class_colname=class_colname[[idx]],
                                 strict_description=strict_description)
      }
    )
  Reduce(f=function(x, y) {merge(x, y, by="column_name", all=TRUE)}, x=ret)
}

#' Do the the data.frames have the same columns & types?
#'
#' @description Check whether a set of data.frames are row-bindable.  Calls
#' \code{compare_df_cols()}and returns TRUE if there are no mis-matching rows.  `
#' @inheritParams compare_df_cols
#' @param verbose Print the mismatching columns if binding will fail.
#' @return \code{TRUE} if row binding will succeed or \code{FALSE} if it will
#'   fail.
#' @family Data frame type comparison
#' @examples
#' compare_df_cols_same(data.frame(A=1), data.frame(A=2))
#' compare_df_cols_same(data.frame(A=1), data.frame(B=2))
#' compare_df_cols_same(data.frame(A=1), data.frame(B=2), verbose=FALSE)
#' compare_df_cols_same(data.frame(A=1), data.frame(B=2), bind_method="rbind")
#' @export
compare_df_cols_same <- function(..., bind_method=c("bind_rows", "rbind"), verbose=TRUE) {
  bind_method <- match.arg(bind_method)
  ret <- compare_df_cols(..., return = "mismatch", bind_method = bind_method)
  if (nrow(ret) & verbose) {
    print(ret)
  }
  nrow(ret) == 0
}

#' Describe the class(es) of an object
#'
#' @details For package developers, an S3 generic method can be written for
#'   \code{describe_class()} for custom classes that may need more definition
#'   than the default method.  This function is called by \code{compare_df_cols}.
#'
#' @param x The object to describe
#' @param strict_description Should differing factor levels be treated 
#' as differences for the purposes of identifying mismatches? 
#' \code{strict_description = `TRUE`} is stricter and factors with different 
#' levels will be treated as different classes.  \code{FALSE} is more 
#' lenient: for class comparison purposes, the variable is just a "factor".
#' @return A character scalar describing the class(es) of an object where if the
#'   scalar will match, columns in a data.frame (or similar object) should bind
#'   together without issue.
#' @family Data frame type comparison
#' @examples
#' describe_class(1)
#' describe_class(factor("A"))
#' describe_class(ordered(c("A", "B")))
#' describe_class(ordered(c("A", "B")), strict_description=FALSE)
#' @export
describe_class <- function(x, strict_description=TRUE) {
  UseMethod("describe_class")
}

#' @describeIn describe_class Describe factors with their levels
#'   and if they are ordered.
#' @export
describe_class.factor <- function(x, strict_description=TRUE) {
  if (strict_description) {
    all_classes <- class(x)
    all_levels <- levels(x)
    level_text <- sprintf("levels=c(%s)", paste('"', levels(x), '"', sep="", collapse=", "))
    factor_text <- sprintf("factor(%s)", level_text)
    mask_factor <- class(x) == "factor"
    all_classes[mask_factor] <- factor_text
    paste(all_classes, collapse=", ")
  } else {
    all_classes <- setdiff(class(x), "ordered")
    paste(all_classes, collapse=", ")
  }
}

#' @describeIn describe_class List all classes of an object.
#' @export
describe_class.default <- function(x, strict_description=TRUE) {
  all_classes <- class(x)
  paste(all_classes, collapse=", ")
}

Try the janitor package in your browser

Any scripts or data that you put into this service are public.

janitor documentation built on Feb. 16, 2023, 10:16 p.m.