#' Do a data diff
#'
#' Find differences with a reference data set. The diff can be used to \code{\link{patch_data}}, to store the difference
#' for documentation purposes using \code{\link{write_diff}} or to visualize the difference using \code{\link{render_diff}}
#'
#' @example ./examples/datadiff.R
#' @param data \code{data.frame} to check for changes
#' @param data_ref \code{data.frame} reference data frame
#' @param always_show_header \code{logical}
#' Should we always give a table header in diffs? This defaults
#' to TRUE, and - frankly - you should leave it at TRUE for now.
#' @param always_show_order \code{logical}
#' Diffs for tables where row/column order has been permuted may include
#' an extra row/column specifying the changes in row/column numbers.
#' If you'd like that extra row/column to always be included,
#' turn on this flag, and turn off never_show_order.
#' @param columns_to_ignore \code{character}
#' List of columns to ignore in all calculations. Changes
#' related to these columns are ignored.
#' @param count_like_a_spreadsheet \code{logical}
#' Should column numbers, if present, be rendered spreadsheet-style
#' as A,B,C,...,AA,BB,CC? Defaults to TRUE.
#' @param ids \code{character}
#' List of columns that make up a primary key, if known. Otherwise
#' heuristics are used to find a decent key (or a set of decent keys).
#' @param ignore_whitespace \code{logical}
#' Should whitespace be omitted from comparisons. Defaults to FALSE.
#' @param never_show_order \code{logical}
#' Diffs for tables where row/column order has been permuted may include
#' an extra row/column specifying the changes in row/column numbers.
#' If you'd like to be sure that that row/column is *never
#' included, turn on this flag, and turn off always_show_order.
#' @param ordered \code{logical}
#' Is the order of rows and columns meaningful? Defaults to `TRUE`.
#' @param padding_strategy \code{logical}
#' Strategy to use when padding columns. Valid values are "auto",
#' "smart", "dense", and "sparse". Leave null for a sensible default.
#' @param show_meta \code{logical}
#' Show changes in column properties, not just data, if available. Defaults to TRUE.
#' @param show_unchanged \code{logical}
#' Should we show all rows in diffs? We default to showing
#' just rows that have changes (and some context rows around
#' them, if row order is meaningful), but you can override
#' this here.
#' @param show_unchanged_columns \code{logical}
#' Should we show all columns in diffs? We default to showing
#' just columns that have changes (and some context columns around
#' them, if column order is meaningful), but you can override
#' this here. Irrespective of this flag, you can rely
#' on index/key columns needed to identify rows to be included
#' in the diff.
#' @param show_unchanged_meta \code{logical}
#' Show all column properties, if available, even if unchanged.
#' Defaults to FALSE.
#' @param unchanged_column_context \code{integer}
#' When showing context columns around a changed column, what
#' is the minimum number of such columns we should show?
#' @param unchanged_context \code{integer}
#' When showing context rows around a changed row, what
#' is the minimum number of such rows we should show?
#'
#' @return difference object
#'
#' @seealso differs_from
#'
#' @export
diff_data <- function(data_ref,
data,
always_show_header = TRUE,
always_show_order = FALSE,
columns_to_ignore = c(),
count_like_a_spreadsheet = TRUE,
ids = c(),
ignore_whitespace = FALSE,
never_show_order = FALSE,
ordered = TRUE,
padding_strategy = c("auto", "smart", "dense", "sparse"),
show_meta = TRUE,
show_unchanged = FALSE, # rows
show_unchanged_columns = FALSE,
show_unchanged_meta = FALSE,
unchanged_column_context = 1L,
unchanged_context = 1L # rows
)
{
# force padding_strategy to a valid value
padding_strategy <- match.arg(padding_strategy)
if(padding_strategy=='auto') padding_strategy=NULL
# stash object names
source_name <- deparse(substitute(data_ref))
target_name <- deparse(substitute(data ))
# remove specified columns not present in the actual data
cols.either <- union(colnames(data_ref), colnames(data))
ids <- intersect(ids, cols.either)
columns_to_ignore <- intersect(columns_to_ignore, cols.either)
# check for and correct duplicate column names
if(any(duplicated(colnames(data_ref))))
{
colnames(data_ref) <- make.unique(colnames(data_ref))
warning("Column names have been made unique in ", sQuote(source_name), ".")
}
if(any(duplicated(colnames(data))))
{
colnames(data) <- make.unique(colnames(data))
warning("Column names have been made unique in ", sQuote(target_name), ".")
}
ctx <- get_context()
tv <- TableView(ctx, data)
tv_ref <- TableView(ctx, data_ref)
tv_diff <- TableView(ctx)
# add target classes to diff
tv_diff$mode <- sapply(data, storage.mode)
tv_diff$is_factor <- sapply(data, is.factor)
tv_diff$levels <- lapply(data, levels)
# create object to hold options
ctx$eval("cf = new daff.CompareFlags();")
# add scalar options
cf.assign <- function(name, x)
{
ctx$assign("__temp__", x)
ctx$eval( JS(paste0("cf.", name, "=__temp__")) )
}
# add scalar options
cf.assign("always_show_header", always_show_header )
cf.assign("always_show_order", always_show_order ) #!
cf.assign("count_like_a_spreadsheet", count_like_a_spreadsheet)
cf.assign("ignore_whitespace", ignore_whitespace )
cf.assign("never_show_order", never_show_order ) #!
cf.assign("ordered", ordered )
cf.assign("padding_strategy", padding_strategy ) #!
cf.assign("show_meta", show_meta )
cf.assign("show_unchanged", show_unchanged )
cf.assign("show_unchanged_columns", show_unchanged_columns )
cf.assign("show_unchanged_meta", show_unchanged_meta )
cf.assign("unchanged_column_context", unchanged_column_context)
cf.assign("unchanged_context", unchanged_context )
# add vector options
lapply(ids, function(val) ctx$call("cf.addPrimaryKey", val) )
lapply(columns_to_ignore, function(val) ctx$call("cf.ignoreColumn", val) )
# run the diff
diff <- paste0("diff(",tv_ref$var_name,",",tv$var_name,", cf)")
ctx$assign(tv_diff$var_name, JS(diff))
class(tv_diff) <- c("data_diff", class(tv_diff))
# Store summary and flag information for later
summary <- ctx$get(JS(paste0(tv_diff$var_name, ".summary")))
flags <- ctx$get("cf")
# names of the compared objects
summary$source_name = source_name
summary$target_name = target_name
# Textual description of changes to row and column counts
if(summary$row_count_initial == summary$row_count_final)
summary$row_count_change_text <- summary$row_count_initial
else
summary$row_count_change_text <- paste0(summary$row_count_initial, " --> ", summary$row_count_final)
if(summary$col_count_initial == summary$col_count_final)
summary$col_count_change_text <- summary$col_count_initial
else
summary$col_count_change_text <- paste0(summary$col_count_initial, " --> ", summary$col_count_final)
attr(tv_diff, "summary") <- summary
attr(tv_diff, "flags" ) <- flags
tv_diff
}
#' differs from,
#'
#' This is the same function as \code{\link{diff_data}} but with arguments
#' reversed. This is more useful when using \code{dplyr} and \code{magrittr}
#'
#' @param data \code{data.frame} to check for changes
#' @param data_ref \code{data.frame} reference data frame
#' @param ... not further specified
#'
#' @return difference object
#'
#' @seealso diff_data
#'
#' @export
differs_from <- function(data, data_ref, ...){
diff_data(data_ref=data_ref, data=data, ...)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.