#' Compare two statelists
#'
#' Check set overlap between two state lists / data frames, e.g.
#' prior to merging them.
#'
#' @param df1 data frame
#' @param df2 data frame
#' @param state1 (`character(1)`) Name of the country ID var in df1, default
#' "gwcode"
#' @param state2 (`character(1)`) Name of the country ID var in df2, default
#' "gwcode"
#' @param time1 (`character(1)`) Name of the time ID var in df1, default "year"
#' @param time2 (`character(1)`) Name of the time ID var in df2, default "year"
#'
#' @details This is a helper for interactively debugging data merges for data
#' that may have slightly different state lists. For example, these
#' differences in case sets could be because of country code differences.
#'
#' @examples
#' # df2 has all countries in 2018 but some values in x1 are missing
#' df1 <- state_panel(2018, 2018, partial = "any")
#' df1$x1 <- round(runif(nrow(df1))*5)
#' df1$x1[sample.int(nrow(df1), size = 20, replace = FALSE)] <- NA
#'
#' # df2 is missing some countries and also has missing values in x2
#' df2 <- state_panel(2018, 2018, partial = "any")
#' df2 <- df2[sample.int(nrow(df2), size = 150), ]
#' df2$x2 <- round(runif(nrow(df2))*5)
#' df2$x2[sample.int(nrow(df2), size = 20, replace = FALSE)] <- NA
#'
#' comp <- compare(df1, df2)
#' comp
#'
#' report(comp)
#'
#' @export
compare <- function(df1, df2, state1 = "gwcode", time1 = "year",
state2 = "gwcode", time2 = "year") {
stopifnot(!any(is.na(df1[, state1])),
!any(is.na(df1[, time1])),
!any(is.na(df2[, state2])),
!any(is.na(df2[, time2])))
# Summarize the data frames and discard vars
missval_df1 <- 1L - as.integer(stats::complete.cases(df1))
df1 <- df1[, c(state1, time1)]
df1$case_in_df1 <- 1L
df1$missval_df1 <- missval_df1
missval_df2 <- 1L - as.integer(stats::complete.cases(df2))
df2 <- df2[, c(state2, time2)]
df2$case_in_df2 <- 1L
df2$missval_df2 <- missval_df2
# Combine and start getting set overlap
df <- merge(df1, df2, all = TRUE,
by.x = c(state1, time1), by.y = c(state2, time2))
df <- df[order(df[, state1], df[, time1]), ]
df$case_in_df1[is.na(df$case_in_df1)] <- 0L
df$case_in_df2[is.na(df$case_in_df2)] <- 0L
# for missval, if the case is not in a dataset, we don't know. Code an
# explicit value for this
lbls <- c("0", "1", "unknown")
df$missval_df1 <- factor(df$missval_df1, levels = c(0, 1, NA), labels = lbls,
exclude = NULL)
df$missval_df2 <- factor(df$missval_df2, levels = c(0, 1, NA), labels = lbls,
exclude = NULL)
class(df) <- c("state_sets", class(df))
df
}
#' @export
#' @importFrom rlang .data
summary.state_sets <- function(object, ...) {
tbl <- object %>%
dplyr::group_by(.data$case_in_df1, .data$case_in_df2,
.data$missval_df1, .data$missval_df2) %>%
dplyr::summarize(n = dplyr::n()) %>%
dplyr::ungroup()
tbl
}
#' @export
print.state_sets <- function(x, ...) {
print(summary(x))
}
#' @export
#' @rdname compare
#' @param x a "state_sets" object produced by `compare()`
report <- function(x) {
splits <- split(x, x[, c("case_in_df1", "case_in_df2", "missval_df1", "missval_df2")])
splits <- splits[sapply(splits, nrow) > 0]
set <- names(splits)
# some combinations are impossible, e.g. we don't know anything about
# cases that are missing from both data sets, and we also don't know anything
# about the completeness of records from cases that are missing in one dataset
set_text <- list(
"1.1.0.0" = "match and have no missing values",
"1.1.1.1" = "match but have missing values in both",
"1.1.0.1" = "match but have missing values in df2",
"1.1.1.0" = "match but have missing values in df1",
"1.0.0.unknown" = "in df1 (no missing values) but not df2",
"1.0.1.unknown" = "in df1 (with missing values) but not df2",
"0.1.unknown.0" = "not in df1 but in df2 (no missing values)",
"0.1.unknown.1" = "not in df1 but in df2 (with missing values)"
)
outstr <- ""
outstr <- paste0(outstr, (sprintf("%s total rows\n", nrow(x))))
outstr <- paste0(outstr, sprintf("%s rows in df1\n", sum(x$case_in_df1)))
outstr <- paste0(outstr, sprintf("%s rows in df2\n", sum(x$case_in_df2)))
for (i in 1:length(set_text)) {
set_name <- names(set_text)[i]
if (!set_name %in% names(splits)) next
set_df <- splits[[set_name]]
outstr <- paste0(outstr, sprintf("\n%s rows %s\n", nrow(set_df), set_text[[i]]))
cy_str <- paste(set_df[, 1], set_df[, 2], sep = "-")
if (length(cy_str) > 10) {
cy_str <- c(utils::head(cy_str, 10), sprintf("and %s more", length(cy_str) - 10))
}
cy_str <- paste0(paste0(cy_str, collapse = ", "), "\n")
outstr <- paste0(outstr, cy_str)
}
cat(paste0(outstr, collapse = "\n"))
invisible(outstr)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.