library(testthat)
# Load already included functions pkgload::load_all(export_all = FALSE)
#' Check Duplicates #' #' Description #' #' @param .source #' The Source Dataframe. #' Must contain a unique column id and the columns you want to match on #' @param .target #' The Target Dataframe. #' Must contain a unique column id and the columns you want to match on #' @param .check #' Check only column that are also in source, or all columns #' @return A list with duplicates #' #' @noRd check_dup <- function(.source, .target, .check = c("source", "all")) { check_ <- match.arg(.check, c("source", "all")) .source <- tibble::as_tibble(.source) .target <- tibble::as_tibble(.target) cols_s_ <- stats::setNames(colnames(.source), paste0("s_", colnames(.source))) cols_t_ <- stats::setNames(colnames(.target), paste0("t_", colnames(.target))) cols_s_ <- cols_s_[!cols_s_ == "id"] cols_t_ <- cols_t_[!cols_t_ == "id"] cols_t_ <- cols_t_[order(match(cols_t_,cols_s_))] if (check_ == "source") { cols_t_ <- cols_t_[cols_t_ %in% cols_s_] } s_ <- tibble::as_tibble(.source) t_ <- tibble::as_tibble(.target) ind_ <- c( purrr::map_int(cols_s_, ~ sum(duplicated(s_[[.x]]))), purrr::map_int(cols_t_, ~ sum(duplicated(t_[[.x]]))) ) cum_ <- c( purrr::map_int( .x = stats::setNames(seq_len(length(cols_s_)), names(cols_s_)), .f = ~ sum(duplicated(apply(s_[, cols_s_[1:.x]], 1, paste, collapse = "-"))) ), purrr::map_int( .x = stats::setNames(seq_len(length(cols_t_)), names(cols_t_)), .f = ~ sum(duplicated(apply(t_[, cols_t_[1:.x]], 1, paste, collapse = "-"))) ) ) list(ind = ind_, cum = cum_) }
check_dup(table_source, table_target)
test_that("check_dup works", { expect_true(inherits(check_dup, "function")) })
#' Check columns for NA values #' #' Description #' @param .source #' The Source Dataframe. #' Must contain a unique column id and the columns you want to match on #' @param .target #' The Target Dataframe. #' Must contain a unique column id and the columns you want to match on #' @param .check #' Check only column that are also in source, or all columns #' @return A list with the number of NAs #' #' @noRd check_nas <- function(.source, .target, .check = c("source", "all")) { check_ <- match.arg(.check, c("source", "all")) .source <- tibble::as_tibble(.source) .target <- tibble::as_tibble(.target) cols_s_ <- stats::setNames(colnames(.source), paste0("s_", colnames(.source))) cols_t_ <- stats::setNames(colnames(.target), paste0("t_", colnames(.target))) if (check_ == "source") { cols_t_ <- cols_t_[cols_t_ %in% cols_s_] } c( purrr::map_int(cols_s_, ~ sum(is.na(.source[[.x]]))), purrr::map_int(cols_t_, ~ sum(is.na(.target[[.x]]))) ) }
check_nas(table_source, table_target)
test_that("check_nas works", { expect_true(inherits(check_nas, "function")) })
.source <- table_source .target <- table_target
#' Check ID COlumns #' #' Description #' #' @param .source #' The Source Dataframe. #' Must contain a unique column id and the columns you want to match on #' @param .target #' The Target Dataframe. #' Must contain a unique column id and the columns you want to match on #' #' @return Either Errors or a list #' #' @noRd check_id <- function(.source, .target, .error = TRUE) { cols_s_ <- colnames(.source) cols_t_ <- colnames(.target) .source <- tibble::as_tibble(.source) .target <- tibble::as_tibble(.target) es_ <- "id" %in% cols_s_ et_ <- "id" %in% cols_t_ if (es_) us_ <- !any(duplicated(.source[["id"]])) else us_ <- NA if (et_) ut_ <- !any(duplicated(.target[["id"]])) else ut_ <- NA if (.error) { if (!es_ & !et_) { stop("Both datasets must have an 'id' column", call. = FALSE) } else if (!es_) { stop("Source dataset must have an 'id' column", call. = FALSE) } else if (!et_) { stop("Target dataset must have an 'id' column", call. = FALSE) } if (!us_ & !ut_) { stop("Both datasets must have unique IDs", call. = FALSE) } else if (!us_) { stop("Source dataset must have unique IDs", call. = FALSE) } else if (!ut_) { stop("Target dataset must have aunique IDs", call. = FALSE) } } list(e_s = es_, e_t = et_, u_s = us_, u_t = ut_) }
check_id(table_source, table_target)
test_that("check_id works", { t_ <- tibble::tibble expect_type(check_id(t_(id = 1), t_(id = 1)), type = "list") expect_error(check_id(t_(ids = 1), t_(id = 1))) expect_error(check_id(t_(id = 1), t_(ids = 1))) expect_error(check_id(t_(ids = 1), t_(ids = 1))) expect_type(check_id(t_(id = 1:2), t_(id = 1:2)), type = "list") expect_error(check_id(t_(id = c(1,2)), t_(id = c(1,1)))) expect_error(check_id(t_(id = c(1,1)), t_(id = c(1,2)))) expect_error(check_id(t_(id = c(1,1)), t_(id = c(1,1)))) })
.source <- table_source .target <- table_source .html = TRUE
#' Check Source and Target Dataframe #' #' Description #' #' @param .source #' The Source Dataframe. #' Must contain a unique column id and the columns you want to match on #' @param .target #' The Target Dataframe. #' Must contain a unique column id and the columns you want to match on #' @param .check #' Check only column that are also in source, or all columns #' @param .html #' Output Check as hatml table? #' @return Messages #' #' @export check_data <- function(.source, .target, .check = c("source", "all"), .html = TRUE) { name <- df <- check <- ind <- cum <- value <- s <- Matrix <- Source <- Target <- NULL .source <- tibble::as_tibble(.source) .target <- tibble::as_tibble(.target) check_ <- match.arg(.check, c("source", "all")) lst_ids_ <- check_id(.source, .target, .error = FALSE) tab_ids_ <- tibble::enframe(lst_ids_) %>% tidyr::separate(name, c("check", "df"), sep = "_", extra = "merge") %>% tidyr::pivot_wider(names_from = df) %>% dplyr::mutate( check = dplyr::if_else(check == "e", "Column: 'id' exists", "Column: 'id' is unique") ) %>% dplyr::mutate(dplyr::across(dplyr::everything(), as.character)) lst_nas_ <- check_nas(.source, .target, .check = check_) tab_nas_ <- tibble::enframe(lst_nas_) %>% tidyr::separate(name, c("df", "check"), sep = "_", extra = "merge") %>% tidyr::pivot_wider(names_from = df) %>% dplyr::mutate(check = paste0("Check NAs for column: '", check, "'")) %>% dplyr::mutate( dplyr::across(c(s, t), ~ scales::comma(., 1)) ) %>% dplyr::mutate(dplyr::across(dplyr::everything(), as.character)) lst_dup_ <- check_dup(.source, .target, .check = check_) tab_ind_ <- tibble::enframe(lst_dup_$ind, value = "ind") tab_cum_ <- tibble::enframe(lst_dup_$cum, value = "cum") tab_dup_ <- dplyr::left_join(tab_ind_, tab_cum_, by = "name") %>% dplyr::mutate( dplyr::across(c(ind, cum), ~ scales::comma(., 1)), cum = paste0("(", cum, ")") ) %>% tidyr::unite(value, ind, cum, sep = " ") %>% tidyr::separate(name, c("df", "check"), sep = "_", extra = "merge") %>% tidyr::pivot_wider(names_from = df) %>% dplyr::mutate(check = paste0("Check (cumulative) duplicates for column(s): '", check, "'")) %>% dplyr::mutate(dplyr::across(dplyr::everything(), as.character)) ns_ <- as.numeric(nrow(.source)) nt_ <- as.numeric(nrow(.target)) tab_col_ <- tibble::tibble( check = c( "Elements in Dataframe/Matrix", "Estimated memory allocation" ), s = c(ns_, NA_real_), t = c(nt_, NA_real_), Matrix = c(ns_ * nt_, ns_ * nt_ * 8 / 1e6 * 20) ) %>% dplyr::mutate(dplyr::across(c(s, t, Matrix), ~ scales::comma(., 1))) %>% dplyr::mutate(Matrix = dplyr::if_else(dplyr::row_number() == 2, paste(Matrix, "MB"), Matrix)) tab_ <- dplyr::bind_rows( tab_ids_, tibble::tibble(check = "", s = "", t = ""), tab_nas_, tibble::tibble(check = "", s = "", t = ""), tab_dup_, tibble::tibble(check = "", s = "", t = ""), tab_col_ ) %>% dplyr::rename(Check = check, Source = s, Target = t) if (.html) { tab_ %>% dplyr::mutate( dplyr::across( .cols = c(Source, Target, Matrix), .fns = ~ dplyr::case_when( . == "" | is.na(.) ~ "", startsWith(Check, "Check NAs") ~ kableExtra::cell_spec(., bold = T), grepl("\\([1-9]+\\)|\\(.*,.*\\)|FALSE", .) ~ kableExtra::cell_spec(., bold = T, color = "red"), grepl("\\(0\\)|TRUE", .) ~ kableExtra::cell_spec(., bold = T, color = "green"), TRUE ~ . ) ) ) %>% kableExtra::kbl(escape = FALSE, align = "lccc") %>% kableExtra::kable_paper() %>% kableExtra::kable_styling(font_size = 14, html_font = "Times New Roman", bootstrap_options = "condensed") } else { tab_ } }
check_data(table_source, table_target)
test_that("check_data works", { expect_true(inherits(check_data, "function")) })
#' Filter duplicate values #' #' Description #' #' @param .data A dataframe #' @param ... Columns on which the dataframe should be unique #' #' @return A dataframe #' #' @export filter_duplicates <- function(.data, ...) { tmp_ <- NULL .data <- tibble::as_tibble(.data) vars_ <- dplyr::enquos(...) tibble::as_tibble(.data) %>% dplyr::mutate(tmp_ = paste0(!!!vars_)) %>% dplyr::filter(duplicated(tmp_) |duplicated(tmp_, fromLast = TRUE)) %>% dplyr::select(-tmp_) %>% dplyr::arrange(!!!vars_) }
filter_duplicates(table_source, name)
test_that("filter_duplicates works", { expect_true(inherits(filter_duplicates, "function")) })
#' Make Dataframe Unique #' #' Description #' #' @param .data A dataframe #' @param ... Columns on which the dataframe should be unique #' @param .rem #' "all": All duplicates are removed (also the first non-duplicated entry)\cr #' "last": Only subsequent duplicates are removed (first value remains) #' #' @return A dataframe #' #' @export make_unique <- function(.data, ..., .rem = c("all", "last")) { rem_ <- match.arg(.rem, c("all", "last")) tmp_ <- NULL .data <- tibble::as_tibble(.data) vars_ <- dplyr::enquos(...) tab_ <- tibble::as_tibble(.data) %>% dplyr::mutate(tmp_ = paste0(!!!vars_)) if (rem_ == "all") { tab_ <- dplyr::filter(tab_, !(duplicated(tmp_) | duplicated(tmp_, fromLast = TRUE))) } else { tab_ <- dplyr::filter(tab_, !(duplicated(tmp_))) } dplyr::select(tab_, -tmp_) }
make_unique(table_source, name)
test_that("make_unique works", { expect_true(inherits(make_unique, "function")) })
fusen::inflate( flat_file = "dev/flat_checks.Rmd", vignette_name = NA, check = FALSE, overwrite = TRUE ) devtools::check(vignettes = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.