R/check_data.R

Defines functions check_data

Documented in check_data

# Generated by fusen: do not edit by hand

#' Check Source and Target Dataframe
#' 
#' Description
#' 
#' @param .source
#' The Source Dataframe. 
#' Must contain a unique column id and the columns you want to match on
#' @param .target
#' The Target Dataframe. 
#' Must contain a unique column id and the columns you want to match on
#' @param .check 
#' Check only column that are also in source, or all columns
#' @param .html
#' Output Check as hatml table?
#' @return Messages
#' 
#' @export
#' @examples
#' check_data(table_source, table_target)
check_data <- function(.source, .target, .check = c("source", "all"), .html = TRUE) {
  name <- df <- check <- ind <- cum <- value <- s <- Matrix <- Source <-
    Target <- NULL
  
  .source <- tibble::as_tibble(.source)
  .target <- tibble::as_tibble(.target)
  
  check_ <- match.arg(.check, c("source", "all"))
  
  lst_ids_ <- check_id(.source, .target, .error = FALSE)
  
  tab_ids_ <- tibble::enframe(lst_ids_) %>%
    tidyr::separate(name, c("check", "df"), sep = "_", extra = "merge") %>%
    tidyr::pivot_wider(names_from = df) %>%
    dplyr::mutate(
      check = dplyr::if_else(check == "e", "Column: 'id' exists", "Column: 'id' is unique")
    ) %>%
    dplyr::mutate(dplyr::across(dplyr::everything(), as.character))

  lst_nas_ <- check_nas(.source, .target, .check = check_)
  tab_nas_ <- tibble::enframe(lst_nas_) %>%
    tidyr::separate(name, c("df", "check"), sep = "_", extra = "merge") %>%
    tidyr::pivot_wider(names_from = df) %>%
    dplyr::mutate(check = paste0("Check NAs for column: '", check, "'")) %>%
    dplyr::mutate(
      dplyr::across(c(s, t), ~ scales::comma(., 1))
    ) %>%
    dplyr::mutate(dplyr::across(dplyr::everything(), as.character))

  lst_dup_ <- check_dup(.source, .target, .check = check_)
  tab_ind_ <- tibble::enframe(lst_dup_$ind, value = "ind")
  tab_cum_ <- tibble::enframe(lst_dup_$cum, value = "cum")
  tab_dup_ <- dplyr::left_join(tab_ind_, tab_cum_, by = "name") %>%
    dplyr::mutate(
      dplyr::across(c(ind, cum), ~ scales::comma(., 1)),
      cum = paste0("(", cum, ")")
    ) %>%
    tidyr::unite(value, ind, cum, sep = " ") %>%
    tidyr::separate(name, c("df", "check"), sep = "_", extra = "merge") %>%
    tidyr::pivot_wider(names_from = df) %>%
    dplyr::mutate(check = paste0("Check (cumulative) duplicates for column(s): '", check, "'")) %>%
    dplyr::mutate(dplyr::across(dplyr::everything(), as.character))

  
  ns_ <- as.numeric(nrow(.source))
  nt_ <- as.numeric(nrow(.target))
  
  tab_col_ <- tibble::tibble(
    check = c(
      "Elements in Dataframe/Matrix",
      "Estimated memory allocation"
    ),
    s = c(ns_, NA_real_),
    t = c(nt_, NA_real_),
    Matrix = c(ns_ * nt_, ns_ * nt_ * 8 / 1e6 * 20)
  ) %>%
    dplyr::mutate(dplyr::across(c(s, t, Matrix), ~ scales::comma(., 1))) %>%
    dplyr::mutate(Matrix = dplyr::if_else(dplyr::row_number() == 2, paste(Matrix, "MB"), Matrix))


  tab_ <- dplyr::bind_rows(
    tab_ids_, tibble::tibble(check = "", s = "", t = ""),
    tab_nas_, tibble::tibble(check = "", s = "", t = ""),
    tab_dup_, tibble::tibble(check = "", s = "", t = ""),
    tab_col_
  ) %>%
    dplyr::rename(Check = check, Source = s, Target = t)

  if (.html) {
    tab_ %>%
      dplyr::mutate(
        dplyr::across(
          .cols = c(Source, Target, Matrix),
          .fns = ~ dplyr::case_when(
            . == "" | is.na(.) ~ "",
            startsWith(Check, "Check NAs") ~ kableExtra::cell_spec(., bold = T),
            grepl("\\([1-9]+\\)|\\(.*,.*\\)|FALSE", .) ~ kableExtra::cell_spec(., bold = T, color = "red"),
            grepl("\\(0\\)|TRUE", .) ~ kableExtra::cell_spec(., bold = T, color = "green"),
            TRUE ~ .
          )
        )
      ) %>%
      kableExtra::kbl(escape = FALSE, align = "lccc") %>%
      kableExtra::kable_paper() %>%
      kableExtra::kable_styling(font_size = 14, html_font = "Times New Roman", bootstrap_options = "condensed")
  } else {
    tab_
  }
}
MatthiasUckert/Rmatch documentation built on Jan. 3, 2022, 11:09 p.m.