In MatthiasUckert/Rmatch: A General Package For Any Matching Tasks

library(testthat)

# Load already included functions
pkgload::load_all(export_all = FALSE)

check_dup

#' Check Duplicates
#' 
#' Description
#'
#' @param .source
#' The Source Dataframe. 
#' Must contain a unique column id and the columns you want to match on
#' @param .target
#' The Target Dataframe.
#' Must contain a unique column id and the columns you want to match on 
#' @param .check 
#' Check only column that are also in source, or all columns
#' @return A list with duplicates
#' 
#' @noRd
check_dup <- function(.source, .target, .check = c("source", "all")) {
  check_ <- match.arg(.check, c("source", "all"))

  .source <- tibble::as_tibble(.source)
  .target <- tibble::as_tibble(.target)

  cols_s_ <- stats::setNames(colnames(.source), paste0("s_", colnames(.source)))
  cols_t_ <- stats::setNames(colnames(.target), paste0("t_", colnames(.target)))
  cols_s_ <- cols_s_[!cols_s_ == "id"]
  cols_t_ <- cols_t_[!cols_t_ == "id"]
  cols_t_ <- cols_t_[order(match(cols_t_,cols_s_))]

  if (check_ == "source") {
    cols_t_ <- cols_t_[cols_t_ %in% cols_s_]
  }

  s_ <- tibble::as_tibble(.source)
  t_ <- tibble::as_tibble(.target)



  ind_ <- c(
    purrr::map_int(cols_s_, ~ sum(duplicated(s_[[.x]]))),
    purrr::map_int(cols_t_, ~ sum(duplicated(t_[[.x]])))
  )

  cum_ <- c(
    purrr::map_int(
      .x = stats::setNames(seq_len(length(cols_s_)), names(cols_s_)),
      .f = ~ sum(duplicated(apply(s_[, cols_s_[1:.x]], 1, paste, collapse = "-")))
    ),
    purrr::map_int(
      .x = stats::setNames(seq_len(length(cols_t_)), names(cols_t_)),
      .f = ~ sum(duplicated(apply(t_[, cols_t_[1:.x]], 1, paste, collapse = "-")))
    )
  )

  list(ind = ind_, cum = cum_)

}

check_dup(table_source, table_target)

test_that("check_dup works", {
  expect_true(inherits(check_dup, "function")) 
})

check_nas

#' Check columns for NA values
#' 
#' Description
#' @param .source
#' The Source Dataframe. 
#' Must contain a unique column id and the columns you want to match on
#' @param .target
#' The Target Dataframe. 
#' Must contain a unique column id and the columns you want to match on
#' @param .check 
#' Check only column that are also in source, or all columns
#' @return A list with the number of NAs
#' 
#' @noRd
check_nas <- function(.source, .target, .check = c("source", "all")) {
  check_ <- match.arg(.check, c("source", "all"))

  .source <- tibble::as_tibble(.source)
  .target <- tibble::as_tibble(.target)

  cols_s_ <- stats::setNames(colnames(.source), paste0("s_", colnames(.source)))
  cols_t_ <- stats::setNames(colnames(.target), paste0("t_", colnames(.target)))

  if (check_ == "source") {
    cols_t_ <- cols_t_[cols_t_ %in% cols_s_]
  }


  c(
    purrr::map_int(cols_s_, ~ sum(is.na(.source[[.x]]))),
    purrr::map_int(cols_t_, ~ sum(is.na(.target[[.x]])))
  )
}

check_nas(table_source, table_target)

test_that("check_nas works", {
  expect_true(inherits(check_nas, "function")) 
})

check_id

.source <- table_source
.target <- table_target

#' Check ID COlumns
#' 
#' Description
#' 
#' @param .source
#' The Source Dataframe. 
#' Must contain a unique column id and the columns you want to match on
#' @param .target
#' The Target Dataframe. 
#' Must contain a unique column id and the columns you want to match on
#' 
#' @return Either Errors or a list 
#' 
#' @noRd
check_id <- function(.source, .target, .error = TRUE) {
  cols_s_ <- colnames(.source)
  cols_t_ <- colnames(.target)

  .source <- tibble::as_tibble(.source)
  .target <- tibble::as_tibble(.target)

  es_ <- "id" %in% cols_s_
  et_ <- "id" %in% cols_t_

  if (es_) us_ <- !any(duplicated(.source[["id"]])) else us_ <- NA
  if (et_) ut_ <- !any(duplicated(.target[["id"]])) else ut_ <- NA

  if (.error) {
    if (!es_ & !et_) {
      stop("Both datasets must have an 'id' column", call. = FALSE)
    } else if (!es_) {
      stop("Source dataset must have an 'id' column", call. = FALSE)
    } else if (!et_) {
      stop("Target dataset must have an 'id' column", call. = FALSE)
    }

    if (!us_ & !ut_) {
      stop("Both datasets must have unique IDs", call. = FALSE)
    } else if (!us_) {
      stop("Source dataset must have unique IDs", call. = FALSE)
    } else if (!ut_) {
      stop("Target dataset must have aunique IDs", call. = FALSE)
    }
  }

  list(e_s = es_, e_t = et_, u_s = us_, u_t = ut_)
}

check_id(table_source, table_target)

test_that("check_id works", {
  t_ <- tibble::tibble
  expect_type(check_id(t_(id = 1), t_(id = 1)), type = "list")
  expect_error(check_id(t_(ids = 1), t_(id = 1)))
  expect_error(check_id(t_(id = 1), t_(ids = 1)))
  expect_error(check_id(t_(ids = 1), t_(ids = 1)))

  expect_type(check_id(t_(id = 1:2), t_(id = 1:2)), type = "list")
  expect_error(check_id(t_(id = c(1,2)), t_(id = c(1,1))))
  expect_error(check_id(t_(id = c(1,1)), t_(id = c(1,2))))
  expect_error(check_id(t_(id = c(1,1)), t_(id = c(1,1))))

})

check_data

.source <- table_source
.target <- table_source
.html = TRUE

#' Check Source and Target Dataframe
#' 
#' Description
#' 
#' @param .source
#' The Source Dataframe. 
#' Must contain a unique column id and the columns you want to match on
#' @param .target
#' The Target Dataframe. 
#' Must contain a unique column id and the columns you want to match on
#' @param .check 
#' Check only column that are also in source, or all columns
#' @param .html
#' Output Check as hatml table?
#' @return Messages
#' 
#' @export
check_data <- function(.source, .target, .check = c("source", "all"), .html = TRUE) {
  name <- df <- check <- ind <- cum <- value <- s <- Matrix <- Source <-
    Target <- NULL

  .source <- tibble::as_tibble(.source)
  .target <- tibble::as_tibble(.target)

  check_ <- match.arg(.check, c("source", "all"))

  lst_ids_ <- check_id(.source, .target, .error = FALSE)

  tab_ids_ <- tibble::enframe(lst_ids_) %>%
    tidyr::separate(name, c("check", "df"), sep = "_", extra = "merge") %>%
    tidyr::pivot_wider(names_from = df) %>%
    dplyr::mutate(
      check = dplyr::if_else(check == "e", "Column: 'id' exists", "Column: 'id' is unique")
    ) %>%
    dplyr::mutate(dplyr::across(dplyr::everything(), as.character))

  lst_nas_ <- check_nas(.source, .target, .check = check_)
  tab_nas_ <- tibble::enframe(lst_nas_) %>%
    tidyr::separate(name, c("df", "check"), sep = "_", extra = "merge") %>%
    tidyr::pivot_wider(names_from = df) %>%
    dplyr::mutate(check = paste0("Check NAs for column: '", check, "'")) %>%
    dplyr::mutate(
      dplyr::across(c(s, t), ~ scales::comma(., 1))
    ) %>%
    dplyr::mutate(dplyr::across(dplyr::everything(), as.character))

  lst_dup_ <- check_dup(.source, .target, .check = check_)
  tab_ind_ <- tibble::enframe(lst_dup_$ind, value = "ind")
  tab_cum_ <- tibble::enframe(lst_dup_$cum, value = "cum")
  tab_dup_ <- dplyr::left_join(tab_ind_, tab_cum_, by = "name") %>%
    dplyr::mutate(
      dplyr::across(c(ind, cum), ~ scales::comma(., 1)),
      cum = paste0("(", cum, ")")
    ) %>%
    tidyr::unite(value, ind, cum, sep = " ") %>%
    tidyr::separate(name, c("df", "check"), sep = "_", extra = "merge") %>%
    tidyr::pivot_wider(names_from = df) %>%
    dplyr::mutate(check = paste0("Check (cumulative) duplicates for column(s): '", check, "'")) %>%
    dplyr::mutate(dplyr::across(dplyr::everything(), as.character))


  ns_ <- as.numeric(nrow(.source))
  nt_ <- as.numeric(nrow(.target))

  tab_col_ <- tibble::tibble(
    check = c(
      "Elements in Dataframe/Matrix",
      "Estimated memory allocation"
    ),
    s = c(ns_, NA_real_),
    t = c(nt_, NA_real_),
    Matrix = c(ns_ * nt_, ns_ * nt_ * 8 / 1e6 * 20)
  ) %>%
    dplyr::mutate(dplyr::across(c(s, t, Matrix), ~ scales::comma(., 1))) %>%
    dplyr::mutate(Matrix = dplyr::if_else(dplyr::row_number() == 2, paste(Matrix, "MB"), Matrix))


  tab_ <- dplyr::bind_rows(
    tab_ids_, tibble::tibble(check = "", s = "", t = ""),
    tab_nas_, tibble::tibble(check = "", s = "", t = ""),
    tab_dup_, tibble::tibble(check = "", s = "", t = ""),
    tab_col_
  ) %>%
    dplyr::rename(Check = check, Source = s, Target = t)

  if (.html) {
    tab_ %>%
      dplyr::mutate(
        dplyr::across(
          .cols = c(Source, Target, Matrix),
          .fns = ~ dplyr::case_when(
            . == "" | is.na(.) ~ "",
            startsWith(Check, "Check NAs") ~ kableExtra::cell_spec(., bold = T),
            grepl("\\([1-9]+\\)|\\(.*,.*\\)|FALSE", .) ~ kableExtra::cell_spec(., bold = T, color = "red"),
            grepl("\\(0\\)|TRUE", .) ~ kableExtra::cell_spec(., bold = T, color = "green"),
            TRUE ~ .
          )
        )
      ) %>%
      kableExtra::kbl(escape = FALSE, align = "lccc") %>%
      kableExtra::kable_paper() %>%
      kableExtra::kable_styling(font_size = 14, html_font = "Times New Roman", bootstrap_options = "condensed")
  } else {
    tab_
  }
}

check_data(table_source, table_target)

test_that("check_data works", {
  expect_true(inherits(check_data, "function")) 
})

filter_duplicates

#' Filter duplicate values
#' 
#' Description
#' 
#' @param .data A dataframe
#' @param ... Columns on which the dataframe should be unique
#'
#' @return A dataframe
#' 
#' @export
filter_duplicates <- function(.data, ...) {
  tmp_ <- NULL
  .data <- tibble::as_tibble(.data)

  vars_ <- dplyr::enquos(...)
  tibble::as_tibble(.data) %>%
    dplyr::mutate(tmp_ = paste0(!!!vars_)) %>%
    dplyr::filter(duplicated(tmp_) |duplicated(tmp_, fromLast = TRUE)) %>%
    dplyr::select(-tmp_) %>%
    dplyr::arrange(!!!vars_)
}

filter_duplicates(table_source, name)

test_that("filter_duplicates works", {
  expect_true(inherits(filter_duplicates, "function")) 
})

make_unique

#' Make Dataframe Unique
#' 
#' Description
#' 
#' @param .data A dataframe
#' @param ... Columns on which the dataframe should be unique 
#' @param .rem 
#' "all": All duplicates are removed (also the first non-duplicated entry)\cr
#' "last": Only subsequent duplicates are removed (first value remains)
#'
#' @return A dataframe
#' 
#' @export
make_unique <- function(.data, ..., .rem = c("all", "last")) {
  rem_ <- match.arg(.rem, c("all", "last"))
  tmp_ <- NULL

  .data <- tibble::as_tibble(.data)

  vars_ <- dplyr::enquos(...)
  tab_ <- tibble::as_tibble(.data) %>%
    dplyr::mutate(tmp_ = paste0(!!!vars_))

  if (rem_ == "all") {
    tab_ <- dplyr::filter(tab_, !(duplicated(tmp_) | duplicated(tmp_, fromLast = TRUE)))
  } else {
    tab_ <- dplyr::filter(tab_, !(duplicated(tmp_)))
  }

  dplyr::select(tab_, -tmp_)
}

make_unique(table_source, name)

test_that("make_unique works", {
  expect_true(inherits(make_unique, "function")) 
})

Inflate

fusen::inflate(
  flat_file = "dev/flat_checks.Rmd", 
  vignette_name = NA, 
  check = FALSE, 
  overwrite = TRUE
  )
devtools::check(vignettes = FALSE)