R/assert.R

Defines functions get_duplicates assert_unique assert_no_na_in_key assert_has_key assert_dt assert_pk

Documented in assert_dt assert_has_key assert_no_na_in_key assert_pk assert_unique get_duplicates

#' @title Assert the data.table has a primary key
#' @description The primary key here is similar to the concept in a relational
#'  database: the key uniquely identifies each record in the table. This is an
#'  important attribute when `x` is an information table that is later queried
#'  or merged against by other tables. A primary-keyed table can avoid
#'  duplicates in such cases.
#' @inheritParams assert_dt
#' @details Throws an error unless `x` has a key, contains no `NA` values in
#'  its key columns, and is unique for the key values.
#' @return `x`, invisibly, when every check passes.
#' @export
assert_pk <- function(x) {
  # assert_no_na_in_key() already runs assert_dt() and assert_has_key(),
  # so the class and key-presence checks are not repeated here.
  checked <- assert_no_na_in_key(x)
  assert_unique(checked)
}

#' @title Assert a data.table object
#' @description Stops with an error that reports the class of `x` when `x` is
#'  not a `data.table`.
#' @param x a `data.table` object
#' @return `x`, invisibly, when it is a `data.table`.
#' @export
assert_dt <- function(x) {
  if (is.data.table(x)) {
    return(invisible(x))
  }
  # Include every class of the offending object in the message.
  actual_class <- toString(class(x))
  stop(sprintf("x (%s) is not a data.table", actual_class))
}

#' @title Assert the data.table has key
#' @description Stops with an error when `x` is not a `data.table` or when it
#'  has no key set.
#' @inheritParams assert_dt
#' @return `x`, invisibly, when it is a keyed `data.table`.
#' @export
assert_has_key <- function(x) {
  # The class check comes first so that haskey() is only asked about a
  # data.table.
  assert_dt(x)
  keyed <- haskey(x)
  if (keyed) {
    return(invisible(x))
  }
  stop("x doesn't have a key")
}

#' @title Assert the key contains no NA
#' @description Stops with an error when `x` is not a keyed `data.table` or
#'  when any of its key columns contains `NA`. The error lists each offending
#'  key column as `name(position)`, where `position` is the column's index in
#'  `x`.
#' @inheritParams assert_dt
#' @return `x`, invisibly, when no key column contains `NA`.
#' @export
assert_no_na_in_key <- function(x) {
  assert_dt(x)
  assert_has_key(x)
  key_cols <- key(x)
  has_na <- vapply(key_cols, function(col) anyNA(x[[col]]), logical(1L))
  if (any(has_na)) {
    bad_cols <- key_cols[has_na]
    # `has_na` is aligned with the key columns, not with colnames(x), and key
    # columns need not be the leading columns of x. Select the names from the
    # key and look their positions up by name.
    na_cols <- sprintf("%s(%d)", bad_cols, match(bad_cols, colnames(x)))
    stop(sprintf("columns %s contain NA", toString(na_cols)))
  }
  invisible(x)
}

#' @title Assert uniqueness of the data.table
#' @description Throws an error when duplicates are found in the table.
#' @details When `x` has a key, only the key columns are checked.
#' @inheritParams assert_dt
#' @return `x`, invisibly, when no duplicates are found.
#' @export
assert_unique <- function(x) {
  assert_dt(x)
  # anyDuplicated() yields 0L when there is no duplicated record.
  first_dup <- anyDuplicated(x, by = key(x))
  if (identical(first_dup, 0L)) {
    return(invisible(x))
  }
  # NOTE(review): the message points at the `dtext::` namespace -- confirm it
  # matches the package's current name.
  stop("x contains duplicates, run `dtext::get_duplicates()` to see the values")
}

#' @title Find the duplicates
#' @description Returns every record of `x` that belongs to a group of
#'  duplicated records, i.e. both the first occurrence and its repeats.
#' @inheritParams assert_dt
#' @param by The columns used for uniqueness checks. By default, it only relies on
#'  the key columns of `x` to decide duplication. See [data.table::duplicated()]
#'  for more info.
#' @param only.key When `TRUE` and `x` has a key, only the key columns are
#'  returned; otherwise all columns are returned.
#' @note When `x` has a key, duplication is decided by the key columns only,
#'  unless `by` is supplied.
#' @return The duplicated records of `x`.
#' @export
get_duplicates <- function(x, only.key = FALSE, by = key(x)) {
  # Fail fast with the same message as the other exported helpers instead of
  # an unrelated error from the subsetting call below.
  assert_dt(x)
  cols <- if (haskey(x) && only.key) key(x) else colnames(x)
  # Scanning from both ends flags every member of a duplicated group, not only
  # the later repeats.
  dup <- duplicated(x, by = by, fromLast = FALSE) |
    duplicated(x, by = by, fromLast = TRUE)
  x[dup, cols, with = FALSE]
}
shrektan/dtkit documentation built on Nov. 26, 2021, 10:50 p.m.