R/dataprep_enforce_types.R

Defines functions to_big_int yesno_logical_to_logical na_introduced_warning_handler string_to_as lookup_col_types apply_col_types enforce_types

#' Enforce column types in a \strong{PANDORA table}
#'
#' @param x data.frame (PANDORA table)
#' @param suppress_na_introduced_warnings logical. Suppress warnings caused by data removal in
#' type transformation due to unrecognised columns
#'
#' @return data.frame (PANDORA table) with correct column types
#' 
#' @keywords internal
#' @noRd
enforce_types <- function(x, suppress_na_introduced_warnings = TRUE) {
  
  purrr::map2_df(
    x, 
    names(x), 
    .f = apply_col_types,
    suppress_na_introduced_warnings = suppress_na_introduced_warnings
  )
  
}

#### helpers ####

apply_col_types <- function(col_data, col_name, suppress_na_introduced_warnings) {
  res <- col_data
  # lookup type for variable in hash
  col_type <- lookup_col_types(col_name)
  # get trans function
  col_trans_function <- string_to_as(col_type)
  # transform variable, if trans function is available
  if (!is.null(col_trans_function)) {
    if (suppress_na_introduced_warnings) {
      withCallingHandlers({
        res <- col_trans_function(res) 
      }, warning = na_introduced_warning_handler
      )
    } else 
      res <- col_trans_function(res) 
  }
  return(res)
}

lookup_col_types <- function(col_names) {
  col_type <- rep(NA, length(col_names))
  # check which variables can be looked up
  col_in_hash <- col_names %in% hash::keys(hash_sidora_col_name_col_type)
  # lookup type for variable in hash
  col_type[col_in_hash] <- hash::values(hash_sidora_col_name_col_type, col_names[col_in_hash])
  return(unlist(col_type))
}

string_to_as <- function(x) {
  switch(
    x,
    "integer" = as.integer,
    "big_integer" = to_big_int,
    "double" = as.numeric,
    "factor" = as.factor,
    "logical" = as.logical,
    "character" = as.character,
    "yesno_logical" = yesno_logical_to_logical,
    "datetime" = as.POSIXct,
    NA
  )
}

na_introduced_warning_handler <- function(x) {
  if (any(
    grepl("NAs introduced by coercion", x)
  )) {
    invokeRestart("muffleWarning")
  }
}

yesno_logical_to_logical <- function(x) {
  tolower(x) == "yes"
}

to_big_int <- function(x) {
  # step 1: get a clean character vector that encodes the numbers in a clean notation
  cleaned_character_vector <- sapply(x, function(y) {
    # no data
    if (is.na(y) || y == "" || !grepl("(^[0-9]*$)|([0-9]E\\+)", y)) {
      NA_character_
    # scientific notation
    } else if (grepl("\\+", y)) {
      ss <- strsplit(y, "E\\+")[[1]]
      multiplier <- sub("\\.|\\,", "", ss[1])
      number_of_zeros <- as.integer(ss[2]) - nchar(multiplier) + 1
      number_of_zeros <- ifelse(number_of_zeros < 0, 0, number_of_zeros)
      paste0(
        multiplier, 
        paste(rep("0", number_of_zeros), collapse = "")
      )
    # everything is already alright
    } else {
      y
    }
  })
  # step 2: transform cleaned character vector to integer64
  bit64::as.integer64(cleaned_character_vector)
}
sidora-tools/sidora.core documentation built on May 4, 2024, 12:14 a.m.