#' Function to make database fixes after import
#'
#' Use this function to make updates to your data while avoiding adding PHI,
#' such as MRNs, to your code and subsequently PHI in GitHub. You provide a file
#' of database fixes that is three columns: 1. An expression that selects a line
#' in the database to update (e.g. `MRN == "12345678"`), 2. The column name that
#' will be updated, and 3. The updated value. See the examples for the
#' structure of the database fix input.
#'
#' @param data data frame with errors
#' @param engine function to import file of database fixes
#' @param ... arguments passed to the engine function to import the database fixes
#' @return updated data frame
#' @export
#' @examples
#' df_fixes <-
#' tibble::tribble(
#' ~id, ~variable, ~value,
#' "id == 1", "age", "56",
#' "id == 2", "trt", "Drug C"
#' )
#' trial %>%
#' dplyr::mutate(id = dplyr::row_number()) %>%
#' fix_database_error(
#' engine = I,
#' x = df_fixes
#' )
fix_database_error <- function(data, engine = readr::read_csv, ...) {
# importing list of database fixes -------------------------------------------
database_fixes <- do.call(engine, list(...))
# checking inputs ------------------------------------------------------------
# checking the names of the imported database fixes
if (!identical(names(database_fixes), c("id", "variable", "value"))) {
stop("Expecting a database fix file with columns `id`, `variable`, and `value`.",
call. = FALSE)
}
# checking all variables are in `data=`
if (any(!unique(database_fixes$variable) %in% names(data))) {
missing_vars <-
unique(database_fixes$variable)[!unique(database_fixes$variable) %in% names(data)] %>%
{glue::glue_collapse(sQuote(.), sep = ", ", last = ", and ")}
stop(glue::glue("Columns {missing_vars} are not in `data`."), call. = FALSE)
}
# checking that there are no factor variables
# if there are, give an error and suggest using character variables instead
if(purrr::every(data %>%
dplyr::select(dplyr::intersect(database_fixes[["variable"]], names(data))),
~!is.factor(.x)) == FALSE) {
stop(
glue::glue(
"Factor variables cannot be updated. ",
" To update factor variables, convert using `as.character()` first. ",
"The following variables are factors: ",
glue::glue_collapse(
data %>% dplyr::select(dplyr::intersect(database_fixes[["variable"]], names(data))) %>%
dplyr::select_if(is.factor) %>% names(), sep = ", ")
),
call. = FALSE
)
}
# checking there are no duplicates in file
dup_n <- database_fixes %>% dplyr::group_by(.data$id, .data$variable) %>%
dplyr::mutate(n = dplyr::n()) %>% dplyr::pull("n") %>% max()
if (dup_n > 1) {
stop("There are duplicates in the database fixes.", call. = FALSE)
}
# Note... there is no difference between a date and a datetime in POSIX dates.
# The lubridate::as_datetime function gives you a POSIX object (because these can deal with times)
# If there is a time associated with a date this will automatically be included in POSIX
# TODO: Confirm that merging a "date" (from database fixes) with a "datetime" (from main data) works correctly
# setting up fixed data ------------------------------------------
#browser()
database_fixes2 <-
database_fixes %>%
dplyr::mutate(
id_expr = purrr::map(.data$id, rlang::parse_expr),
nrows_modified =
purrr::pmap_int(
list(.data$id_expr, .data$id),
~ check_id_logical(data, ..1, ..2)
),
variable_type = purrr::map_chr(.data$variable, ~ class(data[[.x]])[1]),
value_type = purrr::pmap(
list(.data$id, .data$variable, .data$variable_type, .data$value),
function(id, variable, variable_type, value) {
switch(
variable_type,
"character" = as.character(value),
"logical" = as.logical(value),
"integer" = fix_integer(variable, value, id),
"numeric" = as.numeric(value),
#"factor" = fix_factor($data, variable, value, id),
"POSIXct" = fix_date(variable_type, variable, value, id),
"POSIXlt" = fix_date(variable_type, variable, value, id),
"POSIXt" = fix_date(variable_type, variable, value, id), # TODO: Is this necessary??
"Date" = fix_date(variable_type, variable, value, id),
"difftime" = fix_difftime(data, variable, value)
) %||% stop("ERROR")
}
)
)
database_fixes2
# more checking --------------------------------------------------------------
# Print a message if any IDs don't match
if(any(database_fixes2$nrows_modified) == 0) {
message(
glue::glue(
"The following IDs did not match any rows: ",
glue::glue_collapse(
database_fixes2 %>% dplyr::filter(.data$nrows_modified == 0) %>% dplyr::pull("id"),
sep = ", ")))
}
# checking that each id identifies one row and only one row
if(any(database_fixes2$nrows_modified > 1)) {
stop("Each `id` must be associated with only one observation in the main data.",
call. = FALSE)
}
# checking there is a value type of each correction
values_error <- purrr::map_lgl(database_fixes2$value_type, is.na) %>% sum()
if(sum(values_error) > 0) {
print(database_fixes2[values_error, c("variable", "value")])
stop(paste("Could not identify variable type/class.",
"If the class is a Base R class, please file a GitHub issue."),
call. = FALSE)
}
# make changes in database ---------------------------------------------------
# create updated dataset
data_updated <- data
for (i in 1:nrow(database_fixes2)) {
data_updated[with(data, eval(database_fixes2$id_expr[[i]])), database_fixes2$variable[i]] <-
database_fixes2$value_type[[i]]
}
# checking updated data against old data -------------------------------------
if (!identical(names(data), names(data_updated))) {
stop("Error in column names after updating data.", call. = FALSE)
}
if (!identical(purrr::map_chr(data, ~class(.x)[1]), purrr::map_chr(data_updated, ~class(.x)[1])) |
!identical(purrr::map_chr(data, ~typeof(.x)[1]), purrr::map_chr(data_updated, ~typeof(.x)[1]))) {
stop("Error where column class changed after updating data.", call. = FALSE)
}
# returning updated data ------------------------------------------------------
data_updated
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.