Nothing
#' Recode New Variables With Formats
#'
#' @name recode
#'
#' @description
#' Instead of writing multiple if-clauses to recode values into a new variable,
#' you can use formats to recode a variable into a new one.
#'
#' @param data_frame A data frame which contains the the original variables to recode.
#' @param new_var The name of the newly created and recoded variable.
#' @param ... [recode()] Pass in the original variable name that should be recoded
#' along with the corresponding format container in the form: variable = format.
#'
#' In [recode_multi()] multiple variables can be recoded in one go and multilabels
#' can be applied. This overwrites the original variables and duplicates rows if
#' multilabels are applied. In occasions were you want to use format containers to
#' afterwards perform operations with other packages, you can make use of this
#' principle with this function.
#'
#' @details
#' [recode()] is based on the 'SAS' function put(), which provides an efficient
#' and readable way, to generate new variables with the help of formats.
#'
#' When creating a format you can basically write code like you think: This new
#' category consists of these original values. And after that you just apply these
#' new categories to the original values to create a new variable. No need for multiple
#' if_else statements.
#'
#' @return
#' Returns a data frame with the newly recoded variable.
#'
#' @seealso
#' Creating formats: [discrete_format()] and [interval_format()].
#'
#' Functions that also make use of formats: [frequencies()], [crosstabs()],
#' [any_table()].
#'
#' @examples
#' # Example formats
#' age. <- discrete_format(
#' "under 18" = 0:17,
#' "18 to under 25" = 18:24,
#' "25 to under 55" = 25:54,
#' "55 to under 65" = 55:64,
#' "65 and older" = 65:100)
#'
#' # Example data frame
#' my_data <- dummy_data(1000)
#'
#' # Call function
#' my_data <- my_data |> recode("age_group1", age = age.)
#'
#' # Formats can also be passed as characters
#' my_data <- my_data |> recode("age_group2", age = "age.")
#'
#' # Multilabel recode
#' sex. <- discrete_format(
#' "Total" = 1:2,
#' "Male" = 1,
#' "Female" = 2)
#'
#' income. <- interval_format(
#' "Total" = 0:99999,
#' "below 500" = 0:499,
#' "500 to under 1000" = 500:999,
#' "1000 to under 2000" = 1000:1999,
#' "2000 and more" = 2000:99999)
#'
#' multi_data <- my_data |> recode_multi(sex = sex., income = income.)
#'
#' @rdname recode
#'
#' @export
recode <- function(data_frame,
new_var,
...){
# Measure the time
start_time <- Sys.time()
# Convert to character
new_var <- gsub("\"", "", deparse(substitute(new_var)))
if (new_var %in% names(data_frame)){
message(" X ERROR: Variable '", new_var, "' already exists. Recoding aborted, variable won't be overwritten.")
return(data_frame)
}
# Translate ... into separately controllable arguments
formats_list <- list(...)
# Evaluate formats early, otherwise apply formats can't evaluate them in unit
# test situation.
formats <- stats::setNames(
lapply(formats_list, function(expression){
# Catch expression if passed as string
if (is.character(expression)) {
tryCatch(get(expression, envir = parent.frame()),
error = function(e) NULL)
}
# Catch expression if passed as symbol
else{
tryCatch(eval(expression, envir = parent.frame()),
error = function(e) NULL)
}
}),
names(formats_list))
# Get information from ... list
current_var <- names(formats)[1]
format_df <- formats[[current_var]]
if (!current_var %in% names(data_frame)){
message(" X ERROR: Variable '", current_var, "' not found in the input data frame. No format will be applied.")
return(data_frame)
}
if (!data.table::is.data.table(format_df)){
message(" X ERROR: The format for '", current_var, "' must be a data table. No format will be applied.")
return(data_frame)
}
if (names(format_df)[1] == "value" && any(duplicated(format_df[["value"]]))){
message(" ! WARNING: The format for '", current_var, "' is a multilabel. A multilabel can't be fully applied in recode.\n",
" Only one of the matching categories will be applied.")
format_df <- format_df |> unique(by = "value", fromLast = FALSE)
}
if (is.factor(data_frame[[current_var]])){
message(" ~ NOTE: '", current_var, "' is a factor variable. Formats only work if the visible character values\n",
" are specified as input values and not the factor levels.")
}
# Look up variable names in format data frame to check whether it is an
# interval or discrete format
interval_variables <- c("from", "to")
actual_variables <- names(format_df)[1:2]
# In case of interval format
if (identical(interval_variables, actual_variables)){
# Remove NA values
if (any(is.na(data_frame[[current_var]]))){
message(" ! WARNING: Variable '", current_var, "' has NA values. Interval merge only works without NA values.\n",
" NA values will be removed.")
}
data_frame <- data_frame |>
collapse::fsubset(!is.na(data_frame[[current_var]]))
# Get number of rows from data frame to compare after the merge to check for multilabel
original_rows <- nrow(data_frame)
# Generate pseudo variables for range merging
data_frame[["qol_from"]] <- data_frame[[as.character(current_var)]]
data_frame[["qol_to"]] <- data_frame[[as.character(current_var)]]
data_frame[["qol_ID"]] <- seq_len(nrow(data_frame))
# Set key variables
temp_dt <- data.table::as.data.table(data_frame)
format_dt <- data.table::copy(format_df)
data.table::setkey(temp_dt, qol_from, qol_to)
data.table::setkey(format_dt, from, to)
# Merge data frame with format by range
data_frame <- data.table::foverlaps(temp_dt, format_dt,
by.x = c("qol_from", "qol_to"),
by.y = c("from", "to")) |>
collapse::frename(stats::setNames("label", new_var)) |>
data.table::setorder(qol_ID) |>
unique(by = "qol_ID") |>
dropp("qol_ID", "qol_from", "qol_to", "from", "to")
data_frame <- data_frame |>
data.table::setcolorder(new_var, after = ncol(data_frame))
if (nrow(data_frame) > original_rows){
message(" ! WARNING: The format for '", current_var, "' is a multilabel. For interval formats this leads to\n",
" doubling observations.")
}
}
# In case of discrete format
else{
# Rename label column to be specific to the variable
format_df <- format_df |>
collapse::frename(stats::setNames("value", current_var))
# Join format with data frame
data_frame <- data_frame |>
collapse::join(format_df,
on = current_var,
how = "left",
verbose = FALSE) |>
collapse::frename(stats::setNames("label", new_var))
}
end_time <- round(difftime(Sys.time(), start_time, units = "secs"), 3)
message("- - - 'recode' execution time: ", end_time, " seconds")
data_frame
}
#' @rdname recode
#'
#' @export
recode_multi <- function(data_frame, ...){
# Measure the time
start_time <- Sys.time()
# Translate ... into separately controllable arguments
formats_list <- list(...)
# Evaluate formats early, otherwise apply formats can't evaluate them in unit
# test situation.
formats <- stats::setNames(
lapply(formats_list, function(expression){
# Catch expression if passed as string
if (is.character(expression)) {
tryCatch(get(expression, envir = parent.frame()),
error = function(e) NULL)
}
# Catch expression if passed as symbol
else{
tryCatch(eval(expression, envir = parent.frame()),
error = function(e) NULL)
}
}),
names(formats_list))
# Get information from ... list
variables <- names(formats)
var_order <- names(data_frame)
data_frame <- data_frame |>
apply_format(formats, variables) |>
data.table::setcolorder(var_order)
end_time <- round(difftime(Sys.time(), start_time, units = "secs"), 3)
message("- - - 'recode' execution time: ", end_time, " seconds")
data_frame
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.