R/clean.R

Defines functions data_rm_zeros data_rm_missings data_clean_default data_clean data_prepare

Documented in data_clean data_clean_default data_prepare data_rm_missings data_rm_zeros

#' Prepare data for calculation
#'
#' Clean data, check column selection, remove cases with missing values
#'
#' @keywords internal
#'
#' @param data Data frame to be prepared.
#' @param cols The first column selection.
#' @param cross The second column selection.
#' @param cols.categorical A tidy selection of columns to be checked for categorical values.
#' @param cols.numeric A tidy selection of columns to be converted to numeric values.
#' @param cols.reverse A tidy selection of columns with reversed codings.
#' @param clean Whether to clean data using \link{data_clean}.
#'
#' @return Prepared data frame.
#' @examples
#' data <- volker::chatgpt
#' data_prepare(data, sd_age, sd_gender)
#'
#' @export
#'
data_prepare <- function(data, cols, cross, cols.categorical, cols.numeric, cols.reverse, clean = TRUE) {
  # 1. Checks
  check_is_dataframe(data)
  check_has_column(data, {{ cols }})

  if (!missing(cross)) {
      check_has_column(data, {{ cross }})
  }

  # 2. Apply cleaning plan
  if (clean) {
    data <- data_clean(data, clean)
  }

  # 3. Convert to numeric
  if (!missing(cols.numeric)) {
    data <- data_num(data, {{ cols.numeric }})
  }

  # 4. Remove missings
  if (!missing(cross)) {
    data <- data_rm_missings(data, c({{ cols }}, {{ cross }}))
  } else {
    data <- data_rm_missings(data, {{ cols }})
  }

  # 5. Reverse items
  if (!missing(cols.reverse)) {
    data <- data_rev(data, {{ cols.reverse }})
  }

  # 6. Check categorical values
  if (!missing(cols.categorical)) {
    check_is_categorical(data, {{ cols.categorical }})
    data <- data_cat(data, {{ cols.categorical }})
  }

  # # 6. Remove negatives
  # if (isTRUE(rm.negatives) & !missing(cross)) {
  #   data <- data_rm_negatives(data, c({{ cols }}, {{ cross }}))
  # }
  # else if (isTRUE(rm.negatives) & missing(cross)) {
  #   data <- data_rm_negatives(data, {{ cols }})
  # }
  # else if (rm.negatives == "cols") {
  #   data <- data_rm_negatives(data, {{ cols }})
  # }

  check_is_dataframe(data)
  attr(data, "cases") <- nrow(data)
  data
}

#' Prepare dataframe for the analysis
#'
#' Depending on the selected cleaning plan, for example,
#' recodes residual values to NA.
#'
#' The tibble remembers whether it was already cleaned and
#' the cleaning plan is only applyed once in the first call.
#'
#' @keywords internal
#'
#' @param data Data frame.
#' @param plan The cleaning plan. By now, only "default" is supported. See \link{data_clean_default}.
#' @param ... Other parameters passed to the appropriate cleaning function.
#' @return Cleaned data frame with vlkr_df class.
#' @examples
#' ds <- volker::chatgpt
#' ds <- data_clean(ds)
#' @export
data_clean <- function(data, plan = "default", ...) {

  # Prepare only once
  if ("vlkr_df" %in% class(data)) {
    return (data)
  }

  if (isTRUE(plan)) {
    plan <- "default"
  }

  if (plan == "default") {
    data <- data_clean_default(data,...)
  }

  .to_vlkr_df(data)
}

#' Prepare data originating from SoSci Survey or SPSS
#'
#' Preparation steps:
#' - Remove the avector class from all columns
#'   (comes from SoSci and prevents combining vectors)
#' - Recode residual factor values to NA (e.g. "[NA] nicht beantwortet")
#' - Recode residual numeric values to NA (e.g. -9)
#'
#' The tibble remembers whether it was already prepared and
#' the operations are only performed once in the first call.
#'
#' @keywords internal
#'
#' @param data Data frame
#' @param remove.na.levels Remove residual values from factor columns.
#'                      Either a character vector with residual values or TRUE to use defaults in \link{VLKR_NA_LEVELS}.
#'                      You can also define or disable residual levels by setting the global option vlkr.na.levels
#'                      (e.g. `options(vlkr.na.levels=c("Not answered"))` or to disable `options(vlkr.na.levels=FALSE)`).
#' @param remove.na.numbers Remove residual values from numeric columns.
#'                      Either a numeric vector with residual values or TRUE to use defaults in \link{VLKR_NA_NUMBERS}.
#'                      You can also define or disable residual values by setting the global option vlkr.na.numbers
#'                      (e.g. `options(vlkr.na.numbers=c(-2,-9))` or to disable `options(vlkr.na.numbers=FALSE)`).
#' @return Data frame with vlkr_df class (the class is used to prevent double preparation).
#' @examples
#' ds <- volker::chatgpt
#' ds <- data_clean_default(ds)
#' @export
data_clean_default <- function(data, remove.na.levels = TRUE, remove.na.numbers = TRUE) {

  # Prepare only once
  if ("vlkr_df" %in% class(data)) {
    return (data)
  }

  # Remove avector class
  for (i in c(1:ncol(data))) {
    class(data[[i]]) <- setdiff(class(data[[i]]), "avector")
  }

  # Add missing residual labels to numeric columns that have at least one label
  data <- labs_impute(data)

  # Store codebook before mutate operations
  data <- labs_store(data)

  # Remove residual levels such as "[NA] nicht beantwortet"
  if ((length(remove.na.levels) > 0) | any(remove.na.levels != FALSE)) {
    data <- data_rm_na_levels(data, remove.na.levels)
  }

  # Remove residual numbers such as -9
  # (but only if they are listed in the attributes of a column)
  if ((length(remove.na.numbers) > 0) | any(remove.na.numbers != FALSE)) {
    data <- data_rm_na_numbers(data, remove.na.numbers)
  }

  # Restore codebook
  data <- labs_restore(data)

  .to_vlkr_df(data)
}

#' Remove missings and output a message
#'
#' @keywords internal
#'
#' @param data Data frame.
#' @param cols A tidy column selection.
#' @param force By default, cases with missings are only removed
#'             when the vlkr.na.omit option is TRUE.
#'             Set force to TRUE to always remove such cases.
#' @return Data frame.
data_rm_missings <- function(data, cols, force = FALSE) {

  cleaned <- tidyr::drop_na(data, {{ cols }})
  cases <-  nrow(data) - nrow(cleaned)

  if (cases > 0) {

    na.omit <- force || dplyr::coalesce(getOption("vlkr.na.omit"), VLKR_NA_OMIT)
    colnames <- rlang::as_label(rlang::enquo(cols))

    if (na.omit == TRUE) {
      data <- cleaned
    }

    data <- .attr_insert(data, "missings", "na", list("cols" = colnames, "n" = cases, "omit" = na.omit))
  }

  data
}

#' Remove zero values, drop missings and output a message
#'
#' @keywords internal
#'
#' @param data Data frame.
#' @param cols A tidy column selection.
#' @return Data frame.
data_rm_zeros <- function(data, cols) {

  cleaned <- data |>
    labs_store() |>
    dplyr::mutate(dplyr::across({{ cols }}, ~ dplyr::if_else(. == 0, NA, .))) |>
    labs_restore() |>
    tidyr::drop_na({{ cols }})


  cases <-  nrow(data) - nrow(cleaned)

  if (cases > 0) {
    data <- cleaned
    colnames <- rlang::as_label(rlang::enquo(cols))
    data <- .attr_insert(data, "missings", "zero", list("cols" = colnames, "n" = cases))
  }

  data
}

#' Remove negatives and output a warning
#'
#' @keywords internal
#'
#' @param data Data frame
#' @param cols A tidy column selection
#' @return Data frame
data_rm_negatives <- function(data, cols) {

  data_clean <- data |>
    labs_store() |>
    dplyr::mutate(dplyr::across({{ cols }}, ~ ifelse(. < 0, NA, .))) |>
    labs_restore() |>
    #TODO: only drop rows that had negatives, not all missings
    tidyr::drop_na({{ cols }})

  cases <-  nrow(data) - nrow(data_clean)

  if (cases > 0) {
    data <- data_clean
    colnames <- rlang::as_label(rlang::enquo(cols))
    data <- .attr_insert(data, "missings", "negative", list("cols" = colnames, "n"=cases))
  }

  data
}


#' Remove NA levels
#'
#' @keywords internal
#'
#' @param data Data frame
#' @param na.levels Residual values to remove from factor columns.
#'                  Either a character vector with residual values or TRUE to use defaults in \link{VLKR_NA_LEVELS}.
#'                  You can define default residual levels by setting the global option vlkr.na.levels
#'                  (e.g. `options(vlkr.na.levels=c("Not answered"))`).
#' @param default The default na levels, if not explicitly provided by na.levels or the options.
#' @return Data frame
data_rm_na_levels <- function(data, na.levels = TRUE, default = VLKR_NA_LEVELS) {
  if (is.logical(na.levels)) {
    na.levels <- getOption("vlkr.na.levels")
    if (is.null(na.levels)) {
      na.levels <- default
    } else if (all(na.levels == FALSE)) {
      na.levels <- c()
    }
  }

  dplyr::mutate(
    data,
    dplyr::across(
      tidyselect::where(~ is.factor(.)),
      ~ .factor_with_attr(replace(., . %in% na.levels, NA),setdiff(levels(.), na.levels))
    )
  )

}

#' Remove NA numbers
#'
#' @keywords internal
#'
#' @param data Data frame
#' @param na.numbers Either a numeric vector with residual values or TRUE to use defaults in \link{VLKR_NA_NUMBERS}.
#'                   You can also define residual values by setting the global option vlkr.na.numbers
#'                   (e.g. `options(vlkr.na.numbers=c(-9))`).
#' @param check.labels Whether to only remove NA numbers that are listed in the attributes of a column.
#' @param default The default na numbers, if not explicitly provided by na.numbers or the options.
#' @return Data frame
data_rm_na_numbers <- function(data, na.numbers = TRUE, check.labels = TRUE, default = VLKR_NA_NUMBERS) {
  if (is.logical(na.numbers)) {
    na.numbers <- cfg_get_na_numbers(default)
  }

  data %>%
    dplyr::mutate(
      dplyr::across(
        dplyr::where(is.numeric),
        ~ dplyr::if_else(
          . %in% na.numbers &
            (
              !check.labels |
                (as.character(.) %in% names(attributes(.))) |
                (as.character(.) %in% as.character(attr(., "labels", TRUE)))
            ),
          NA,
          .
        )
      )
    )
}

#' Reverse item values
#'
#' @keywords internal
#'
#' @param data A data frame containing the items to be reversed.
#' @param cols A tidy selection of columns to reverse.
#'             For example, if you want to calculate an index of the
#'             two items "I feel bad about this" and "I like it",
#'             both coded with 1=not at all to 5=fully agree,
#'             you need to reverse one of them to make the
#'             codings compatible.
#' @return A data frame with the specified items reversed.
data_rev <- function(data, cols) {

  # Get limits
  limits <- get_limits(data, {{ cols }})

  rev_eval <- tidyselect::eval_select(expr = enquo(cols), data = data)
  for (rev_col in rev_eval) {
    data[[rev_col]] <- (limits[2] - data[[rev_col]]) + limits[1]
  }

  data
}

#' Convert values to numeric values
#'
#' @keywords internal
#'
#' @param data A data frame containing the items to be converted.
#' @param cols A tidy selection of columns to convert.
#' @return A data frame with the converted values
data_num <- function(data, cols) {

  # Get limits
  #limits <- get_limits(data, {{ cols }})

  cols_eval <- tidyselect::eval_select(expr = enquo(cols), data = data)
  for (col in cols_eval) {
    old_attr <- attributes(data[[col]])
    old_attr[c("class", "levels")] <- NULL
    data[[col]] <- as.numeric(data[[col]])
    attributes(data[[col]]) <- old_attr

  }

  data
}

#' Convert numeric values to string
#'
#' @keywords internal
#'
#' @param data A data frame containing the items to be converted.
#' @param cols A tidy selection of columns to convert.
#' @return A data frame with the converted values
data_cat <- function(data, cols) {

  cols_eval <- tidyselect::eval_select(expr = enquo(cols), data = data)
  for (col in cols_eval) {
    if (is.numeric(data[[col]])) {
      old_attr <- attributes(data[[col]])
      old_attr[c("class", "levels")] <- NULL
      data[[col]] <- as.character(data[[col]])
      attributes(data[[col]]) <- old_attr
    }
  }

  data
}

#' Round and format selected numeric columns
#'
#' Round and format specified numeric columns in a data frame
#' to a fixed number of decimal places.
#'
#' For each selected numeric column:
#' \itemize{
#'   \item The column is rounded using \code{round()}.
#'   \item It is then formatted as a character vector with a fixed number of
#'         decimal places using \code{sprintf()}.
#'   \item Missing values (\code{NA}) are preserved as \code{NA_character_}.
#'   \item Original non-essential attributes (except \code{class} and
#'         \code{levels}) are restored.
#' }
#'
#' @keywords internal
#'
#' @param data A data frame or tibble.
#' @param cols A tidyselect expression specifying which columns to round
#'   (e.g., \code{c(var1, var2)} or \code{starts_with("score")}).
#' @param digits Integer; number of decimal places to round.
#' @return The input data frame, with the specified numeric columns rounded
#'   and formatted as character vectors.
data_round <- function(data, cols, digits) {
  cols_eval <- tidyselect::eval_select(expr = enquo(cols), data = data)
  for (col in cols_eval) {
    if (is.numeric(data[[col]])) {
      old_attr <- attributes(data[[col]])
      old_attr[c("class", "levels")] <- NULL
      data[[col]] <- round(data[[col]], digits)
      data[[col]] <- sprintf(paste0("%.", digits, "f"), data[[col]])
      data[[col]][data[[col]] == "NA"] <- NA_character_
      attributes(data[[col]]) <- old_attr
    }
  }
  data
}

#' One-hot encode selected columns
#'
#' @keywords internal

#' @param data A data frame or tibble.
#' @param ... Tidyselect expressions specifying columns to one-hot encode
#'
#' @return Data frame with one hot encoded data
data_onehot <- function(data, ...) {

  cols_eval <- tidyselect::eval_select(expr = rlang::expr(c(...)), data = data)
  selected_cols <- names(cols_eval)

  df_selected <- data[, selected_cols, drop = FALSE]

  df_selected[] <- lapply(df_selected, function(x) paste0("_", as.character(x)))
  mm <- stats::model.matrix(~ . - 1, data = df_selected)
  mm <- mm == 1

  mm <- dplyr::bind_cols(
    dplyr::select(data, -dplyr::all_of(selected_cols)),
    as.data.frame(mm)
  )

  mm
}

#' Get a formatted baseline from attributes of an object.
#'
#' The following attributes are considered:
#' - cases: Number of cases.
#' - missing: Removed zero, negative, and missing cases.
#' - focus: Focus category.
#' - auto: The k value of cluster methods.
#' - reversed: A list of reversed items.
#' - split: Items that were split at the median.
#' - adjust: The adjustment method for p values.
#'
#'
#' @keywords internal
#'
#' @param obj An object with supported attributes.
#' @param ignore Characer vector of attributes to ignore.
#' @return A formatted message or NULL if none of the supported attributes is present.
get_baseline <- function(obj, ignore = c()) {

  for (attrname in ignore) {
    attr(obj, attrname) <- NULL
  }

  baseline <- c()

  # Cases
  cases <- attr(obj, "cases", exact = TRUE)
  if (!is.null(cases)) {
    baseline <- c(baseline, paste0("n=", cases,"."))
  }

  # Focus categories
  focus <- attr(obj, "focus", exact = TRUE)
  if (!is.null(focus)) {
    baseline <- c(baseline, paste0("Frequencies based on values: ", paste(focus, collapse=", "), "."))
  }

  # Automatically selected k values
  auto <- attr(obj, "auto", exact = TRUE)
  if(!is.null(auto)) {
    baseline <- c(baseline, auto$msg)
  }

  # Reversed items
  reversed <- attr(obj, "reversed", exact=TRUE)
  if (!is.null(reversed)) {
    baseline <- c(baseline, paste0("Reversed items: ", paste(reversed, collapse = ", "), "."))
  }

  # Split
  split <- attr(obj, "split", exact = TRUE)
  if (!is.null(split)) {
    baseline <- c(baseline, paste0(split, "."))
  }

  # Missings
  missings <- attr(obj, "missings", exact = TRUE)
  if (!is.null(missings)) {
    cols <- c()
    baseline_missing <- c()
    if (!is.null(missings$na)) {
      postfix <- ifelse(isTRUE(missings$na$omit), " missing", " case(s) with missing")
      baseline_missing <- c(baseline_missing, paste0(missings$na$n,postfix))
    }

    if (!is.null(missings$zero)) {
      postfix <- ifelse(isTRUE(missings$na$omit), " zero", " case(s) with zero")
      baseline_missing <- c(baseline_missing, paste0(missings$zero$n, postfix))
    }

    if (!is.null(missings$negative)) {
      postfix <- ifelse(isTRUE(missings$na$omit), " negative", " case(s) with negative")
      baseline_missing <- c(baseline_missing, paste0(missings$negative$n,postfix))
    }

    if (isTRUE(missings$na$omit)) {
      baseline <- c(baseline, paste0(paste0(baseline_missing, collapse = ", "), " case(s) omitted."))
    } else {
      baseline <- c(baseline, paste0(paste0(baseline_missing, collapse = ", "), " values."))
    }
  }

  # Adjust
  adjust <- attr(obj, "adjust", exact = TRUE)
  if (!is.null(adjust)) {
    if (adjust != FALSE) {
      baseline <- c(baseline, paste0("Adjusted significance p values with ", adjust, " method."))
    }
  }


  # Assemble baseline
  if (length(baseline) > 0) {
    baseline = paste0(baseline, collapse = " ")
  } else {
    baseline <- NULL
  }

  baseline
}

#' Add vlkr_df class - that means, the data frame has been prepared
#'
#' @keywords internal
#'
#' @param data A tibble.
#' @return A tibble of class vlkr_df.
.to_vlkr_df <- function(data, digits = NULL) {
  data <- dplyr::as_tibble(data)
  class(data) <- c("vlkr_df", setdiff(class(data), "vlkr_df"))
  data
}

Try the volker package in your browser

Any scripts or data that you put into this service are public.

volker documentation built on Nov. 5, 2025, 5:21 p.m.