R/split_var.R

Defines functions split_var_helper split_var_if split_var

Documented in split_var split_var_if

#' @title Split numeric variables into smaller groups
#' @name split_var
#'
#' @description Recode numeric variables into equal sized groups, i.e. a
#'   variable is cut into a smaller number of groups at specific cut points.
#'   \code{split_var_if()} is a scoped variant of \code{split_var()}, where
#'   transformation will be applied only to those variables that match the
#'   logical condition of \code{predicate}.
#'
#' @seealso \code{\link{group_var}} to group variables into equal ranged groups,
#'          or \code{\link{rec}} to recode variables.
#'
#' @param n The new number of groups that \code{x} should be split into.
#' @param inclusive Logical; if \code{TRUE}, cut point value are included in
#'          the preceding group. This may be necessary if cutting a vector into
#'          groups does not define proper ("equal sized") group sizes.
#'          See 'Note' and 'Examples'.
#'
#' @inheritParams to_dummy
#' @inheritParams group_var
#' @inheritParams rec
#'
#' @return A grouped variable with equal sized groups. If \code{x} is a data frame,
#'   for \code{append = TRUE}, \code{x} including the grouped variables as new
#'   columns is returned; if \code{append = FALSE}, only the grouped variables
#'   will be returned. If \code{append = TRUE} and \code{suffix = ""},
#'   recoded variables will replace (overwrite) existing variables.
#'
#' @details \code{split_var()} splits a variable into equal sized groups, where
#'   the amount of groups depends on the \code{n}-argument. Thus, this
#'   functions \code{\link{cut}s} a variable into groups at the specified
#'   \code{\link[stats]{quantile}s}.
#'   \cr \cr
#'   By contrast, \code{\link{group_var}} recodes a variable into groups, where
#'   groups have the same value range (e.g., from 1-5, 6-10, 11-15 etc.).
#'   \cr \cr
#'   \code{split_var()} also works on grouped data frames
#'   (see \code{\link[dplyr]{group_by}}). In this case, splitting is applied to
#'   the subsets of variables in \code{x}. See 'Examples'.
#'
#' @note In case a vector has only few number of unique values, splitting into
#'   equal sized groups may fail. In this case, use the \code{inclusive}-argument
#'   to shift a value at the cut point into the lower, preceeding group to
#'   get equal sized groups. See 'Examples'.
#'
#' @examples
#' data(efc)
#' # non-grouped
#' table(efc$neg_c_7)
#'
#' # split into 3 groups
#' table(split_var(efc$neg_c_7, n = 3))
#'
#' # split multiple variables into 3 groups
#' split_var(efc, neg_c_7, pos_v_4, e17age, n = 3, append = FALSE)
#' frq(split_var(efc, neg_c_7, pos_v_4, e17age, n = 3, append = FALSE))
#'
#' # original
#' table(efc$e42dep)
#'
#' # two groups, non-inclusive cut-point
#' # vector split leads to unequal group sizes
#' table(split_var(efc$e42dep, n = 2))
#'
#' # two groups, inclusive cut-point
#' # group sizes are equal
#' table(split_var(efc$e42dep, n = 2, inclusive = TRUE))
#'
#' # Unlike dplyr's ntile(), split_var() never splits a value
#' # into two different categories, i.e. you always get a clean
#' # separation of original categories
#' library(dplyr)
#'
#' x <- dplyr::ntile(efc$neg_c_7, n = 3)
#' table(efc$neg_c_7, x)
#'
#' x <- split_var(efc$neg_c_7, n = 3)
#' table(efc$neg_c_7, x)
#'
#' # works also with gouped data frames
#' mtcars %>%
#'   split_var(disp, n = 3, append = FALSE) %>%
#'   table()
#'
#' mtcars %>%
#'   group_by(cyl) %>%
#'   split_var(disp, n = 3, append = FALSE) %>%
#'   table()
#' @export
split_var <- function(x, ..., n, as.num = FALSE, val.labels = NULL, var.label = NULL, inclusive = FALSE, append = TRUE, suffix = "_g") {

  # evaluate arguments, generate data
  .dat <- get_dot_data(x, dplyr::quos(...))

  recode_fun(
    x = x,
    .dat = .dat,
    fun = get("split_var_helper", asNamespace("sjmisc")),
    suffix = suffix,
    append = append,
    groupcount = n,
    as.num = as.num,
    var.label = var.label,
    val.labels = val.labels,
    inclusive = inclusive
  )
}


#' @rdname split_var
#' @export
split_var_if <- function(x, predicate, n, as.num = FALSE, val.labels = NULL, var.label = NULL, inclusive = FALSE, append = TRUE, suffix = "_g") {

  # select variables that match logical conditions
  .dat <- dplyr::select_if(x, .predicate = predicate)

  # if no variable matches the condition specified
  # in predicate, return original data

  if (sjmisc::is_empty(.dat)) {
    if (append)
      return(x)
    else
      return(.dat)
  }


  recode_fun(
    x = x,
    .dat = .dat,
    fun = get("split_var_helper", asNamespace("sjmisc")),
    suffix = suffix,
    append = append,
    groupcount = n,
    as.num = as.num,
    var.label = var.label,
    val.labels = val.labels,
    inclusive = inclusive
  )
}


split_var_helper <- function(x, groupcount, as.num, val.labels, var.label, inclusive) {
  # retrieve variable label
  if (is.null(var.label))
    var_lab <- sjlabelled::get_label(x)
  else
    var_lab <- var.label

  # do we have any value labels?
  val_lab <- val.labels

  # amount of "cuts" is groupcount - 1
  zaehler <- seq_len(groupcount - 1)

  # prepare division
  nenner <- rep(groupcount, length(zaehler))

  # get quantiles
  qu_prob <- zaehler / nenner

  # get quantile values
  grp_cuts <- stats::quantile(x, qu_prob, na.rm = TRUE)

  # create breaks. need to check if these are non-unique
  breaks <- unique(c(0, grp_cuts, max(x, na.rm = TRUE)))

  # cut variables into groups
  retval <- cut(
    x = x,
    breaks = breaks,
    include.lowest = !inclusive,
    right = inclusive
  )

  # rename factor levels
  levels(retval) <- seq_len(groupcount)

  # to numeric?
  if (as.num) retval <- sjlabelled::as_numeric(retval)

  # set back variable and value labels
  retval <- suppressWarnings(sjlabelled::set_label(retval, label = var_lab))
  retval <- suppressWarnings(sjlabelled::set_labels(retval, labels = val_lab))

  # return value
  retval
}

Try the sjmisc package in your browser

Any scripts or data that you put into this service are public.

sjmisc documentation built on Dec. 11, 2021, 9:34 a.m.