R/to_factor.R

Defines functions unlabelled .to_factor_col_data_frame to_factor.data.frame to_factor.haven_labelled to_factor.default to_factor.factor to_factor

Documented in to_factor to_factor.data.frame to_factor.haven_labelled unlabelled

#' Convert input to a factor.
#'
#' The base function [base::as.factor()] is not a generic, but this variant
#' is. By default, `to_factor()` is a wrapper for [base::as.factor()].
#' Please note that `to_factor()` differs slightly from [haven::as_factor()]
#' method provided by \pkg{haven} package.
#'
#' @param x Object to coerce to a factor.
#' @param ... Other arguments passed down to method.
#' @export
to_factor <- function(x, ...) {
  UseMethod("to_factor")
}

#' @export
to_factor.factor <- function(x, ...) {
  x
}

#' @export
to_factor.default <- function(x, ...) {
  vl <- var_label(x)
  x <- as.factor(x)
  var_label(x) <- vl
  x
}

#' @rdname to_factor
#' @param levels What should be used for the factor levels: the labels, the
#' values or labels prefixed with values?
#' @param ordered `TRUE` for ordinal factors, `FALSE` (default) for nominal
#' factors.
#' @param nolabel_to_na Should values with no label be converted to `NA`?
#' @param sort_levels How the factor levels should be sorted? (see Details)
#' @param decreasing Should levels be sorted in decreasing order?
#' @param drop_unused_labels Should unused value labels be dropped?
#'   (applied only if `strict = FALSE`)
#' @param user_na_to_na Convert user defined missing values into `NA`?
#' @param strict Convert to factor only if all values have a defined label?
#' @param unclass If not converted to a factor (when `strict = TRUE`),
#'   convert to a character or a numeric factor by applying [base::unclass()]?
#' @param explicit_tagged_na Should tagged NA (cf. [haven::tagged_na()]) be
#'   kept as explicit factor levels?
#' @details
#'   If some values doesn't have a label, automatic labels will be created,
#'   except if `nolabel_to_na` is `TRUE`.
#'
#'   If `sort_levels == 'values'`, the levels will be sorted according to the
#'   values of `x`.
#'   If `sort_levels == 'labels'`, the levels will be sorted according to
#'   labels' names.
#'   If `sort_levels == 'none'`, the levels will be in the order the value
#'   labels are defined in `x`. If some labels are automatically created, they
#'   will be added at the end.
#'   If `sort_levels == 'auto'`, `sort_levels == 'none'` will be used, except
#'   if some values doesn't have a defined label. In such case,
#'   `sort_levels == 'values'` will be applied.
#' @examples
#' v <- labelled(
#'   c(1, 2, 2, 2, 3, 9, 1, 3, 2, NA),
#'   c(yes = 1, no = 3, "don't know" = 9)
#' )
#' to_factor(v)
#' to_factor(v, nolabel_to_na = TRUE)
#' to_factor(v, "p")
#' to_factor(v, sort_levels = "v")
#' to_factor(v, sort_levels = "n")
#' to_factor(v, sort_levels = "l")
#'
#' x <- labelled(c("H", "M", "H", "L"), c(low = "L", medium = "M", high = "H"))
#' to_factor(x, ordered = TRUE)
#'
#' # Strict conversion
#' v <- labelled(c(1, 1, 2, 3), labels = c(No = 1, Yes = 2))
#' to_factor(v)
#' to_factor(v, strict = TRUE) # Not converted because 3 does not have a label
#' to_factor(v, strict = TRUE, unclass = TRUE)
#' @export
to_factor.haven_labelled <- function(
    x, levels = c(
      "labels", "values",
      "prefixed"
    ), ordered = FALSE, nolabel_to_na = FALSE,
    sort_levels = c("auto", "none", "labels", "values"), decreasing = FALSE,
    drop_unused_labels = FALSE, user_na_to_na = FALSE, strict = FALSE,
    unclass = FALSE, explicit_tagged_na = FALSE,
    ...) {
  vl <- var_label(x)
  levels <- match.arg(levels)
  sort_levels <- match.arg(sort_levels)
  if (user_na_to_na) {
    x <- user_na_to_na(x)
  }
  if (explicit_tagged_na && is.double(x)) {
    new_labels <- to_character(val_labels(x), explicit_tagged_na = TRUE)
    x <- to_character(unclass(x), explicit_tagged_na = TRUE)
    if (any(is.na(new_labels))) { # regular NA with a label
      x[is.na(x)] <- "NA"
      new_labels[is.na(new_labels)] <- "NA"
    }
    val_labels(x) <- new_labels
  } else {
    l <- val_labels(x)
    val_labels(x) <- l[!is.na(l)] # keeping not NA values
  }
  if (strict) {
    allval <- unique(x)
    allval <- allval[!is.na(allval)]
    nolabel <- allval[!allval %in% val_labels(x)]
    if (length(nolabel) > 0) {
      if (unclass) {
        x <- unclass(x)
      }
      return(x)
    }
  }
  if (nolabel_to_na) {
    x <- nolabel_to_na(x)
  }
  labels <- val_labels(x)
  allval <- unique(x)
  allval <- allval[!is.na(allval)]
  nolabel <- sort(allval[!allval %in% labels])
  # if there are some values with no label
  if (length(nolabel) > 0) {
    names(nolabel) <- as.character(nolabel)
    levs <- c(labels, nolabel)
  } else {
    levs <- labels
  }

  if (sort_levels == "auto" && length(nolabel) > 0) {
    sort_levels <- "values"
  }
  if (sort_levels == "labels") {
    levs <- levs[order(names(levs), decreasing = decreasing)]
  }
  if (sort_levels == "values") {
    levs <- sort(levs, decreasing = decreasing)
  }

  if (levels == "labels") {
    labs <- names(levs)
  }
  if (levels == "values") {
    labs <- unname(levs)
  }
  if (levels == "prefixed") {
    labs <- names_prefixed_by_values(levs)
  }
  levs <- unname(levs)
  x <- factor(x,
    levels = levs, labels = labs, ordered = ordered,
    ...
  )
  if (drop_unused_labels) {
    x <- droplevels(x)
  }
  var_label(x) <- vl
  x
}

#' @rdname to_factor
#' @param labelled_only for a data.frame, convert only labelled variables to
#' factors?
#' @details
#'   When applied to a data.frame, only labelled vectors are converted by
#'   default to a factor. Use `labelled_only = FALSE` to convert all variables
#'   to factors.
#' @export
to_factor.data.frame <- function(
    x,
    levels = c("labels", "values", "prefixed"),
    ordered = FALSE,
    nolabel_to_na = FALSE,
    sort_levels = c("auto", "none", "labels", "values"),
    decreasing = FALSE,
    labelled_only = TRUE,
    drop_unused_labels = FALSE,
    strict = FALSE,
    unclass = FALSE,
    explicit_tagged_na = FALSE,
    ...) {
  cl <- class(x)
  x <- dplyr::as_tibble(
    lapply(
      x,
      .to_factor_col_data_frame,
      levels = levels,
      ordered = ordered,
      nolabel_to_na = nolabel_to_na,
      sort_levels = sort_levels,
      decreasing = decreasing,
      labelled_only = labelled_only,
      drop_unused_labels = drop_unused_labels,
      strict = strict,
      unclass = unclass,
      explicit_tagged_na = explicit_tagged_na,
      ...
    )
  )
  class(x) <- cl
  x
}

.to_factor_col_data_frame <- function(
    x,
    levels = c("labels", "values", "prefixed"),
    ordered = FALSE,
    nolabel_to_na = FALSE,
    sort_levels = c("auto", "none", "labels", "values"),
    decreasing = FALSE,
    labelled_only = TRUE,
    drop_unused_labels = FALSE,
    strict = FALSE,
    unclass = FALSE,
    explicit_tagged_na = FALSE,
    ...) {
  if (inherits(x, "haven_labelled")) {
    x <- to_factor(x,
      levels = levels,
      ordered = ordered,
      nolabel_to_na = nolabel_to_na,
      sort_levels = sort_levels,
      decreasing = decreasing,
      drop_unused_labels = drop_unused_labels,
      strict = strict,
      unclass = unclass,
      explicit_tagged_na = explicit_tagged_na,
      ...
    )
  } else if (!labelled_only) {
    x <- to_factor(x)
  }
  x
}

#' @rdname to_factor
#' @description
#' `unlabelled(x)` is a shortcut for
#' `to_factor(x, strict = TRUE, unclass = TRUE, labelled_only = TRUE)`.
#' @details
#' `unlabelled()` is a shortcut for quickly removing value labels of a vector
#' or of a data.frame. If all observed values have a value label, then the
#' vector will be converted into a factor. Otherwise, the vector will be
#' unclassed.
#' If you want to remove value labels in all cases, use [remove_val_labels()].
#' @examples
#'
#' df <- data.frame(
#'   a = labelled(c(1, 1, 2, 3), labels = c(No = 1, Yes = 2)),
#'   b = labelled(c(1, 1, 2, 3), labels = c(No = 1, Yes = 2, DK = 3)),
#'   c = labelled(
#'     c("a", "a", "b", "c"),
#'     labels = c(No = "a", Maybe = "b", Yes = "c")
#'   ),
#'   d = 1:4,
#'   e = factor(c("item1", "item2", "item1", "item2")),
#'   f = c("itemA", "itemA", "itemB", "itemB"),
#'   stringsAsFactors = FALSE
#' )
#' if (require(dplyr)) {
#'   glimpse(df)
#'   glimpse(unlabelled(df))
#' }
#' @export
unlabelled <- function(x, ...) {
  if (is.data.frame(x)) {
    to_factor(x, strict = TRUE, unclass = TRUE, labelled_only = TRUE, ...)
  } else if (inherits(x, "haven_labelled")) {
    to_factor(x, strict = TRUE, unclass = TRUE, ...)
  } else {
    x
  }
}
larmarange/labelled documentation built on June 3, 2024, 11:24 a.m.