# R/clean_names.R
#
# Defines functions clean_names.tbl_lazy clean_names.tbl_graph clean_names.sf clean_names.default clean_names
#
# Documented in clean_names clean_names.default clean_names.sf clean_names.tbl_graph clean_names.tbl_lazy

#' @title Cleans names of an object (usually a data.frame).
#'
#' @description
#' Resulting names are unique and consist only of the \code{_} character, numbers, and letters.
#' Capitalization preferences can be specified using the \code{case} parameter.
#'
#' Accented characters are transliterated to ASCII.  For example, an "o" with a
#' German umlaut over it becomes "o", and the Spanish character "enye" becomes
#' "n".
#' 
#' This function takes and returns a data.frame, for ease of piping with
#' \code{`\%>\%`}. For the underlying function that works on a character vector
#' of names, see \code{\link[janitor]{make_clean_names}}.  \code{clean_names} 
#' relies on the versatile function \code{\link[snakecase]{to_any_case}}, which 
#' accepts many arguments.  See that function's documentation for ideas on getting 
#' the most out of \code{clean_names}.  A few examples are included below.
#' 
#' A common issue is that the micro/mu symbol is replaced by "m" instead of "u".
#' The replacement with "m" is more correct when doing Greek-to-ASCII
#' transliteration but less correct when doing scientific data-to-ASCII
#' transliteration.  A warning will be generated if the "m" replacement occurs.
#' To replace with "u", please add the argument \code{replace=janitor:::mu_to_u}
#' which is a character vector mapping all known mu or micro Unicode code points
#' (characters) to "u".
#'
#' @param dat the input data.frame.
#' @inheritDotParams make_clean_names -string
#' @return Returns the data.frame with clean names.
#' 
#' @details \code{clean_names()} is intended to be used on \code{data.frames}
#'   and \code{data.frame}-like objects. For this reason there are methods to
#'   support using \code{clean_names()} on \code{sf} and \code{tbl_graph} (from
#'   \code{tidygraph}) objects as well as on database connections through
#'   \code{dbplyr}. For cleaning other named objects like named lists 
#'   and vectors, use \code{make_clean_names()}.
#' 
#' @export
#' @family Set names
#' @examples
#' 
#' # --- Simple Usage ---
#' x <- data.frame(caseID = 1, DOB = 2, Other = 3)
#' clean_names(x)
#'
#' # or pipe in the input data.frame:
#' x %>%
#'   clean_names()
#'
#' # if you prefer camelCase variable names:
#' x %>%
#'   clean_names(., "lower_camel")
#'
#' # (not run) run clean_names after reading in a spreadsheet:
#' # library(readxl)
#' # read_excel("messy_excel_file.xlsx") %>%
#' #   clean_names()
#' 
#' # --- Taking advantage of the underlying snakecase::to_any_case arguments ---
#' 
#' # Restore column names to Title Case, e.g., for plotting
#' mtcars %>%
#'   clean_names(case = "title")
#'   
#' # Tell clean_names to leave certain abbreviations untouched:
#' x %>%
#'   clean_names(case = "upper_camel", abbreviations = c("ID", "DOB")) 
#'   
clean_names <- function(dat, ...) {
  # S3 generic: the method is selected from the class of `dat`, and the
  # original arguments (including `...`) are passed on to that method.
  UseMethod("clean_names")
}

#' @rdname clean_names
#' @export
clean_names.default <- function(dat, ...) {
  current_names <- names(dat)

  # Preferred path: the object carries plain names, so clean those and finish.
  if (!is.null(current_names)) {
    names(dat) <- make_clean_names(current_names, ...)
    return(dat)
  }

  # No names at all: the only remaining option is dimnames (e.g. a matrix).
  if (is.null(dimnames(dat))) {
    stop(
      "`clean_names()` requires that either names or dimnames be non-null.",
      call. = FALSE
    )
  }

  # Clean each component of the dimnames list separately, forwarding `...`.
  dimnames(dat) <- lapply(dimnames(dat), make_clean_names, ...)
  dat
}

#' @rdname clean_names
#' @export
clean_names.sf <- function(dat, ...) {
  if (!requireNamespace("sf", quietly = TRUE)) { # nocov start
    stop(
      "Package 'sf' needed for this function to work. Please install it.",
      call. = FALSE
    )
  } # nocov end

  # Every column except the last one is cleaned; the last column keeps its
  # name (presumably it is the geometry column -- TODO confirm that the
  # geometry column is always last for the inputs this method receives).
  n_cols <- length(dat) - 1L

  # Nothing to clean when there is at most one column.  Without this guard,
  # `1:n_cols` would evaluate to c(1, 0) for a single-column object and rename
  # the very column that is meant to be left untouched (and would fail
  # outright for a zero-column object).
  if (n_cols < 1L) {
    return(dat)
  }

  cols_to_clean <- seq_len(n_cols)
  names(dat)[cols_to_clean] <- make_clean_names(names(dat)[cols_to_clean], ...)

  dat
}

#' @rdname clean_names
#' @export
clean_names.tbl_graph <- function(dat, ...) {
  # tidygraph is an optional dependency; fail early with an install hint.
  tidygraph_available <- requireNamespace("tidygraph", quietly = TRUE)
  if (!tidygraph_available) { # nocov start
    stop(
      "Package 'tidygraph' needed for this function to work. Please install it.",
      call. = FALSE
    )
  } # nocov end

  # Hand the renaming to dplyr, applying make_clean_names() to all names and
  # forwarding any extra arguments to it.
  dplyr::rename_all(
    dat,
    .funs = make_clean_names,
    ...
  )
}

#' @rdname clean_names
#' @export
clean_names.tbl_lazy <- function(dat, ...) {
  # dbplyr is an optional dependency; fail early with an install hint.
  dbplyr_available <- requireNamespace("dbplyr", quietly = TRUE)
  if (!dbplyr_available) { # nocov start
    stop(
      "Package 'dbplyr' needed for this function to work. Please install it.", 
      call. = FALSE
    )
  } # nocov end

  # Rename every column with make_clean_names(), forwarding `...` to it.
  dplyr::rename_with(
    dat,
    janitor::make_clean_names,
    .cols = dplyr::everything(),
    ...
  )
}


# TODO: According to https://www.compart.com/en/unicode/U+03BC reviewed on
# 2021-07-10, there are some UTF-32 encoding characters that are also mu or
# micro.  This only handles the utf-8 values; to add more characters, just add
# to this character vector.

#' Constant to help map from mu to u
#'
#' A named character vector: the names are the known Unicode code points that
#' look like the Greek mu or the micro symbol, and every value is "u".  It is
#' meant to make it easy to map mu or micro to the character "u" with
#' \code{clean_names()} and \code{make_clean_names()}.
#'
#' See the help in \code{clean_names()} for how to use this.
#'
#' @family Set names
mu_to_u <- local({
  # Code points treated as mu/micro look-alikes.
  mu_like <- c(
    "\u00b5", "\u03bc", "\u3382", "\u338c", "\u338d",
    "\u3395", "\u339b", "\u33b2", "\u33b6", "\u33bc"
  )
  # The names are attached with setNames() rather than written as literal
  # names, because that prevents a warning like "unable to translate
  # '<U+3382>' to native encoding" for several of the items.
  setNames(rep("u", length(mu_like)), nm = mu_like)
})

# Try the janitor package in your browser
#
# Any scripts or data that you put into this service are public.
#
# janitor documentation built on Feb. 16, 2023, 10:16 p.m.