R/clean_colnames.R

Defines functions print_duplicated_columns clean_colnames

Documented in clean_colnames

## Internal helper: list the values that occur more than once in `x`.
##
## Occurrences are counted with table(); the names of the entries seen
## at least twice are returned as a single string, separated by ", ".
## The result is "" when no value is repeated.
print_duplicated_columns <- function(x) {
  counts <- table(x)
  repeated <- names(counts)[as.vector(counts) > 1]
  paste(repeated, collapse = ", ")
}

##' Clean column names
##'
##' It cleans column names (my relative definition of
##' 'clean'). Specifically, it inserts an underscore at lower-to-upper
##' case boundaries (e.g., "pikaPika" to "pika_Pika"), replaces runs of
##' blank characters with a single underscore, replaces runs of
##' punctuation (as matched by the `[:punct:]` class) with a single
##' underscore, removes trailing underscores, and converts the result
##' to lower case. These steps are followed IN THAT ORDER.
##'
##' A warning listing the affected names is raised whenever the cleaned
##' names are not unique, regardless of whether the duplicates were
##' already present in the input or were produced by the cleaning.
##' @importFrom stringr str_replace_all str_to_lower
##' @param column_names A vector with column names
##' @return A vector with clean names, in the same order as
##'   `column_names`
##' @examples
##' clean_colnames(c("bart Simpson", "LisaSimpson", "maggie..simpson!",
##'                  "MARGE-Simpson", "Homer Simpson :-)"))
##'
##' ## Get warning if there are repeated column names
##' \dontrun{
##' clean_colnames(c("bart  Simpson", "LisaSimpson",
##'                  "maggie..simpson!", "MARGE-Simpson", "bart-Simpson",
##'                  "Homer Simpson :-)"))
##' }
##'
##' @author Guillermo Basulto-Elias
##' @export
clean_colnames <- function(column_names) {
  out <- column_names  %>%
    ## Split camel case: "pikaPika" to "pika_Pika"
    str_replace_all("([a-z]+)([A-Z])", "\\1_\\2") %>%
    ## Replaces blank spaces by an underscore
    str_replace_all("[:blank:]+", "_") %>%
    ## Replace punctuation by underscore
    str_replace_all("[:punct:]+", "_") %>%
    ## Remove underscore at the end
    str_replace_all("_+$", "") %>%
    ## From upper to lower case
    str_to_lower()

  ## Send warning message if there are repeated column names.
  ## The check is made on the cleaned names themselves: comparing the
  ## number of unique names before and after cleaning would miss
  ## duplicates that were already present in the input.
  if (anyDuplicated(out) > 0L) {
    warning(
      paste("The following column names are duplicated :",
            print_duplicated_columns(out))
    )
  }

  out
}
gbasulto/rmiscfun documentation built on July 25, 2019, 8:56 p.m.