# R/clean_names.R
#
# Defines functions: clean_vec, clean_names
#
# Documented in: clean_names, clean_vec

#' Clean the names of a dataset
#'
#' Replaces `names(dataset)` with the result of passing them through
#' [clean_vec()].
#'
#' @param dataset a dataframe
#' @param verbose logical; if `TRUE`, print a two-column data frame
#'   (`old`, `new`) showing each name before and after cleaning
#' @param translit logical; forwarded to [clean_vec()], where it controls
#'   whether non-ASCII characters are transliterated
#'
#' @return `dataset` with its names replaced, returned invisibly
#' @encoding UTF-8
#' @export
#'
#' @examples
#' data(iris)
#' clean_names(iris)
#'
clean_names <- function(dataset,
                        verbose = FALSE,
                        translit = TRUE) {
  before <- names(dataset)
  # clean_vec() is always called silently: the old/new report is printed
  # here instead, once the names have actually been assigned.
  after <- clean_vec(before, verbose = FALSE, translit = translit)
  names(dataset) <- after

  if (verbose) {
    print(data.frame(old = before, new = names(dataset)))
  }

  invisible(dataset)
}

#' Clean character vector
#'
#' Lower-cases `vec`, then applies a sequence of optional and mandatory
#' clean-up steps: de-duplication through `make_unique()`, transliteration
#' with the stringi `"latin-ascii"` transform, `make.names()`, replacement of
#' punctuation runs by `_`, replacement of whitespace runs by `_`, removal of
#' leading and trailing `_`, and collapsing of repeated `_`.
#'
#' @param vec character vector to clean
#'
#' @param verbose logical; if `TRUE`, print a two-column data frame
#'   (`old`, `new`) showing each value before and after cleaning
#' @param unique logical; if `TRUE`, `make_unique()` is applied both right
#'   after lower-casing and again at the very end
#' @param keep_number logical; if `FALSE`, `make.names()` is applied (before
#'   and after the underscore clean-up); if `TRUE`, that step is skipped
#' @param translit logical; if `TRUE`, transliterate with
#'   `stringi::stri_trans_general(vec, "latin-ascii")`
#' @param punct logical; if `TRUE`, every run of punctuation characters is
#'   replaced by a single `_`
#'
#' @return the cleaned vector, returned invisibly
#'
#' @importFrom stringi stri_trans_general
#'
#' @encoding UTF-8
#' @export
clean_vec <- function(vec,
                      verbose = FALSE,
                      unique = TRUE,
                      keep_number = FALSE,
                      translit = TRUE,
                      punct = TRUE) {
  original <- vec
  cleaned <- tolower(vec)

  if (unique) {
    cleaned <- make_unique(cleaned)
  }
  if (translit) {
    cleaned <- stringi::stri_trans_general(cleaned, "latin-ascii")
  }
  if (!keep_number) {
    cleaned <- make.names(cleaned)
  }
  if (punct) {
    # punctuation runs
    cleaned <- gsub("[[:punct:]]+", "_", cleaned, perl = TRUE)
  }

  # whitespace runs
  cleaned <- gsub("[[:space:]]+", "_", cleaned, perl = TRUE)
  # leading underscores
  cleaned <- gsub("^_+", "", cleaned, perl = TRUE)
  # trailing underscores
  cleaned <- gsub("_+$", "", cleaned, perl = TRUE)
  # consecutive underscores
  cleaned <- gsub("_+", "_", cleaned, perl = TRUE)
  cleaned <- tolower(cleaned)

  # Stripping underscores can change what make.names() / make_unique()
  # would return, so both are run a second time on the cleaned values.
  if (!keep_number) {
    cleaned <- make.names(cleaned)
  }
  if (unique) {
    cleaned <- make_unique(cleaned)
  }

  if (verbose) {
    print(data.frame(old = original, new = cleaned))
  }

  invisible(cleaned)
}

#' Clean levels label
#'
#' Cleans the level labels of a factor with [clean_vec()] (called with
#' `unique = FALSE` and `keep_number = TRUE`), then turns every run of
#' underscores back into a single space and strips trailing whitespace.
#'
#' @param vec a factor
#'
#' @param verbose logical; if `TRUE`, print a two-column data frame
#'   (`old`, `new`) showing each level before and after cleaning
#' @param translit logical; forwarded to [clean_vec()], where it controls
#'   whether non-ASCII characters are transliterated
#' @param punct logical; forwarded to [clean_vec()], where it controls
#'   whether punctuation is replaced
#'
#' @return `vec` with its levels replaced by the cleaned labels, returned
#'   invisibly
#'
#' @encoding UTF-8
#' @export
#' @importFrom assertthat assert_that
clean_levels <- function(vec,
                         verbose = FALSE,
                         translit = FALSE,
                         punct = FALSE) {
  assert_that(is.factor(vec))

  old <- levels(vec)
  new <- clean_vec(
    old,
    unique = FALSE,
    keep_number = TRUE,
    translit = translit,
    punct = punct
  )

  # clean_vec() joins words with "_"; level labels are meant to be read,
  # so switch the separators back to spaces.
  new <- gsub("_+", " ", new)
  # A single "\\s+$" pass removes all trailing whitespace; a second
  # "\\s$" pass would never find anything left to remove.
  new <- gsub("\\s+$", "", new)

  if (verbose) {
    print(data.frame(old = old, new = new))
  }

  levels(vec) <- new
  invisible(vec)
}
# ThinkRstat/ThinkR documentation built on Aug. 29, 2022, 6:06 a.m.