vartors: Transform Definition of Variables to R Scripts

Documented in shorten tidy_rname

# -*- encoding:ASCII -*-

##################
# function tidy_rname
# Created the 2014-09-26 by Joris Muller
##################

## documentation ------------------------------------------------

#' @title Transform char to rname
#'
#' @description Transform a character vector to another one with good names for data.frames header following several rules.
#'
#'
#' @param char A character vector with the original columns names.
#' @param ascii Boolean. Only produce ASCII char. Default is \code{TRUE}.
#' @param no_dots Boolean. Replace dots by underscores. Default is \code{TRUE}.
#' @param max_size Integer. Number max of character. If 0, no limits. Will not cut words < of 8 letters. Default is 16. Will keep the first letters and the last ones (a least 3)
#' @param no_duplicate Boolean. If there is a duplicate, add a number at the end of the name.
#' @param lowercase Boolean. Change to lower cases. Default is \code{TRUE}.
#' @details The rules applied are :
#' \enumerate{
#'  \item rules from official doc : use the function \code{\link{make.names}}
#'  \item rules of good practice
#'    \itemize{
#'      \item only ASCII character
#'      \item don't use dots because they have a special meaning in R
#'      \item avoid duplicates
#'      \item don't use to long name (less than 16 char).
#'      \item should be only lower cases
#'    }
#'  }
#' @return A character vector.
#' @seealso This function is used by \code{\link{descvars_skeleton}} to produce
#' the column rname. The base \code{\link{make.names}} function does a part of
#' the work.
#' @export
#' @author Joris Muller
#' @examples
#' bad_names <- c("white space", "multiples...dots..", "duplicate",
#'                "duplicate", "$pécial char", "looooooooognameeeeeeee")
#' transformed_names <- tidy_rname(bad_names)
#' transformed_names
#'

tidy_rname <- function(char, ascii = TRUE, no_dots = TRUE, no_duplicate = TRUE,
                     max_size = 16, lowercase = TRUE){

  # char must be a charcter vector
  if(class(char) != "character") stop("char must be a character vector")

  # Always perform a make.names
  clean_names <- make.names(char, unique = no_duplicate)

  # Transform to ASCII if asked
  if (ascii) {
    clean_names <- ascii_text <- iconv(clean_names, to="ASCII//TRANSLIT")

    #remove all unecessary char.
    clean_names <- gsub("[^a-z A-Z 0-9. _]", replacement = "", x = clean_names)

  }

  # Transform to lowercases
  if (lowercase) {
    clean_names <- tolower(clean_names)
  }

  # Remove dots because they are used for non compatible characters
  if (no_dots) {
    clean_names <- gsub(pattern = "\\.+", replacement = "_", x = clean_names)

    # remove dots if at the end
    clean_names <- gsub(pattern = "_+$", replacement = "", x = clean_names)

    # remove dots if at the begining. Because not allowed at the begining.
    clean_names <- gsub(pattern = "^_+", replacement = "", x = clean_names)
  }

  # Shorten the name if too long
  if (max_size > 0) {
    clean_names <- shorten(char = clean_names, max_size = max_size)
  }
  return(clean_names)
}

#' @title Shorten names
#'
#' @description Shorten names in a character vector, keeping the begining and the end
#'
#'
#' @param char A character vector with the names.
#' @param max_size Integer. Number max of character. If 0, no limits.
#' Will not cut words < of 8 letters. Default is 16.
#' Will keep the first letters and the last ones (a least 3)
#' @param tail_keep Integer. Number max of character to keep at the end of
#' the words. If 0, don't keep any character. Must be < to max_size.
#' Default is 3.
#' @details The function keep
#' @return A character vector with shorten names.
#' @seealso This function is used by \code{\link{tidy_rname}} to produce
#' the short enough names.
#' @export
#' @keywords internal
#' @author Joris Muller
#' @examples
#' bad_names <- c("short", "looooooooognameeeeeeee")
#' transformed_names <- shorten(bad_names)
#' transformed_names


shorten <- function(char, max_size = 16, tail_keep = 3) {
  size <- nchar(char)

  new_char <- ifelse(test = (size > max_size), yes = {
    # calculate the number of leading character
    # max_size - tail_keep - 1 (for the _)
    nb_leadings <- max_size - tail_keep - 1
    leadings_chars <- substr(x = char, start = 1, stop = nb_leadings)

    # Calcultate the numbre of tail characters
    # Risk of overlapping if nb_leading + tail_keep + 1 > size
    # TODO: fix it
    tail_chars <- substr(x = char, start = size - tail_keep + 1, stop = size)
     paste(leadings_chars, tail_chars, sep = "_")
  },  no = {
     char
  }
  )

  return(new_char)
}