R/clean.df.R

Defines functions clean.df

Documented in clean.df

#' Clean the names of a data frame
#'
#' Clean up a data frame by removing spaces and punctuation from all column
#' names and, optionally, renaming a single ID column to `pidm`.
#'
#' @param df a data frame
#' @param keep.underscore a logical value indicating whether underscores
#'   should be kept (`TRUE`) or removed (`FALSE`)
#' @param pidmpattern a regular expression to be matched (case-sensitively)
#'   against the cleaned names of df. Set to `NULL` to turn off pattern
#'   matching.
#'
#' @details Every character that is not a letter, digit or underscore is
#'   removed from the column names; underscores are removed as well when
#'   `keep.underscore` is `FALSE`. The case of the names is preserved
#'   (lower-casing is currently disabled). A single leading i-diaeresis
#'   character (U+00EF) is also dropped from each name.
#'
#'   If no column named `pidm` is found and pidmpattern is not `NULL`, then we
#'   look if there's only one column whose name matches `pidmpattern`. If so,
#'   that column is renamed to `pidm` and a message is emitted. If more than
#'   one column matches, a warning is raised and nothing is renamed. If no
#'   column matches, the names are left as they are.
#'
#' @return a data frame of class tbl_df.
#' @export

clean.df <- function(df, keep.underscore = TRUE,
                     pidmpattern = "bannerid|banner_id") {

  # remove everything except alphanumeric and underscore
  cleaned <- gsub("\\W", "", names(df))

  if (!keep.underscore) {
    # ... and then remove the underscores, too
    cleaned <- gsub("_", "", cleaned)
  }

  # lower-casing is deliberately left switched off, as before:
  # cleaned <- tolower(cleaned)

  # get rid of a leading ï only (presumably the remnant of a mis-decoded
  # byte-order mark); anchoring keeps an ï inside a name intact
  cleaned <- stringr::str_remove(cleaned, "^\u00EF")

  names(df) <- cleaned

  # if there is no column named pidm already, look for a single column
  # matching pidmpattern and use that
  if (!is.null(pidmpattern) && !("pidm" %in% names(df))) {

    is_pidm_col <- grepl(pidmpattern, names(df))
    n_pidmcols <- sum(is_pidm_col)

    if (n_pidmcols > 1) {

      # don't rename if more than one column matches
      warning(n_pidmcols, " columns match pidmpattern regex: ", pidmpattern, ". Possible pidm columns not renamed.")

    } else if (n_pidmcols == 1) {

      message("Renaming ", names(df)[is_pidm_col], " to pidm ...")
      # rename exactly the column that was counted above, so the count and
      # the rename can never disagree
      names(df)[is_pidm_col] <- "pidm"

    }

  }

  dplyr::as_tibble(df)

}
crazybilly/ucbudar documentation built on July 15, 2024, 5:03 p.m.