R/data_cleaning.R

Defines functions freplace_na replace_outliers select_nonzero_rows suniq rm_col_words filter_interval int64_to_char colname_prefix clean2env keep_alphabetic

# Data Cleaning -----------------------------------------------------------

#' Negate %in% operator
`%nin%` <- function (x, table){
  match(x, table, nomatch = 0L) > 0L
}

#' Fast NA repacement for data.frames
#'
#' @return A data.table with 0's instead of NA's.
freplace_na = function(dt) {
  # taken from https://stackoverflow.com/questions/7235657/fastest-way-to-replace-nas-in-a-large-data-table
  # by Matt dowle, creator of data.table
  categoricals <- discard(dt, is.numeric)
  dt <- keep(dt, is.numeric)
  walk(1:ncol(dt), ~set(dt, which(is.na(dt[[.x]])), .x, 0))
  return(cbind(categoricals, dt))
}

#'Outlier replacement on a numeric vector
#' @param x Numeric vector
#' @param replacement Value that will replace outliers. Default is mean(x, na.rm = TRUE)
#' @param limit Value that will identify outliers. Defualt is quantile(x, .99)
#' @param quant Quantile limit for ouliers. Default is .99
replace_outliers <- function(x,  replacement = NULL, limit = NULL, quant = .99){
  if(!is.numeric(x)){
    warning('Vector is not numeric, no replacement was made.')
    return(x)
  }
  if(is.null(replacement)){
    replacement <- mean(x, na.rm = TRUE)
  }
  if(is.null(limit)){
    limit <- quantile(x, quant)
  }
  x[x > limit] <- replacement
  return(x)
}

#'Selects only the rows where the sum of numeric variables is not zero
select_nonzero_rows <- function(dt){
  dt[rowSums(keep(dt, is.numeric)) != 0]
}

#' Unique sorted values of a vector
#'
#' @param x A vector
#' @return A vector containing the unique values of x, sorted.
suniq <- function(x){x %>% unique() %>% sort()}

#' Removes strings from column names
#' @param string Character string to be removed fromcolumn names.
rm_col_words <- function(dt, string){
  colnames(dt) <- colnames(dt) %>% stringr::str_remove(string)
  return(dt)
}

#' Filters data.frame to keep only data between two dates (inclusive)
#' @param dt data.frame with at least one Date column.
#' @param date_col Character string specifying the name of the column of dt.
#' @param start_date Date object.
#' @param end_date Date object.
filter_interval <- function(dt, datecol, start_date, end_date){
  setDT(dt)
  # matching exact dates is considerably faster that using between()
  interval <- seq(start_date, end_date, by = 'days')
  dt[(datecol) := ymd(..datecol)]
  dt <- dt[..datecol %in% interval]
}

#'Convert all int64 columns to character
int64_to_char <- function(dt){
  setDT(dt)
  cols64 <- colnames(dt)[map_chr(dt, class) == 'integer64']
  dt[,(cols64):= lapply(.SD, as.character), .SDcols = cols64]
}

#' Add a prefix to column names
#' @param prefix character string to add to dt's column names
#' @param exclude character vector specifying the names of the columns
#' that won't have the prefix added.
colname_prefix <- function(dt, prefix, sep = '_', exclude = NULL){
  if(is.null(exclude)){
    colnames(dt) <- paste(prefix, colnames(dt), sep = sep)
  }else{
    change_cols <- !grepl(exclude, colnames(dt))
    colnames(dt)[change_cols] <- paste(prefix, colnames(dt)[change_cols], sep = sep)
  }
  return(dt)
}

# assigns each element of a list into the global environment, with clean names and a prefix
clean2env <- function(data_list, prefix = NULL, envir = .GlobalEnv){
    if (!requireNamespace("janitor", quietly = TRUE)) {
      stop("Package \"janitor\" needed for this function to work. Please install it, or use list2env() instead",
           call. = FALSE)
    }
 data_list <- data_list %>%
    purrr::walk(janitor::clean_names)

  if(!is.null(prefix)){
    data_list <- data_list %>% purrr::set_names(paste(prefix, names(data_list), sep =  '_'))
  }
  print(names(data_list))
  data_list %>% list2env(envir)
}

#' Removes punctuation, double spacing and numbers from strings
#' @param x Character vector
keep_alphabetic <- function(x){
  x %>%
    stringr::str_remove_all("[!#$%&'()*+,'-./?:;<=>?@^_`{|}~+]+") %>%
    stringr::str_remove_all('"') %>%
    stringr::str_remove_all('[[:digit:]]+') %>%
    stringr::str_remove_all('\\s{2,}') %>%
    stringr::str_trim()
}
pheymanss/dq documentation built on March 12, 2020, 1:29 a.m.