# Data Cleaning -----------------------------------------------------------
#' Negate %in% operator
`%nin%` <- function (x, table){
match(x, table, nomatch = 0L) > 0L
}
#' Fast NA repacement for data.frames
#'
#' @return A data.table with 0's instead of NA's.
freplace_na = function(dt) {
# taken from https://stackoverflow.com/questions/7235657/fastest-way-to-replace-nas-in-a-large-data-table
# by Matt dowle, creator of data.table
categoricals <- discard(dt, is.numeric)
dt <- keep(dt, is.numeric)
walk(1:ncol(dt), ~set(dt, which(is.na(dt[[.x]])), .x, 0))
return(cbind(categoricals, dt))
}
#'Outlier replacement on a numeric vector
#' @param x Numeric vector
#' @param replacement Value that will replace outliers. Default is mean(x, na.rm = TRUE)
#' @param limit Value that will identify outliers. Defualt is quantile(x, .99)
#' @param quant Quantile limit for ouliers. Default is .99
replace_outliers <- function(x, replacement = NULL, limit = NULL, quant = .99){
if(!is.numeric(x)){
warning('Vector is not numeric, no replacement was made.')
return(x)
}
if(is.null(replacement)){
replacement <- mean(x, na.rm = TRUE)
}
if(is.null(limit)){
limit <- quantile(x, quant)
}
x[x > limit] <- replacement
return(x)
}
#'Selects only the rows where the sum of numeric variables is not zero
select_nonzero_rows <- function(dt){
dt[rowSums(keep(dt, is.numeric)) != 0]
}
#' Unique sorted values of a vector
#'
#' @param x A vector
#' @return A vector containing the unique values of x, sorted.
suniq <- function(x){x %>% unique() %>% sort()}
#' Removes strings from column names
#' @param string Character string to be removed fromcolumn names.
rm_col_words <- function(dt, string){
colnames(dt) <- colnames(dt) %>% stringr::str_remove(string)
return(dt)
}
#' Filters data.frame to keep only data between two dates (inclusive)
#' @param dt data.frame with at least one Date column.
#' @param date_col Character string specifying the name of the column of dt.
#' @param start_date Date object.
#' @param end_date Date object.
filter_interval <- function(dt, datecol, start_date, end_date){
setDT(dt)
# matching exact dates is considerably faster that using between()
interval <- seq(start_date, end_date, by = 'days')
dt[(datecol) := ymd(..datecol)]
dt <- dt[..datecol %in% interval]
}
#'Convert all int64 columns to character
int64_to_char <- function(dt){
setDT(dt)
cols64 <- colnames(dt)[map_chr(dt, class) == 'integer64']
dt[,(cols64):= lapply(.SD, as.character), .SDcols = cols64]
}
#' Add a prefix to column names
#' @param prefix character string to add to dt's column names
#' @param exclude character vector specifying the names of the columns
#' that won't have the prefix added.
colname_prefix <- function(dt, prefix, sep = '_', exclude = NULL){
if(is.null(exclude)){
colnames(dt) <- paste(prefix, colnames(dt), sep = sep)
}else{
change_cols <- !grepl(exclude, colnames(dt))
colnames(dt)[change_cols] <- paste(prefix, colnames(dt)[change_cols], sep = sep)
}
return(dt)
}
# assigns each element of a list into the global environment, with clean names and a prefix
clean2env <- function(data_list, prefix = NULL, envir = .GlobalEnv){
if (!requireNamespace("janitor", quietly = TRUE)) {
stop("Package \"janitor\" needed for this function to work. Please install it, or use list2env() instead",
call. = FALSE)
}
data_list <- data_list %>%
purrr::walk(janitor::clean_names)
if(!is.null(prefix)){
data_list <- data_list %>% purrr::set_names(paste(prefix, names(data_list), sep = '_'))
}
print(names(data_list))
data_list %>% list2env(envir)
}
#' Removes punctuation, double spacing and numbers from strings
#' @param x Character vector
keep_alphabetic <- function(x){
x %>%
stringr::str_remove_all("[!#$%&'()*+,'-./?:;<=>?@^_`{|}~+]+") %>%
stringr::str_remove_all('"') %>%
stringr::str_remove_all('[[:digit:]]+') %>%
stringr::str_remove_all('\\s{2,}') %>%
stringr::str_trim()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.