Nothing
####################################################################
#' One Hot Smart Encoding (Dummy Variables)
#'
#' This function lets the user automatically transform a dataframe with
#' categorical columns into numerical by one hot encoding technic.
#'
#' @family Data Wrangling
#' @family Feature Engineering
#' @family One Hot Encoding
#' @inheritParams cache_write
#' @param df Dataframe
#' @param redundant Boolean. Should we keep redundant columns? i.e. If the
#' column only has two different values, should we keep both new columns?
#' Is set to \code{NULL}, only binary variables will dump redundant columns.
#' @param drop Boolean. Drop automatically some useless features?
#' @param ignore Vector or character. Which column should be ignored?
#' @param dates Boolean. Do you want the function to create more features
#' out of the date/time columns?
#' @param holidays Boolean. Include holidays as new columns?
#' @param country Character or vector. For which countries should the holidays
#' be included?
#' @param currency_pair Character. Which currency exchange do you
#' wish to get the history from? i.e, USD/COP, EUR/USD...
#' @param trim Integer. Trim names until the nth character
#' @param limit Integer. Limit one hot encoding to the n most frequent
#' values of each column. Set to \code{NA} to ignore argument.
#' @param variance Numeric. Drop columns with more than n variance.
#' Range: 0-1. For example: if a variable contains 91 unique different
#' values out of 100 observations, this column will be suppressed if
#' value is set to 0.9
#' @param other_label Character. With which text do you wish to replace
#' the filtered values with?
#' @param sep Character. Separator's string
#' @param quiet Boolean. Quiet all messages and summaries?
#' @return data.frame on which all features are numerical by nature or
#' transformed with one hot encoding.
#' @examples
#' data(dft)
#' dft <- dft[, c(2, 3, 5, 9, 11)]
#'
#' ohse(dft, limit = 3) %>% head(3)
#' ohse(dft, limit = 3, redundant = NULL) %>% head(3)
#'
#' # Getting rid of columns with no (or too much) variance
#' dft$no_variance1 <- 0
#' dft$no_variance2 <- c("A", rep("B", nrow(dft) - 1))
#' dft$no_variance3 <- as.character(rnorm(nrow(dft)))
#' dft$no_variance4 <- c(rep("A", 20), round(rnorm(nrow(dft) - 20), 4))
#' ohse(dft, limit = 3) %>% head(3)
#' @export
ohse <- function(df,
redundant = FALSE,
drop = TRUE,
ignore = NULL,
dates = FALSE,
holidays = FALSE,
country = "Venezuela",
currency_pair = NA,
trim = 0,
limit = 10,
variance = 0.9,
other_label = "OTHER",
sep = "_",
quiet = FALSE,
...) {
if (is.vector(df)) {
df <- data.frame(var = df)
} else {
df <- data.frame(df)
}
order <- colnames(df)
# Dummy variables that will be filled
no_need_to_convert <- converted <- converted_binary <- NULL
# Leave some columns out of the logic
ignore <- unique(ignore)
if (!is.null(ignore)) {
if (!quiet) message(">>> Omitting transformations for ", vector2text(ignore))
ignored <- select(df, any_of(ignore))
df <- select(df, -any_of(ignore))
} else {
ignored <- NULL
}
# No variance columns
no_variance <- zerovar(df)
if (drop) df <- select(df, !any_of(no_variance))
# Create features out of date/time variables
if (dates == TRUE || holidays == TRUE || !is.na(currency_pair)) {
times <- df_str(df, return = "names", quiet = TRUE)$time
if (length(times) <= 1) {
df_dates <- date_feats(df,
drop = FALSE,
append = FALSE,
holidays = holidays,
country = country,
currency_pair = currency_pair,
quiet = quiet
)
if (ncol(df_dates) != ncol(df)) {
df <- left_join(df, df_dates, by = as.character(times[1])) %>% distinct()
}
}
}
# Name and type of variables
types <- data.frame(
name = colnames(df),
type = unlist(lapply(lapply(df, class), `[[`, 1))
)
# Iterate all columns
for (i in seq_along(df)) {
vector_type <- types[i, "type"]
vector_name <- as.character(types$name[i])
vector_levels <- length(unique(df[, c(vector_name)]))
vector_values <- df[toString(types[i, "name"])]
# Non numeric or date/time variables
if (!vector_type %in% c("integer", "numeric", "POSIXct", "POSIXt", "Date")) {
# Char columns with too much variance (unique values vs total observations)
if (vector_levels >= variance * nrow(df)) {
no_variance <- c(no_variance, vector_name)
}
vector_values <- vector_values %>%
mutate_all(as.character) %>%
replace(., is.na(.), "NAs")
vector_values[, 1] <- paste0(sep, vector_values[, 1])
# Columns with 2 possible values
if (vector_levels == 2 && !isTRUE(redundant)) {
which <- as.character(levels(as.factor(df[, c(vector_name)]))[2])
df[, c(vector_name)] <- as.integer(as.factor(df[, c(vector_name)])) - 1
converted_binary <- rbind(converted_binary, vector_name)
df <- rename_at(df, vars(vector_name), list(~ paste0(vector_name, "_", which)))
}
# ONE HOT ENCODING
if (!colnames(vector_values) %in% c(converted_binary, no_variance)) {
if (vector_levels >= 2 && !vector_name %in% converted_binary) {
options("na.action" = "na.pass")
reduced <- categ_reducer(
vector_values, !!as.name(vector_name),
top = limit,
other_label = paste0(sep, other_label)
)
dummy_matx <- data.frame(model.matrix(~ . - 1, reduced))
colnames(dummy_matx) <- paste0(vector_name, sort(unique(reduced[, 1])))
if (isFALSE(redundant)) dummy_matx <- dummy_matx[, 1:(ncol(dummy_matx) - 1)]
df <- cbind(df, dummy_matx)
converted <- rbind(converted, vector_name)
}
}
}
no_need_to_convert <- rbind(no_need_to_convert, vector_name)
}
# Shorten up the long names of some variables
if (trim > 0) colnames(df) <- substr(colnames(df), 1, trim)
# Summary of transformations
if (!quiet) {
total_converted <- rbind(converted, converted_binary)
if (length(total_converted) > 1) {
message(paste(
">>> One Hot Encoding applied to", length(total_converted),
"variables:", vector2text(total_converted)
))
}
if (length(no_variance) > 0 && drop) {
no_variance <- no_variance[no_variance %in% ignore]
if (length(no_variance) > 0) {
message(paste0(
">>> Automatically dropped ", length(no_variance),
" columns with 0% or >=", round(variance * 100),
"% variance: ", vector2text(no_variance)
))
}
}
}
# Return only useful columns
if (drop && length(c(converted, no_variance)) > 0) {
df <- df[, c(!colnames(df) %in% c(converted, no_variance))]
}
# Bind ignored untouched columns and order
order <- order[order %in% colnames(df)]
df <- bind_cols(df, ignored) %>% select(any_of(order), everything())
return(as_tibble(df))
}
####################################################################
#' One Hot Encoding for a Vector with Comma Separated Values
#'
#' This function lets the user do one hot encoding on a variable with
#' comma separated values
#'
#' @family Data Wrangling
#' @family One Hot Encoding
#' @param df Dataframe. May contain one or more columns with comma separated
#' values which will be separated as one hot encoding
#' @param ... Variables. Which variables to split into new columns?
#' @param sep Character. Which regular expression separates the elements?
#' @param noval Character. No value text
#' @param remove Boolean. Remove original variables?
#' @return data.frame on which all features are numerical by nature or
#' transformed with one hot encoding.
#' @examples
#' df <- data.frame(
#' id = c(1:5),
#' x = c("AA, D", "AA,B", "B, D", "A,D,B", NA),
#' z = c("AA+BB+AA", "AA", "BB, AA", NA, "BB+AA")
#' )
#' ohe_commas(df, x, remove = TRUE)
#' ohe_commas(df, z, sep = "\\+")
#' ohe_commas(df, x, z)
#' @export
ohe_commas <- function(df, ..., sep = ",", noval = "NoVal", remove = FALSE) {
vars <- quos(...)
var <- gsub("~", "", as.character(vars))
df <- as.data.frame(df)
for (i in var) {
df$temp <- as.character(df[, i])
# Handling missingness
df$temp[as.character(df$temp) == "" | is.na(df$temp)] <- noval
vals <- v2t(as.character(df$temp), quotes = FALSE)
vals <- unique(trimws(unlist(strsplit(vals, sep))))
# aux <- sprintf("--%s--", vals)
l <- strsplit(df$temp, sep)
mat <- NULL
for (i in seq_along(vals)) {
which <- unlist(lapply(l, function(x) any(trimws(x) %in% vals[i])))
mat <- cbind(mat, which)
}
colnames(mat) <- gsub('"', "", paste(var, vals, sep = "_"))
df$temp <- NULL
df <- cbind(df, mat)
}
if (remove) df <- df[, !colnames(df) %in% var]
return(as_tibble(df, .name_repair = "minimal"))
}
####################################################################
#' One Hot Encoding for Date/Time Variables (Dummy Variables)
#'
#' This function lets the user automatically create new columns out
#' of a dataframe or vector with date/time variables.
#'
#' @family Data Wrangling
#' @family Feature Engineering
#' @family One Hot Encoding
#' @param dates Vector or dataframe. Non-date/time columns will be
#' automatically ignored/extracted.
#' @param drop Boolean. Should the original date/time columns be
#' kept in the results? Only valid when input is a dataframe.
#' @param only Character or vector. Which columns do you wish to process? If
#' non are explicitly defined, all will be processed
#' @param append Boolean. Append results to existing data.frame? If FALSE,
#' only calculated values will be returned.
#' @param holidays Boolean. Include holidays as new columns?
#' @param country Character or vector. For which countries should the holidays
#' be included?
#' @param currency_pair Character. Which currency exchange do you
#' wish to get the history from? i.e, USD/COP, EUR/USD...
#' @param quiet Boolean. Quiet all messages?
#' @return data.frame with additional features calculated out of time or date vectors.
#' @examples
#' df <- data.frame(
#' dates = sample(seq(Sys.Date() - 365, Sys.Date(), by = 1), 50),
#' times = sample(seq(Sys.time() - 1e7, Sys.time(), by = 1), 50)
#' )
#'
#' # Input as a vector or dataframe
#' date_feats(df, drop = TRUE) %>% head(10)
#'
#' # Holidays given a date range and country
#' \dontrun{
#' hol <- date_feats(
#' seq(Sys.Date() - 365, Sys.Date(), by = 1),
#' holidays = TRUE,
#' country = "Venezuela"
#' )
#' head(hol[!is.na(hol$holiday_name), ])
#' }
#' @export
date_feats <- function(dates,
drop = FALSE,
only = NA,
append = FALSE,
holidays = FALSE,
country = "Venezuela",
currency_pair = NA,
quiet = FALSE) {
results <- NULL
original <- dates
date_cols <- df_str(dates, return = "names", quiet = TRUE)$time
vector <- is.null(dim(dates))
if (length(date_cols) == 0) {
return(dates)
}
if (!is.na(only)) {
date_cols <- date_cols[date_cols %in% only]
}
iters <- ifelse(date_cols == "df", 1, length(date_cols))[1]
if (!is.na(iters)) {
if (!quiet) {
message(paste(">>> Processing", iters, "date/time columns:", vector2text(date_cols)))
}
} else {
return(dates)
}
if (!"data.frame" %in% class(dates) && iters == 1) {
dates <- data.frame(values_date = dates)
date_cols <- "values_date"
}
if (holidays || !is.na(currency_pair)) {
search_dates <- dates[, c(colnames(dates) %in% date_cols)]
search_dates[] <- unlist(lapply(search_dates, function(x) gsub(" .*", "", as.character(x))))
alldates <- as.Date(unlist(search_dates, use.names = FALSE))
alldates <- alldates[!is.na(alldates)]
}
if (holidays) {
years <- sort(unique(year(alldates)))
holidays_dates <- holidays(countries = country, years)
colnames(holidays_dates)[1] <- "values_date"
holidays_dates$values_date <- as.character(as.Date(holidays_dates$values_date))
cols <- paste0("values_date_holiday_", colnames(holidays_dates)[4:ncol(holidays_dates)])
colnames(holidays_dates)[-(1:3)] <- cols
}
# Features creator
for (col in 1:iters) {
col_name <- date_cols[col]
result <- dates %>% select(!!as.name(col_name))
values <- result[, 1]
result$values_date <- as.character(as.Date(values))
result$values_date_year <- year(values)
result$values_date_month <- month(values)
result$values_date_day <- day(values)
result$values_date_week <- week(values)
result$values_date_weekday <- weekdays(values, abbreviate = TRUE)
result$values_date_weekend <- format(values, "%u") %in% c(6, 7)
result$values_date_year_day <- as.integer(difftime(
values, floor_date(values, unit = "year"),
units = "day"
))
if (any(grepl(class(values), "POSIX"))) {
result$values_date_hour <- hour(values)
result$values_date_minute <- minute(values)
result$values_date_minutes <- as.integer(difftime(
values, floor_date(values, unit = "day"),
units = "mins"
))
result$values_date_second <- second(values)
}
# Holidays data
if (holidays) {
result <- result %>%
left_join(holidays_dates, by = "values_date", relationship = "many-to-many") %>%
mutate(values_date_holiday_county = as.character(.data$values_date_holiday_county)) %>%
mutate_at(vars(cols), list(~ replace(., which(is.na(.)), FALSE)))
}
# Currencies data
if (!is.na(currency_pair)) {
currency <- get_currency(currency_pair, from = min(alldates), to = max(alldates))
colnames(currency) <- c("values_date", paste0("values_date_", tolower(cleanText(currency_pair))))
currency[, 1] <- as.character(currency[, 1])
result <- result %>% left_join(currency, by = "values_date")
}
col_name <- ifelse(col_name == "values_date", "", paste0(col_name, "_"))
colnames(result)[-1] <- gsub("values_date_", col_name, colnames(result)[-1])
results <- results %>%
bind_cols(result) %>%
select(-contains("values_date"))
if (vector) colnames(results)[1] <- "values"
}
if (append) results <- bind_cols(original, select(results, -any_of(colnames(original))))
if (drop) results <- results[, !colnames(results) %in% date_cols]
return(as_tibble(results))
}
####################################################################
#' Holidays in your Country
#'
#' This function lets the user automatically scrap holiday dates from
#' any country and year within +- 5 years. Thanks to timeanddate.com!
#'
#' @family Data Wrangling
#' @family Feature Engineering
#' @family Scrapper
#' @family One Hot Encoding
#' @param years Character or vector. For which year(s) do you wish to import
#' holiday dates?
#' @param countries Character or vector. For which country(ies) should the
#' holidays be imported?
#' @param quiet Boolean. Default FALSE which disables verbosity of the function.
#' @param include_regions Boolean. Default FALSE. If TRUE, for countries with
#' internal subdivisions, it will provide details on which sub-state the found
#' holidays apply.
#' @return \code{data.frame} with holidays data for given \code{countries} and \code{years}.
#' @examples
#' \donttest{
#' holidays(countries = "Argentina")
#' holidays(countries = c("Argentina", "Venezuela"), years = c(2019, 2020))
#' holidays(countries = "Germany", years = 2021:2023, include_regions = TRUE)
#' }
#' @export
holidays <- function(countries = "Venezuela",
years = year(Sys.Date()),
quiet = FALSE,
include_regions = FALSE) {
# Further improvement: let the user bring more than +-5 years
results <- NULL
if (any(!years %in% (year(Sys.Date()) - 5):(year(Sys.Date()) + 5))) {
warning(paste(
"Only allowing \u00b1 5 years from today. Check:", v2t(years)
))
}
year <- year(Sys.Date())
years <- years[years %in% ((year - 5L):(year + 5L))]
combs <- expand.grid(years, countries) %>%
dplyr::rename(year = "Var1", country = "Var2")
for (i in seq_len(nrow(combs))) {
if (!quiet) {
message(paste0(">>> Extracting ", combs$country[i], "'s holidays for ", combs$year[i]))
}
url <- paste0("https://www.timeanddate.com/holidays/", tolower(combs$country[i]), "/", combs$year[i])
# call httr's GET however set header to only accept English named date parts (months)
# otherwise if user uses own locale, for instance German, an error can occur parsing dates of holidays
# compare with plain call without additional headers in different locale: holidays <- content(GET(url))
ret <- content(GET(url, add_headers("Accept-Language" = "en")))
holidays <- ret %>%
html_nodes(".table") %>%
html_table(fill = TRUE) %>%
data.frame(.) %>%
filter(!is.na(.data$Date)) %>%
select(-2L) %>%
mutate(Date = paste(.data$Date, combs$year[i])) %>%
.[-1L, ] %>%
removenacols(all = TRUE) %>%
removenarows(all = TRUE)
colnames(holidays) <- if (include_regions & ncol(holidays) > 3) {
c("Date", "Holiday", "Holiday.Type", "Holiday.Details")
} else {
c("Date", "Holiday", "Holiday.Type")
}
# the table might contain comment about interstate holidays like
# '* Observed only in some communities of this state.
# Hover your mouse over the region or click on the holiday for details.'
# this will not parse as Date but create a warning, hence handling it here
grep_comment <- grep("*", holidays$Date, fixed = TRUE)
if (length(grep_comment) != 0L) {
holidays <- holidays[-grep_comment, ]
}
holidays$Date <- tryCatch(
{
lubridate::dmy(holidays$Date)
},
error = function(cond) {
stop(
"Unaccounted problem(s) occurred parsing the date column.\n Check sample: ",
v2t(head(holidays$Date, 3))
)
}
)
result <- data.frame(
holiday = holidays$Date,
holiday_name = holidays$Holiday,
holiday_type = holidays$Holiday.Type
)
if (include_regions) result$holiday_details <- holidays$Holiday.Details
result <- result %>%
mutate(
national = grepl("National|Federal", holidays$Holiday.Type),
observance = grepl("Observance", holidays$Holiday.Type),
bank = grepl("Bank", holidays$Holiday.Type),
nonwork = grepl("Non-working", holidays$Holiday.Type),
season = grepl("Season", holidays$Holiday.Type),
hother = !grepl("National|Federal|Observance|Season", holidays$Holiday.Type)
) %>%
{
if (length(unique(countries)) > 1L) {
mutate(., country = combs$country[i])
} else {
.
}
}
result$county <- combs$country[i]
results <- bind_rows(results, result)
}
results <- results %>%
filter(!is.na(.data$holiday)) %>%
cleanNames() %>%
as_tibble()
return(results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.