lares: Lean Analytics and Robust Exploration Sidekick

Documented in ohse

####################################################################
#' One Hot Smart Encoding (Dummy Variables)
#'
#' This function lets the user automatically transform a dataframe with
#' categorical columns into numerical by one hot encoding technique.
#'
#' @family Data Wrangling
#' @family Feature Engineering
#' @family One Hot Encoding
#' @inheritParams cache_write
#' @param df Dataframe
#' @param redundant Boolean. Should we keep redundant columns? i.e. If the
#' column only has two different values, should we keep both new columns?
#' If set to \code{NULL}, only binary variables will dump redundant columns.
#' @param drop Boolean. Drop automatically some useless features?
#' @param ignore Vector or character. Which column should be ignored?
#' @param dates Boolean. Do you want the function to create more features
#' out of the date/time columns?
#' @param holidays Boolean. Include holidays as new columns?
#' @param country Character or vector. For which countries should the holidays
#' be included?
#' @param currency_pair Character. Which currency exchange do you
#' wish to get the history from? i.e, USD/COP, EUR/USD...
#' @param trim Integer. Trim names until the nth character
#' @param limit Integer. Limit one hot encoding to the n most frequent
#' values of each column. Set to \code{NA} to ignore argument.
#' @param variance Numeric. Drop columns with more than n variance.
#' Range: 0-1. For example: if a variable contains 91 unique different
#' values out of 100 observations, this column will be suppressed if
#' value is set to 0.9
#' @param other_label Character. With which text do you wish to replace
#' the filtered values with?
#' @param sep Character. Separator's string
#' @return data.frame on which all features are numerical by nature or
#' transformed with one hot encoding.
#' @examples
#' data(dft)
#' dft <- dft[, c(2, 3, 5, 9, 11)]
#'
#' ohse(dft, limit = 3) %>% head(3)
#' ohse(dft, limit = 3, redundant = NULL) %>% head(3)
#'
#' # Getting rid of columns with no (or too much) variance
#' dft$no_variance1 <- 0
#' dft$no_variance2 <- c("A", rep("B", nrow(dft) - 1))
#' dft$no_variance3 <- as.character(rnorm(nrow(dft)))
#' dft$no_variance4 <- c(rep("A", 20), round(rnorm(nrow(dft) - 20), 4))
#' ohse(dft, limit = 3) %>% head(3)
#' @export
ohse <- function(df,
                 redundant = FALSE,
                 drop = TRUE,
                 ignore = NULL,
                 dates = FALSE,
                 holidays = FALSE,
                 country = "Venezuela",
                 currency_pair = NA,
                 trim = 0,
                 limit = 10,
                 variance = 0.9,
                 other_label = "OTHER",
                 sep = "_",
                 quiet = FALSE,
                 ...) {
  # A bare vector becomes a one-column data.frame named "var"
  if (is.vector(df)) {
    df <- data.frame(var = df)
  } else {
    df <- data.frame(df)
  }

  # Remember the incoming column order so it can be re-applied at the end
  order <- colnames(df)

  # Names of the columns handled by each encoding branch (filled in the loop)
  converted <- converted_binary <- NULL

  # Leave some columns out of the logic: they are set aside untouched and
  # bound back to the result right before returning
  ignore <- unique(ignore)
  if (!is.null(ignore)) {
    if (!quiet) message(">>> Omitting transformations for ", vector2text(ignore))
    ignored <- select(df, any_of(ignore))
    df <- select(df, -any_of(ignore))
  } else {
    ignored <- NULL
  }

  # No variance columns
  no_variance <- zerovar(df)
  if (drop) df <- select(df, !any_of(no_variance))

  # Create features out of date/time variables
  if (dates == TRUE || holidays == TRUE || !is.na(currency_pair)) {
    times <- df_str(df, return = "names", quiet = TRUE)$time
    if (length(times) <= 1) {
      df_dates <- date_feats(df,
        drop = FALSE,
        append = FALSE,
        holidays = holidays,
        country = country,
        currency_pair = currency_pair,
        quiet = quiet
      )

      # Only join when date_feats() actually produced new columns
      if (ncol(df_dates) != ncol(df)) {
        df <- left_join(df, df_dates, by = as.character(times[1])) %>% distinct()
      }
    }
  }

  # Name and type (first class only) of every remaining variable
  types <- data.frame(
    name = colnames(df),
    type = unlist(lapply(lapply(df, class), `[[`, 1))
  )

  # The encoding step runs with na.action = "na.pass". Set it once here and
  # restore the caller's value on exit instead of leaking a global option.
  old_opts <- options("na.action" = "na.pass")
  on.exit(options(old_opts), add = TRUE)

  # Iterate all columns
  for (i in seq_along(df)) {
    vector_type <- types[i, "type"]
    vector_name <- as.character(types$name[i])
    vector_levels <- length(unique(df[, c(vector_name)]))
    vector_values <- df[toString(types[i, "name"])]

    # Non numeric or date/time variables
    if (!vector_type %in% c("integer", "numeric", "POSIXct", "POSIXt", "Date")) {
      # Char columns with too much variance (unique values vs total observations)
      if (vector_levels >= variance * nrow(df)) {
        no_variance <- c(no_variance, vector_name)
      }

      # Work on a character copy: NAs become their own "NAs" level and every
      # value is prefixed with `sep` so it can be glued to the column name
      vector_values <- vector_values %>%
        mutate_all(as.character) %>%
        replace(., is.na(.), "NAs")
      vector_values[, 1] <- paste0(sep, vector_values[, 1])

      # Columns with 2 possible values: recode in place as 0/1 and rename the
      # column after its second factor level
      if (vector_levels == 2 && !isTRUE(redundant)) {
        which <- as.character(levels(as.factor(df[, c(vector_name)]))[2])
        df[, c(vector_name)] <- as.integer(as.factor(df[, c(vector_name)])) - 1
        converted_binary <- rbind(converted_binary, vector_name)
        df <- rename_at(df, vars(vector_name), list(~ paste0(vector_name, "_", which)))
      }

      # ONE HOT ENCODING
      if (!colnames(vector_values) %in% c(converted_binary, no_variance)) {
        if (vector_levels >= 2 && !vector_name %in% converted_binary) {
          reduced <- categ_reducer(
            vector_values, !!as.name(vector_name),
            top = limit,
            other_label = paste0(sep, other_label)
          )
          dummy_matx <- data.frame(model.matrix(~ . - 1, reduced))
          colnames(dummy_matx) <- paste0(vector_name, sort(unique(reduced[, 1])))
          # Drop the last (redundant) dummy; drop = FALSE keeps a data.frame
          # with its column name even when a single dummy is left
          if (isFALSE(redundant)) {
            dummy_matx <- dummy_matx[, 1:(ncol(dummy_matx) - 1), drop = FALSE]
          }
          df <- cbind(df, dummy_matx)
          converted <- rbind(converted, vector_name)
        }
      }
    }
  }

  # Shorten up the long names of some variables
  if (trim > 0) colnames(df) <- substr(colnames(df), 1, trim)

  # Summary of transformations
  if (!quiet) {
    total_converted <- rbind(converted, converted_binary)
    if (length(total_converted) > 1) {
      message(paste(
        ">>> One Hot Encoding applied to", length(total_converted),
        "variables:", vector2text(total_converted)
      ))
    }
    if (length(no_variance) > 0 && drop) {
      # Ignored columns are never dropped, so never report them. The filter
      # must EXCLUDE them: keeping only ignored names would empty the vector
      # and make the returned columns depend on `quiet`.
      no_variance <- no_variance[!no_variance %in% ignore]
      if (length(no_variance) > 0) {
        message(paste0(
          ">>> Automatically dropped ", length(no_variance),
          " columns with 0% or >=", round(variance * 100),
          "% variance: ", vector2text(no_variance)
        ))
      }
    }
  }

  # Return only useful columns: the originals that were expanded into dummies
  # and the no/high variance ones go away. drop = FALSE keeps a data.frame
  # even if a single column survives.
  if (drop && length(c(converted, no_variance)) > 0) {
    df <- df[, c(!colnames(df) %in% c(converted, no_variance)), drop = FALSE]
  }

  # Bind ignored untouched columns and order
  order <- order[order %in% colnames(df)]
  df <- bind_cols(df, ignored) %>% select(any_of(order), everything())
  as_tibble(df)
}


####################################################################
#' One Hot Encoding for a Vector with Comma Separated Values
#'
#' This function lets the user do one hot encoding on a variable with
#' comma separated values
#'
#' @family Data Wrangling
#' @family One Hot Encoding
#' @param df Dataframe. May contain one or more columns with comma separated
#' values which will be separated as one hot encoding
#' @param ... Variables. Which variables to split into new columns?
#' @param sep Character. Which regular expression separates the elements?
#' @param noval Character. No value text
#' @param remove Boolean. Remove original variables?
#' @return data.frame on which all features are numerical by nature or
#' transformed with one hot encoding.
#' @examples
#' df <- data.frame(
#'   id = c(1:5),
#'   x = c("AA, D", "AA,B", "B,  D", "A,D,B", NA),
#'   z = c("AA+BB+AA", "AA", "BB,  AA", NA, "BB+AA")
#' )
#' ohe_commas(df, x, remove = TRUE)
#' ohe_commas(df, z, sep = "\\+")
#' ohe_commas(df, x, z)
#' @export
ohe_commas <- function(df, ..., sep = ",", noval = "NoVal", remove = FALSE) {
  # Capture the bare column names passed through `...` as character strings
  vars <- quos(...)
  var <- gsub("~", "", as.character(vars))

  df <- as.data.frame(df)

  for (col in var) {
    # Work on a local copy instead of a scratch column inside `df`, so a user
    # column that happens to be called "temp" is never overwritten or removed
    raw <- as.character(df[, col])
    # Handling missingness: empty strings and NAs share the `noval` label
    raw[raw == "" | is.na(raw)] <- noval

    # Split every row on `sep` and trim each piece. The distinct values are
    # collected from these per-row splits, so pieces coming from different
    # rows are never fused together, whatever `sep` is.
    tokens <- lapply(strsplit(raw, sep), trimws)
    vals <- unique(unlist(tokens))
    if (length(vals) == 0) next

    # One logical column per distinct value: TRUE when the row contains it
    mat <- vapply(
      vals,
      function(v) vapply(tokens, function(x) v %in% x, logical(1)),
      logical(length(tokens))
    )
    # vapply() returns a plain vector for single-row input; force the shape
    mat <- matrix(mat, nrow = length(tokens), ncol = length(vals))

    # Name the new columns after the column being processed. Using the whole
    # `var` vector here would recycle names across variables.
    colnames(mat) <- gsub('"', "", paste(col, vals, sep = "_"))
    df <- cbind(df, mat)
  }
  # drop = FALSE keeps a data.frame even when a single column is left
  if (remove) df <- df[, !colnames(df) %in% var, drop = FALSE]
  as_tibble(df, .name_repair = "minimal")
}


####################################################################
#' One Hot Encoding for Date/Time Variables (Dummy Variables)
#'
#' This function lets the user automatically create new columns out
#' of a dataframe or vector with date/time variables.
#'
#' @family Data Wrangling
#' @family Feature Engineering
#' @family One Hot Encoding
#' @inheritParams get_mp3
#' @param dates Vector or dataframe. Non-date/time columns will be
#' automatically ignored/extracted.
#' @param drop Boolean. Should the original date/time columns be
#' dropped from the results? Only valid when input is a dataframe.
#' @param only Character or vector. Which columns do you wish to process? If
#' none are explicitly defined, all will be processed
#' @param append Boolean. Append results to existing data.frame? If FALSE,
#' only calculated values will be returned.
#' @param holidays Boolean. Include holidays as new columns?
#' @param country Character or vector. For which countries should the holidays
#' be included?
#' @param currency_pair Character. Which currency exchange do you
#' wish to get the history from? i.e, USD/COP, EUR/USD...
#' @return data.frame with additional features calculated out of time or date vectors.
#' @examples
#' df <- data.frame(
#'   dates = sample(seq(Sys.Date() - 365, Sys.Date(), by = 1), 50),
#'   times = sample(seq(Sys.time() - 1e7, Sys.time(), by = 1), 50)
#' )
#'
#' # Input as a vector or dataframe
#' date_feats(df, drop = TRUE, quiet = TRUE) %>% head(10)
#'
#' # Holidays given a date range and country
#' \dontrun{
#' hol <- date_feats(
#'   seq(Sys.Date() - 365, Sys.Date(), by = 1),
#'   holidays = TRUE,
#'   country = "Venezuela"
#' )
#' head(hol[!is.na(hol$holiday_name), ])
#' }
#' @export
date_feats <- function(dates,
                       drop = FALSE,
                       only = NA,
                       append = FALSE,
                       holidays = FALSE,
                       country = "Venezuela",
                       currency_pair = NA,
                       quiet = FALSE) {
  results <- NULL
  original <- dates
  date_cols <- df_str(dates, return = "names", quiet = TRUE)$time
  # Input without dimensions (i.e. not a data.frame/matrix)
  is_vector <- is.null(dim(dates))

  # No date/time columns detected: hand the input back untouched
  if (length(date_cols) == 0) {
    return(dates)
  }

  # `only` may hold several column names; test it as a whole so a vector
  # (or NULL) does not break the scalar `if` condition
  if (!all(is.na(only))) {
    date_cols <- date_cols[date_cols %in% only]
  }

  # `iters` is NA when no date column is left after filtering by `only`
  iters <- ifelse(date_cols == "df", 1, length(date_cols))[1]
  if (is.na(iters)) {
    return(dates)
  }

  if (!quiet) message(paste(">>> Processing", iters, "date/time columns:", vector2text(date_cols)))

  # Non data.frame input is wrapped into a single "values_date" column
  if (!inherits(dates, "data.frame") && iters == 1) {
    dates <- data.frame(values_date = dates)
    date_cols <- "values_date"
  }

  # Every distinct calendar date present, needed to query holidays/currency
  alldates <- NULL
  if (holidays || !is.na(currency_pair)) {
    search_dates <- dates[, date_cols, drop = FALSE]
    # Keep only the date part of date-time values (text before the first space)
    search_dates[] <- lapply(search_dates, function(x) gsub(" .*", "", as.character(x)))
    alldates <- as.Date(unlist(search_dates, use.names = FALSE))
    alldates <- alldates[!is.na(alldates)]
  }

  holidays_dates <- NULL
  if (holidays && length(alldates) > 0) {
    years <- sort(unique(year(alldates)))
    holidays_dates <- holidays(countries = country, years)
    # holidays() returns NULL when it could not retrieve anything (e.g. no
    # internet connection); in that case holiday features are skipped
    if (!is.null(holidays_dates)) {
      colnames(holidays_dates)[1] <- "values_date"
      holidays_dates$values_date <- as.character(as.Date(holidays_dates$values_date))
      # Columns from the 4th onwards get the common "values_date_holiday_"
      # prefix, which is swapped for the column name further below
      cols <- paste0("values_date_holiday_", colnames(holidays_dates)[4:ncol(holidays_dates)])
      colnames(holidays_dates)[-(1:3)] <- cols
    }
  }

  for (col in date_cols) {
    result <- dates %>% select(!!sym(col))
    values <- result[[1]]
    # Character date used as join key for the holidays and currency tables
    result$values_date <- as.character(as.Date(values))

    result$values_date_year <- year(values)
    result$values_date_month <- month(values)
    result$values_date_day <- day(values)
    result$values_date_week <- week(values)
    result$values_date_weekday <- weekdays(values, abbreviate = TRUE)
    # "%u" is the ISO weekday number (1 = Monday), so 6 and 7 are the weekend
    result$values_date_weekend <- format(values, "%u") %in% c("6", "7")
    # Whole days elapsed since the start of the value's year
    result$values_date_year_day <- as.integer(difftime(
      values, floor_date(values, unit = "year"),
      units = "day"
    ))

    # Time-of-day features only make sense for date-time columns
    if (any(grepl("POSIX", class(values)))) {
      result$values_date_hour <- hour(values)
      result$values_date_minute <- minute(values)
      # Whole minutes elapsed since the start of the value's day
      result$values_date_minutes <- as.integer(difftime(
        values, floor_date(values, unit = "day"),
        units = "mins"
      ))
      result$values_date_second <- second(values)
    }

    if (!is.null(holidays_dates)) {
      result <- result %>%
        left_join(holidays_dates, by = "values_date", relationship = "many-to-many") %>%
        mutate(values_date_holiday_county = as.character(.data$values_date_holiday_county)) %>%
        mutate(across(starts_with("values_date_holiday_"), ~ replace(., is.na(.), FALSE)))
    }

    if (!is.na(currency_pair) && length(alldates) > 0) {
      currency <- get_currency(currency_pair, from = min(alldates), to = max(alldates))
      colnames(currency) <- c("values_date", paste0("values_date_", tolower(cleanText(currency_pair))))
      currency[[1]] <- as.character(currency[[1]])
      result <- result %>% left_join(currency, by = "values_date")
    }

    # Swap the generic "values_date_" prefix for the processed column's name
    prefix <- if (col == "values_date") "" else paste0(col, "_")
    colnames(result)[-1] <- gsub("^values_date_", prefix, colnames(result)[-1])

    # Accumulate this column's features and discard the temporary join key
    results <- results %>%
      bind_cols(result) %>%
      select(-contains("values_date"))

    if (is_vector) colnames(results)[1] <- "values"
  }

  # Append only the columns that the original input does not already have
  if (append) {
    results <- bind_cols(original, select(results, -any_of(colnames(original))))
  }

  if (drop) {
    results <- results[, !colnames(results) %in% date_cols, drop = FALSE]
  }

  as_tibble(results)
}


####################################################################
#' Holidays in your Country
#'
#' This function lets the user automatically scrape holiday dates from
#' any country and year within +- 5 years. Thanks to timeanddate.com!
#'
#' @family Data Wrangling
#' @family Feature Engineering
#' @family Scrapper
#' @family One Hot Encoding
#' @inheritParams get_mp3
#' @param years Character or vector. For which year(s) do you wish to import
#' holiday dates?
#' @param countries Character or vector. For which country(ies) should the
#' holidays be imported?
#' @param include_regions Boolean. Default FALSE. If TRUE, for countries with
#' internal subdivisions, it will provide details on which sub-state the found
#' holidays apply.
#' @return \code{data.frame} with holidays data for given \code{countries} and \code{years}.
#' @examples
#' \donttest{
#' holidays(countries = "Argentina")
#' year <- as.integer(format(Sys.Date(), format = "%Y"))
#' holidays(countries = c("Spain", "Venezuela"), years = year)
#' holidays(countries = "Germany", include_regions = TRUE)
#' }
#' @export
holidays <- function(countries = "Venezuela",
                     years = year(Sys.Date()),
                     quiet = FALSE,
                     include_regions = FALSE) {
  if (!haveInternet()) {
    message("No internet connetion...")
    return(invisible(NULL))
  }

  # Further improvement: let the user bring more than +-5 years
  this_year <- year(Sys.Date())
  allowed_years <- (this_year - 5L):(this_year + 5L)
  if (any(!years %in% allowed_years)) {
    warning(paste(
      "Only allowing \u00b1 5 years from today. Check:", v2t(years)
    ))
  }
  years <- years[years %in% allowed_years]

  # One request per (year, country) combination
  combs <- expand.grid(years, countries) %>%
    dplyr::rename(year = "Var1", country = "Var2")

  results <- NULL
  for (i in seq_len(nrow(combs))) {
    if (!quiet) {
      message(paste0(">>> Extracting ", combs$country[i], "'s holidays for ", combs$year[i]))
    }
    url <- paste0("https://www.timeanddate.com/holidays/", tolower(combs$country[i]), "/", combs$year[i])
    # call httr's GET however set header to only accept English named date parts (months)
    # otherwise if user uses own locale, for instance German, an error can occur parsing dates of holidays
    # compare with plain call without additional headers in different locale: holidays <- content(GET(url))
    ret <- try(content(GET(url, add_headers("Accept-Language" = "en"))))
    # Failed or non-HTML response: nothing to parse for this combination
    if (!inherits(ret, "xml_document")) next

    # Parse the holidays table: drop its second column and its first row,
    # and append the year to the date text so it can be parsed as a full date
    tbl <- ret %>%
      html_nodes(".table") %>%
      html_table(fill = TRUE) %>%
      data.frame(.) %>%
      filter(!is.na(.data$Date)) %>%
      select(-2L) %>%
      mutate(Date = paste(.data$Date, combs$year[i])) %>%
      .[-1L, ] %>%
      removenacols(all = TRUE) %>%
      removenarows(all = TRUE)
    colnames(tbl) <- if (include_regions && ncol(tbl) > 3) {
      c("Date", "Holiday", "Holiday.Type", "Holiday.Details")
    } else {
      c("Date", "Holiday", "Holiday.Type")
    }

    # the table might contain comment about interstate holidays like
    # '* Observed only in some communities of this state.
    # Hover your mouse over the region or click on the holiday for details.'
    # this will not parse as Date but create a warning, hence handling it here
    grep_comment <- grep("*", tbl$Date, fixed = TRUE)
    if (length(grep_comment) != 0L) {
      tbl <- tbl[-grep_comment, ]
    }
    tbl$Date <- tryCatch(
      {
        lubridate::dmy(tbl$Date)
      },
      error = function(cond) {
        stop(
          "Unaccounted problem(s) occurred parsing the date column.\n Check sample: ",
          v2t(head(tbl$Date, 3))
        )
      }
    )

    result <- data.frame(
      holiday = tbl$Date,
      holiday_name = tbl$Holiday,
      holiday_type = tbl$Holiday.Type
    )
    if (include_regions) result$holiday_details <- tbl$Holiday.Details
    # Logical flags derived from the holiday type text
    result <- result %>%
      mutate(
        national = grepl("National|Federal", tbl$Holiday.Type),
        observance = grepl("Observance", tbl$Holiday.Type),
        bank = grepl("Bank", tbl$Holiday.Type),
        nonwork = grepl("Non-working", tbl$Holiday.Type),
        season = grepl("Season", tbl$Holiday.Type),
        hother = !grepl("National|Federal|Observance|Season", tbl$Holiday.Type)
      )
    # The `country` column is only added when several countries were requested
    if (length(unique(countries)) > 1L) {
      result <- mutate(result, country = combs$country[i])
    }
    result$county <- combs$country[i]
    results <- bind_rows(results, result)
  }

  # Nothing could be retrieved (no valid years, or every request failed)
  if (is.null(results)) {
    return(NULL)
  }

  # Final clean-up runs once, after all combinations have been collected
  results %>%
    filter(!is.na(.data$holiday)) %>%
    cleanNames() %>%
    as_tibble()
}