rtools: Package providing R tools and wrappers

Documented in rt_get_date_fields

#' returns whether or not a value is NULL, NA, or NAN
#'
#' @param date_vector a vector of dates
#' @param reference_date the reference/base date used for fields such as (is_current_xxx). If NULL (default), then today's date is used.
#'
#' @examples
#'
#' date_vector <- lubridate::as_date('2018-01-01') + seq(0, 400)
#' rt_get_date_fields(date_vector)
#'
#' @importFrom magrittr "%>%"
#' @importFrom dplyr mutate select
#' @importFrom lubridate year quarter month week day yday as_date
#' @importFrom purrr map_int map
#' @importFrom stringr str_pad
#' @importFrom timeDate holiday USNewYearsDay USMLKingsBirthday USWashingtonsBirthday USMemorialDay USIndependenceDay USLaborDay USColumbusDay USElectionDay USVeteransDay USThanksgivingDay USChristmasDay
#' @export
rt_get_date_fields <- function(date_vector, reference_date=NULL) {

    date_vector <- as_date(date_vector)
    if(is.null(reference_date)) {

        reference_date <- Sys.Date()

    } else {

        reference_date <- lubridate::as_date(reference_date)
    }

    min_year <- min(year(date_vector), na.rm = TRUE)
    max_year <- max(year(date_vector), na.rm = TRUE)

    day_lookup <- c(1, 2, 3, 4, 5, 6, 7)
    month_names <- c('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September',
                      'October', 'November', 'December')
    names_days_of_the_week <- c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
    names(day_lookup) <- names_days_of_the_week

    #library(timeDate)
    us_federal_holiday_names <- c('USNewYearsDay', 'USMLKingsBirthday', 'USWashingtonsBirthday', 'USMemorialDay',
                                  'USIndependenceDay', 'USLaborDay', 'USColumbusDay', 'USElectionDay',
                                  'USVeteransDay', 'USThanksgivingDay', 'USChristmasDay')
    us_federal_holiday_dates <- timeDate::holiday(min_year:max_year, us_federal_holiday_names)

    return (
        data.frame(
        year = year(date_vector),
        quarter = quarter(date_vector),
        month = month(date_vector),
        week_of_year = week(date_vector),
        day_of_month = day(date_vector),
        month_name = factor(months(date_vector), levels=month_names, ordered=TRUE),
        day_name = factor(weekdays(date_vector), levels=names_days_of_the_week, ordered=TRUE),
        day_of_year = yday(date_vector)) %>%
            mutate(week_of_month = ceiling(day_of_month / 7),
                   is_current_year = year == year(reference_date),
                   is_current_quarter = ifelse(is.na(date_vector),
                                               NA,
                                               paste(year, quarter) == paste(year(reference_date),
                                                                             quarter(reference_date))),
                   is_current_month = ifelse(is.na(date_vector),
                                             NA,
                                             paste(year, month) == paste(year(reference_date),
                                                                         month(reference_date))),
                   is_month_quarter_start = month %% 3 == 1,
                   is_month_quarter_end = month %% 3 == 0,
                   week_day_number = map_int(day_name, ~ as.integer(day_lookup[as.character(.)])),
                   is_weekend = week_day_number - 6 >= 0,
                   is_holiday = ifelse(is.na(date_vector),
                                       NA,
                                       date_vector %in% as_date(us_federal_holiday_dates)),
                   is_first_of_month=day(date_vector) == 1,
                   is_end_of_month=day((date_vector + 1)) == 1,
                   is_end_of_quarter = is_end_of_month & is_month_quarter_end,
                   is_year_start = ifelse(is.na(month(date_vector)) | is.na(day(date_vector)),
                                          NA,
                                          month(date_vector) == 1 & day(date_vector) == 1),
                   is_year_end = ifelse(is.na(month(date_vector)) | is.na(day(date_vector)),
                                          NA,
                                          month(date_vector) == 12 & day(date_vector) == 31),
                   cohort_week = factor(ifelse(is.na(year) | is.na(week_of_year),
                                               NA,
                                               paste0(year, '-W', str_pad(week_of_year,
                                                                          width= 2,
                                                                          side='left',
                                                                          pad='0'))),
                                        ordered=TRUE),
                   cohort_month = factor(paste0(year, '-', substr(month_name, start = 1, stop = 3)),
                                         levels=unlist(map(min_year:max_year,
                                                           ~ paste0(., '-', substr(month_names, 1, 3)))),
                                         ordered=TRUE),
                   cohort_quarter = factor(ifelse(is.na(year) | is.na(quarter),
                                                  NA,
                                                  paste0(year, '-Q', quarter)),
                                           ordered=TRUE)
                   ) %>%
            select(year, quarter, month, day_of_month, week_day_number, day_of_year, week_of_year,
                   week_of_month, month_name, day_name, is_current_year, is_current_quarter, is_current_month,
                   is_weekend, is_holiday, is_month_quarter_start, is_month_quarter_end, is_first_of_month,
                   is_end_of_month, is_end_of_quarter, is_year_start, is_year_end, cohort_week, cohort_month,
                   cohort_quarter)
    )
}

#' returns a dataframe's column as a vector
#'
#' @param df a data.frame
#' @param column the column to return as a vector
#' @param return_unique if TRUE, return unique values
#'
#' @export
rt_get_vector <- function(df, column, return_unique=FALSE) {

    if(return_unique) {

        return (unique(df[[column]]))

    } else {

        return (df[[column]])
    }
}

#' returns the vector (`vec`) without the specified value (`val`). If `val` doesn't exist in `vec`, `vec` is
#' returned unchanged.
#'
#' @param vec the vector
#' @param val the value to remove
#'
#' @export
rt_remove_val <- function(vec, val) {

    return (vec[!vec %in% val])
}

#' There are times when you want to filter/match any value, *including* NA
#' So if
#'
#' @param x single value or vector to compare with y
#' @param y single value or vector to compare with x
#'
#' @export
rt_equal_include_na <- function(x, y) {

    rt_stopif(is.null(x) | is.null(y))
    # if either x or y is NA, then they are equal only if both are NA
    # otherwise, just use ==
    #return (!xor(is.na(x), is.na(y)) | x == y)
    return (ifelse(is.na(x) | is.na(y),
                   is.na(x) & is.na(y),
                   x == y))
}

#' returns numeric value of difftime (date_last - date_first)
#'
#' @param date_last the date to subtract from
#' @param date_first the date to subtract
#' @param units same values of difftime, defaults to `days`
#'
#' @export
rt_difftime_numeric <- function(date_last, date_first, units='days') {
    rt_stopif(length(units) > 1)
    rt_stopif(units == 'auto')  # since we are returning a numeric, we wouldn't know the units if using auto
    return (as.numeric(difftime(date_last, date_first, units=units)))
}

#' returns an ordered factor of the date vector
#'
#' date_floor: `week` gives `YYYY-MM-DD` with the `Monday` as the date when week_start is `1`
#' date_floor: `month` gives `YYYY-MM`
#' date_floor: `quarter` gives `YYYY-QX`
#' date_floor: `year` gives `YYYY`
#'
#' @param date_vector the date vector
#' @param date_floor `week`, `month`, `quarter`, `fiscal quarter`, `year`, 'fiscal year'
#' @param week_start same values of floor_date, defaults to `1` which starts the week on Monday
#' @param fiscal_start the month of the start of the fiscal year
#'
#' @importFrom magrittr "%>%"
#' @importFrom lubridate floor_date
#' @importFrom stringr str_replace, str_split
#' @importFrom purrr map_chr
#'
#' @export
rt_floor_date_factor <- function(date_vector, date_floor='week', week_start=1, fiscal_start=1) {

    if(date_floor == 'fiscal quarter' || date_floor == 'fiscal year') {
        date_vector <- lubridate::quarter(date_vector, fiscal_start = fiscal_start, type = 'year.quarter')

        if (date_floor == 'fiscal quarter') {
            date_parts <- str_split(date_vector, '\\.')
            date_vector <- map_chr(date_parts, ~ {
                paste0('FY', .[1], '-Q', .[2])
            })

        } else {
            date_vector <- map_chr(date_vector, ~ paste0('FY', floor(.)))
        }
    } else {
        date_vector <- as.character(floor_date(x=date_vector, unit=date_floor, week_start=week_start))

        if(date_floor == 'month') {

            date_vector <- substr(date_vector, 1, 7)

        } else if (date_floor == 'quarter') {

            date_vector <- date_vector %>%
                str_replace('-01-01', '-Q1') %>%
                str_replace('-04-01', '-Q2') %>%
                str_replace('-07-01', '-Q3') %>%
                str_replace('-10-01', '-Q4')

        } else if(date_floor == 'year') {

            date_vector <- substr(date_vector, 1, 4)
        }
    }

    date_vector <- factor(date_vector, levels = sort(unique(date_vector)), ordered = TRUE)

    return (date_vector)
}

#' Returns the ceiling of the **absolute value** of `y`, rounded to the nearest_x.
#'
#' @param y the value
#' @param nearest_x the decimal value to round the ceiling to
#'
#' @export
rt_ceiling_nearest_x <- function(y, nearest_x) {

    y_trans <- ceiling(abs(y)/nearest_x)*nearest_x
    if(y < 0) {
        y_trans <- y_trans * -1
    }
    # round to nearest 10 because computers can't handle decimals
    return (round(y_trans, 10))
}

#' Checks if two numbers are equivalent out to `n` decimals where `n` is the `num_decimals` parameter
#'
#' @param n1 number or vector of numbers
#' @param n2 number or vector of numbers
#' @param num_decimals round each value `n` decimals where `n` is `num_decimals`
#'
#' @export
rt_are_numerics_equal <- function(n1, n2, num_decimals) {

    return(all(round(n1, num_decimals) == round(n2, num_decimals)))
}

#' Like `stopifnot`, but stop `stopifnot` stops if the expression is not true, and `rt_stopif` stops if the
#' expression is true. Avoids the unintuitive double-negative e.g. (`stopifnot(!espression)`) which becomes
#' `rt_stopif(espression)`.
#'
#' @param exprs the expression
#'
#' @export
rt_stopif <- function(exprs) {

    stopifnot(!exprs)
}

#' Hex colors vector
#'
#' @param color_names filter by the names of the colors
#' @param sets filter by the set index
#' @param return_named_vector returns the colors as a named vector
#'
#' @importFrom magrittr "%>%"
#' @importFrom dplyr filter
#' @importFrom tibble tribble

#' @export
rt_colors <- function(color_names=NULL, sets=NULL, return_named_vector=FALSE) {
    color_df <- tribble(
        ~set, ~name, ~hex,
        1, "pastel_blue", '#7AA9CF',
        1, "tuplip_tree", '#EBB13E',
        1, "custom_green" , '#41B3A3',
        1, "crail", '#A65B50',
        1, "dove_gray", '#7A7A7A',
        1, "custom_purple", '#C38D9E',
        1, "cadmium_orange", '#F28E2B',

        2, "mariner", '#4E79A7',
        2, "sandstone", '#F4CC70',
        2, "avocado", '#258039',
        2, "tomato", '#CF3721',
        2, "mist", '#90AFC5',
        2, "lavender", '#C396E8',
        2, "tree_poppy", '#FD9126',

        3, "cerulean", '#1EB1ED',
        3, "yellow_pepper", '#F5BE41',
        3, "pigment_green", '#1AAF54',
        3, "poppy", '#FF420E',
        3, "loblolly_gray", '#B4B7B9',
        3, "lavendar2", '#6C648B',
        3, "flamingo", '#FC641F',

        4, "forest", '#1E434C',
        4, "gold", '#C99E10',
        4, "sunflower", '#3F681C',
        4, "crimson", '#8D230F',
        4, "granite", '#B7B8B6',
        4, "vivid_violet", '#932791',
        4, "petal", '#F98866',

        5, "black_shadow", '#2A3132',
        5, "sky", '#375E97',
        5, "medium_sea_green", '#37B57F',
        5, "red_clay", '#A43820',

        #6, "mint", '#3EB480',
        6, "mandy", '#DF585C'
        # "coffee", '#B38867',
        # "turquoise", '#5BC8AC',
        #
        #
        # "custom_red", '#E27D60',
        #
        # "sunset", '#FB6542',
        # "stem", '#80BD9E',
        # "spring_green", '#89DA59',
        # "automn_foliage", '#763626',
        # "stone", '#336B87',
        #
        # "pink_tulip", '#F18D9E',
        # "blue_sky", '#4CB5F5',
        # "aqua_blue", '#31A9B8',
        #
        # "red_valencia", '#E15759',
        # "summer_sky", '#40C6EE',
        # "yale_blue", '#11499C',
        # "blue_de_france", '#4286E8',
        #
        # "shamrock", '#5FECA6',
        # "astronaut", '#283676',
        # "well_read", '#BB3A34',
        # "blue_chill", '#159192',
        #
        # "custom_blue", '#085DCB',
        # "custom_orange", '#E8A87C'
    )

    rt_stopif(any(duplicated(color_df$name)))
    rt_stopif(any(duplicated(color_df$hex)))

    # blue, yellow, green, red, gray, purple, orange,
    # color_df <- color_df[1:7, ]

    if(!is.null(color_names)) {
        color_df <- color_df %>% filter(name %in% color_names)
        color_df <- color_df[match(color_names, color_df$name),]
    }

    if(!is.null(sets)) {
        color_df <- color_df %>% filter(set %in% sets)
    }

    colors <- color_df$hex

    if(return_named_vector) {
        names(colors) <- color_df$name
    }
    return (colors)
}

#' Hex colors vector representing "good" (green) and "bad" (red)
#'
#' @param good_first if `TRUE` then index 1 is the hex color corresponding to `good` and
#'  index 2 corresponds to bad, otherwise the reverse.
#'
#' @export
rt_colors_good_bad <- function(good_first=TRUE) {
    good <- 'medium_sea_green'
    bad <- 'mandy'
    if(good_first) {
        custom_color_names <- c(good, bad)
    } else {
        custom_color_names <- c(bad, good)
    }
    return (rt_colors(color_names=custom_color_names))
}

#' Plots the colors from `rt_colors()`
#'
#' @param color_names filter by the names of the colors
#' @param sets filter by the set index
#'
#' @importFrom magrittr "%>%"
#' @importFrom dplyr mutate
#' @importFrom ggplot2 ggplot aes geom_col scale_fill_manual theme_light theme element_blank coord_flip

#' @export
rt_plot_colors <- function(color_names=NULL, sets=NULL) {
    custom_colors <- rt_colors(color_names=color_names, sets=sets, return_named_vector=TRUE)

    factor_names <- names(custom_colors)
    factor_names <- rev(names(custom_colors))
    colors_df <- data.frame(name=names(custom_colors),
                   hex=custom_colors,
                   value=1,
                   stringsAsFactors = FALSE) %>%
            mutate(name = factor(name, levels=factor_names))

    colors_df %>%
        ggplot(aes(x=name, y=value, fill=name)) +
        geom_col() +
        scale_fill_manual(values=rev(custom_colors)) +
        theme_light() +
        theme(legend.position = 'none',
              axis.title.y=element_blank(),
              axis.title.x=element_blank(),
              axis.text.x=element_blank(),
              axis.ticks.x=element_blank()) +
        coord_flip()
}

#' Returns the colors based on the alphabetical order of the values. E.g. if you plot a graph alphabetically
#' and then plot it based on it's factor order the colors should be retained for each category.
#'
#' @param x a character or factor vector
#'
#' @export
rt_get_colors_from_values <- function(x) {

    custom_colors <- c(rt_colors(), rt_colors(), rt_colors())
    sorted_values <- sort(as.character(unique(x)))

    stopifnot(length(custom_colors) >= length(sorted_values))

    if(is.character(x) || is.logical(x)) {

        return (custom_colors[1:length(sorted_values)])

    } else if (is.factor(x)) {

        found_values <- as.character(sort(unique(x)))
        color_index <- match(found_values, sorted_values)
        return (custom_colors[color_index])
    } else {

        stopifnot(FALSE)
    }
}

rt_transform_multi_value_df <- function(dataset, variable, multi_value_delimiter) {
    # the number of columns needed is the max number of values found (i.e. out of each row in the
    # variable, the max number of multi_value_delimiter instances (+1) found)
    # so if the most multi-values found corresponds to `a;b;c;d;e` then `;` was found 4 times and 5
    # columns are needed
    column_names <- colnames(dataset)
    number_of_columns_needed <- max(str_count(string=dataset[[variable]],
                                              pattern = multi_value_delimiter),
                                    na.rm = TRUE) + 1
    new_colname_names <- paste0("temp____", 1:number_of_columns_needed)
    dataset <- suppressWarnings(dataset %>%
                                    separate(col=!!sym(variable),
                                             into=new_colname_names,
                                             sep = multi_value_delimiter))
    # if NA is a value corresponding to `temp____1` then it was NA to start with;
    # if NA is a value corresponding to `temp____x` then there wasn't multiple values
    # i.e. we can get rid of all NAs associated with `temp____x`
    # first `gather` accordingly, then filter

    columns_to_remove <- column_names %>% rt_remove_val(c(new_colname_names, variable))

    if(length(columns_to_remove) == 0) {

        dataset <- dataset %>% gather(key, value)

    } else {

        dataset <- dataset %>% gather(key, value, -all_of(columns_to_remove))
    }

    dataset <- dataset %>%
        filter(key == 'temp____1' | !is.na(value)) %>%
        select(-key)

    dataset[, variable] <- dataset$value
    dataset <- dataset %>% rt_select_all_of(column_names)

    return (dataset)
}

#' converts multi-argument values into flattened  vector
#' @param ... values and/or vectors
#'
#' @export
rt_params_to_vector <- function(...){
    x <- list(...)
    return (unlist(x))
}

#' wrapper for change in DPLYR where we cannot send vector of strings as variable
#' @param .data the data.frame to select from
#' @param ... string values and/or string vectors
#'
#' @importFrom dplyr select
#' @importFrom tidyselect all_of
#'
#' @export
rt_select_all_of <- function(.data, ...) {

    columns <- rt_params_to_vector(...)
    return (select(.data, all_of(columns)))
}

#' wrapper for group_by for string variables
#' @param .data the data.frame to select from
#' @param ... string values and/or string vectors
#'
#' @importFrom dplyr group_by_at
#' @importFrom tidyselect all_of
#'
#' @export
rt_group_by_all_of <- function(.data, ...) {

    columns <- rt_params_to_vector(...)
    return (group_by_at(.data, all_of(columns)))
}

#' like `head()` but converts to a data.frame so rows/columns aren't excluded when showing
#' @param .df dataframe
#' @param .n number of rows to include
#'
#' @export
rt_peak <- function(.df, .n=25) {

    return (.df %>% head(.n) %>% as.data.frame())
}

#' returns year and month factors in list
#'
#' @param .date_vector date vector
#' @param .abbreviate use month abbreviations
#'
#' @importFrom lubridate year
#'
#' @export
rt_get_year_month_factors <- function(.date_vector, .abbreviate=FALSE) {

    if(.abbreviate) {

        month_names <- month.abb

    } else {

        month_names <- month.name
    }

    return(list(
        year_factor=as.factor(as.character(year(.date_vector))),
        month_factor=factor(months(.date_vector, abbreviate = .abbreviate), levels=month_names)
        )
    )
}

#' as `[.column]_year` and `[.column]_month` factor variables to .data based off of date column .column
#'
#' @param .data date vector
#' @param .column name of date column in .data
#' @param .abbreviate use month abbreviations
#'
#' @export
rt_add_year_month_factors <- function(.data, .column, .abbreviate=FALSE) {

    .column <- deparse(substitute(.column))
    year_month_factors <- rt_get_year_month_factors(.data[[.column]], .abbreviate)
    .data[[paste0(.column, '_year')]] <- year_month_factors$year_factor
    .data[[paste0(.column, '_month')]] <- year_month_factors$month_factor

    return (.data)
}

#' `append` appears not to work with dates any longer... in 4.0.2.. perhaps they will fix
#' e.g.:
#'     selections <- list()
#'     append(selections, Sys.Date())  # do not know how to convert 'e' to class “Date”
#'     selections <- list(Sys.Date())
#'     append(selections, Sys.Date())  # adds date as numeric.. this use to work
#'
#' @param .list_a list to append to
#' @param .values any value or set of values to append to the list
#'
#' @export
append_list <- function(.list_a, .values) {
    # `append` appears not to work with dates any longer... in 4.0.2.. perhaps they will fix
    # selections <- list()
    # append(selections, Sys.Date())  # do not know how to convert 'e' to class “Date”
    # selections <- list(Sys.Date())
    # append(selections, Sys.Date())  # adds date as numeric.. this use to work

    return (c(.list_a, list(.values)))
}

#' wrapper arround `any(duplicated(.x))`
#'
#' @param .x vector of e.g. characters
#'
#' @export
any_duplicated <- function(.x) {
    any(duplicated(.x))
}

#' wrapper arround `stop_if(any(is.na(.x)))`. If `.empty_string_as_missing` is set to TRUE, it also treats
#'    empty strings as missing values. Otherwise, is just checks for NA values.
#'
#' @param .x vector of e.g. characters
#' @param .empty_string_as_missing if TRUE, empty strings are treated as missing. If TRUE, .x must be a character or factor vector
#'
#' @export
any_missing <- function(.x, .empty_string_as_missing=FALSE) {

    if(.empty_string_as_missing) {

        if(is.null(.x) || any(is.na(.x))) {

            return (TRUE)

        } else {

            stopifnot(is.character(.x) || is.factor(.x))
            return (any(.x == ''))
        }

    } else {

        return (is.null(.x) || any(is.na(.x)))
    }
}

#' If any of the expressions are TRUE, `stop` is called
#'
#' @param .expression any number of R expressions which should each evaluatoe to a logical vector
#'
#' @export
stop_if <- function(.expression) {
    stopifnot(!.expression)
}

#' wrapper arround `stop_if(any(duplicated(.x)))`
#'
#' @param .x vector of e.g. characters
#'
#' @export
stop_if_any <- function(.x) {
    stop_if(any(.x))
}

#' wrapper arround `stop_if(any(duplicated(.x)))`
#'
#' @param .x vector of e.g. characters
#'
#' @export
stop_if_any_duplicated <- function(.x) {
    stop_if_any(duplicated(.x))
}

#' Wrapper around `stop_if(any_missing(.x, .empty_string_as_missing))`
#'
#' @param .x vector of e.g. characters
#' @param .empty_string_as_missing if TRUE, empty strings are treated as missing
#'
#' @export
stop_if_any_missing <- function(.x, .empty_string_as_missing=FALSE) {

    stop_if(any_missing(.x, .empty_string_as_missing))
}

#' Wrapper around `stopifnot(identical(.x, .y))`
#'
#' Mainly, this is here so that I remember not to do `stopifnot(all(.x == .y))` because if one of .x or .y has
#' zero length all() still evaluates to TRUE and the program is not stopped, even though .x and .y do not have
#' the same values
#'
#' @param .x vector of e.g. characters
#' @param .y vector of e.g. characters
#'
#' @export
stop_if_not_identical <- function(.x, .y) {

    stopifnot(identical(.x, .y))
}