R/prepare_ml_data.R

Defines functions data_prep_func

Documented in data_prep_func

#' Prepares data for modeling
#'
#' @param data Data frame with data on the right date format e.g. daily, weekly, monthly and with column named 'id'
#' @param outcome_var Name of the outcome variable. The function will change the outcome variable name to 'outcome'
#' @param negative_to_zero Recodes negative values as zero, defaults to TRUE
#' @param max_gap_size The maximum length that the outcome can be zero. If the interval is larger than max_gap_size then only use data after the interval
#' @param trailing_zero Extends all time series, back and forward, so they will be the same length. Defaults to FALSE
#' @param transformation Should the series be transformed, e.g. log or log1p. Defaults to none
#' @param use_holidays Should national holidays be included. As of now this has to be a dataframe supplied by the user to the holidays_to_use argument
#' @param holidays_to_use_1 Data frame of dummy holidays. Outcome of the create_holiday() function: fridagar_tbl
#' @param holidays_to_use_2 Data frame of holidays, one variable.
#' @param horizon The forecast horizon
#' @param drop_na_values When creating data_prepared_tbl, should NA's be dropped. Defaults to TRUE
#' @param fill_na_with_zero Used when drop_na = FALSE to fill missing values with zero instead of dropping them.
#' @param clean Should the data be cleand for outliers. Defaults to FALSE
#' @param use_holiday_to_clean Uses fridagar_one_var from the create_holiday() function to revert series to original value if cleand
#' @param pacf_threshold Threshold for where to cut the PACF to choose terms for the fourier calculation
#' @param no_fourier_terms Number of fourier terms, defultas to 5
#' @param fourier_k The fourier term order, defaults to 5
#' @param slidify_period The window size, defaults to c(4, 8)
#' @param use_seasonal_lag Should lag of outcome variable, equal to the seasonality, be used. Defaults to TRUE.
#' @param seasonal_frequency The frequency of the data. E.g. 52 for weekly data
#' @param use_own_fourier Should you use your own fourier terms? Defaults to FALSE
#' @param fourier_terms The fourier terms to include.
#' @param recursive_data Should the data be prepared for a recursive forecasting. Defaults to FALSE.
#' @param no_recursive_lag The number of lags to be.
#' @param xreg External regressors to add
#' @param anomaly Should anomaly detection variable added. Defaults to FALSE.
#' @param intermittent Intermittent threshold when to remove anomaly label. Defaults to 0.3
#'
#' @return List with data_prepared, future_data, train_data, splits and horizon


data_prep_func <- function(data, outcome_var, negative_to_zero = FALSE, fix_gap_size = FALSE, max_gap_size, remove_one_obs = FALSE,
                           trailing_zero = FALSE, transformation = "none", use_holidays = FALSE,
                           holidays_to_use_1 = NULL, holidays_to_use_2 = NULL, use_seasonal_lag = TRUE, seasonal_frequency,
                           horizon, clean = FALSE, drop_na_values = TRUE,  use_holiday_to_clean = FALSE,
                           holiday_for_clean = NULL, use_abc_category = FALSE, pacf_threshold = 0.2, no_fourier_terms = 5,
                           fourier_k = 5, slidify_period = c(4, 8), use_own_fourier = FALSE, fourier_terms, intermittent = 0.3,
                           recursive_data = FALSE, no_recursive_lag, xreg = NULL, fill_na_with_zero = TRUE, anomaly = FALSE) {

    require(tidyverse)
    require(timetk)
    require(zoo)
    require(tidymodels)

    # return list
    return_list <- list()


    # Rename outcome
    df <- data %>%
        rename("outcome" = outcome_var)

    df <- df %>%
        arrange(id, date)


    # Negative values
    if(negative_to_zero) {
        df <- df %>%
            mutate(outcome = ifelse(outcome < 0 , 0, outcome))

    } else {
        df
    }

    # Check if all observations ara zero
    all_zeros_id <- df %>%
        group_by(id) %>%
        summarise(all_zero = all(outcome == 0)) %>%
        filter(all_zero) %>%
        pull(id)

    # Gap size
    if (fix_gap_size) {
        df <- df %>%
            group_by(id) %>%
            mutate(max_date = max(date)) %>%
            ungroup() %>%
            complete(id, date) %>%
            mutate(outcome = ifelse(is.na(outcome), 0, outcome)) %>%
            fill(-outcome, .direction = "down") %>%
            group_by(id) %>%

            # leading zeros first
            mutate(cumsum_sala = cumsum(outcome)) %>%
            filter(cumsum_sala > 0, date <= max_date) %>%
            select(-max_date) %>%

            # choose only items with sales in the past 52 weeks
            mutate(no_sala = ifelse(outcome == 0, 1, 0)) %>%
            mutate(cum_no_sala = rollapplyr(no_sala, width = max_gap_size, FUN = sum, partial = TRUE),
                   indicator = case_when(
                       lag(cum_no_sala) == max_gap_size & outcome > 0 ~ 1,
                       all(cum_no_sala < max_gap_size) ~ 1,
                       TRUE ~ 0),
                   cum_indicator = cumsum(indicator)) %>%
            filter(cum_indicator >= 1) %>%
            select(-c(cumsum_sala, no_sala, cum_no_sala, indicator, cum_indicator)) %>%
            ungroup()}



    # Trailing zeros
    df <- df %>%
        complete(id, date) %>%
        group_by(id) %>%
        fill(-outcome, .direction = "up") %>%
        fill(-outcome, .direction = "down") %>%
        ungroup() %>%
        mutate(outcome = ifelse(is.na(outcome), 0 , outcome))

    # if(trailing_zero) {
    #     df <- df %>%
    #         complete(id, date) %>%
    #         group_by(id) %>%
    #         fill(-outcome, .direction = "up") %>%
    #         fill(-outcome, .direction = "down") %>%
    #         ungroup() %>%
    #         mutate(outcome = ifelse(is.na(outcome), 0 , outcome))
    # } else {
    #
    #     df <- df %>%
    #         complete(id, date) %>%
    #         # Hendi út trailing zeros
    #
    #         #filter(id == "dan764333") %>%
    #         mutate(outcome = ifelse(is.na(outcome), 0, outcome),
    #                zero_helper = ifelse(outcome == 0, 0, 1)) %>%
    #         group_by(id) %>%
    #         mutate(cumsum_zero = cumsum(zero_helper)) %>%
    #         rowwise() %>%
    #
    #         # A product can have zero sale the first month because of -1 and +1 sales
    #         # We should still include this product so we need an indicator that grabs
    #         # this products
    #         mutate(no_na = sum(is.na(across(3:ncol(df))))) %>%
    #         #mutate(no_na = sum(is.na((across(dataareaid:packaging_group_id))))) %>%
    #         ungroup() %>%
    #         mutate(no_na_dummy = ifelse(no_na == 0, 1, 0),
    #                no_na_cumsum = cumsum(no_na_dummy)) %>%
    #
    #         # Bý til indicator hvenær eigi að filter
    #         mutate(filter_indicator = case_when(
    #             cumsum_zero > 0 | no_na_cumsum > 0 ~ "choose",
    #             TRUE ~ "skip"
    #         )) %>%
    #
    #         filter(filter_indicator == "choose") %>%
    #         ungroup() %>%
    #         # drop_na(ft) %>%
    #         select(-contains("zero"), -contains("cumsum"), -no_na_dummy) %>%
    #         drop_na()
    #
    # }

    # Transformations
    if(transformation == "none") {
        df
    } else if (transformation == "log") {
        df <- df %>%
            mutate(outcome = log(outcome))
    } else if (transformation == "log1p") {
        df <- df %>%
            mutate(outcome = log1p(outcome))
    }

    # Remove ID with one obs
    if (remove_one_obs) {
        one_obs_id_tbl <- df %>%
            group_by(id) %>%
            mutate(no_obs = n()) %>%
            filter(no_obs == 1) %>%
            pull(id)

        df <- df %>%
            group_by(id) %>%
            mutate(no_obs = n()) %>%
            filter(no_obs > 1) %>%
            select(-no_obs)

    }


    # Future and prepared data
    df <- df %>%
        ungroup() %>%
        complete(date, id) %>%
        replace_na(list(outcome = 0)) %>%
        mutate(id = as_factor(id)) %>%
        group_by(id) %>%
        future_frame(date, .length_out = horizon, .bind_data = TRUE) %>%
        fill(-outcome, .direction = "down") %>%
        ungroup() %>%
        drop_na(-outcome)

    if (!is.null(xreg)) {
        df <- df %>%
            left_join(xreg, by = "date")
    }


    # Holidays
    if (use_holidays) {

        if(!is.null(holidays_to_use_2)) {
            df <- df %>%
                left_join(holidays_to_use_1) %>%
                left_join(holidays_to_use_2)

        } else {
            df <- df %>%
                left_join(holidays_to_use_1)

        }

    } else {
        df
    }



    # Fourier period
    if (use_own_fourier) {
        fourier_terms <- fourier_terms

    } else {
        if (use_abc_category) {
            fourier_periods <- df %>%
                filter(str_detect(abc, c("a|b"))) %>%
                group_by(id) %>%
                tk_acf_diagnostics(date, outcome) %>%
                ungroup() %>%
                filter(abs(PACF) > pacf_threshold) %>%
                count(lag) %>%
                arrange(desc(n)) %>%
                filter(lag > 1) %>%
                dplyr::slice(1:no_fourier_terms) %>%
                pull(lag)
        } else {
            fourier_periods <- df  %>%
                group_by(id) %>%
                tk_acf_diagnostics(date, outcome) %>%
                ungroup() %>%
                filter(abs(PACF) > pacf_threshold) %>%
                count(lag) %>%
                arrange(desc(n)) %>%
                filter(lag > 1) %>%
                dplyr::slice(1:no_fourier_terms) %>%
                pull(lag)
        }

        fourier_periods <- c(fourier_periods, seasonal_frequency/2,  seasonal_frequency)
        fourier_periods <- unique(fourier_periods)

        fourier_terms <- fourier_periods
    }



    # Full data
    if (recursive_data) {

        if (use_seasonal_lag) {
            if (use_abc_category) {
                full_data_tbl <- df %>%
                    arrange(id, abc, date) %>%
                    group_by(id) %>%
                    tk_augment_fourier(date, .periods = fourier_terms, .K = fourier_k) %>%
                    tk_augment_lags(.value = outcome, .lags = c(1:no_recursive_lag, seasonal_frequency - 1, seasonal_frequency, seasonal_frequency + 1)) %>%
                    tk_augment_slidify(
                        contains(paste0("outcome_lag", seasonal_frequency)),
                        .f = ~mean(.x, na.rm = TRUE),
                        .period  = slidify_period,
                        .partial = TRUE,
                        .align   = "center"
                    ) %>%
                    ungroup() %>%
                    rowid_to_column(var = "rowid")

            } else {
                full_data_tbl <- df %>%
                    arrange(id, date) %>%
                    group_by(id) %>%
                    tk_augment_fourier(date, .periods = fourier_terms, .K = fourier_k) %>%
                    tk_augment_lags(.value = outcome, .lags = c(1:no_recursive_lag, seasonal_frequency - 1, seasonal_frequency, seasonal_frequency + 1)) %>%
                    tk_augment_slidify(
                        contains(paste0("outcome_lag", seasonal_frequency)),
                        .f = ~mean(.x, na.rm = TRUE),
                        .period  = slidify_period,
                        .partial = TRUE,
                        .align   = "center"
                    ) %>%
                    ungroup() %>%
                    rowid_to_column(var = "rowid")
            }

        } else {

            if (use_abc_category) {
                full_data_tbl <- df %>%
                    arrange(id, abc, date) %>%
                    group_by(id) %>%
                    tk_augment_fourier(date, .periods = fourier_terms, .K = fourier_k) %>%
                    tk_augment_lags(.value = outcome, .lags = c(1:no_recursive_lag)) %>%
                    ungroup() %>%
                    rowid_to_column(var = "rowid")

            } else {
                full_data_tbl <- df %>%
                    arrange(id, date) %>%
                    group_by(id) %>%
                    tk_augment_fourier(date, .periods = fourier_terms, .K = fourier_k) %>%
                    tk_augment_lags(.value = outcome, .lags = c(1:no_recursive_lag)) %>%
                    ungroup() %>%
                    rowid_to_column(var = "rowid")
            }

        }


    } else {

        if (use_seasonal_lag) {
            if (use_abc_category) {
                full_data_tbl <- df %>%
                    arrange(id, abc, date) %>%
                    group_by(id) %>%
                    tk_augment_fourier(date, .periods = fourier_terms, .K = fourier_k) %>%
                    tk_augment_lags(.value = outcome, .lags = c(seasonal_frequency - 1, seasonal_frequency, seasonal_frequency + 1)) %>%
                    tk_augment_slidify(
                        contains("outcome_lag"),
                        .f = ~mean(.x, na.rm = TRUE),
                        .period  = slidify_period,
                        .partial = TRUE,
                        .align   = "center"
                    ) %>%
                    ungroup() %>%
                    rowid_to_column(var = "rowid")

            } else {
                full_data_tbl <- df %>%
                    arrange(id, date) %>%
                    group_by(id) %>%
                    tk_augment_fourier(date, .periods = fourier_terms, .K = fourier_k) %>%
                    tk_augment_lags(.value = outcome, .lags = c(seasonal_frequency - 1, seasonal_frequency, seasonal_frequency + 1)) %>%
                    tk_augment_slidify(
                        contains("outcome_lag"),
                        .f = ~mean(.x, na.rm = TRUE),
                        .period  = slidify_period,
                        .partial = TRUE,
                        .align   = "center"
                    ) %>%
                    ungroup() %>%
                    rowid_to_column(var = "rowid")
            }

        } else {

            if (use_abc_category) {
                full_data_tbl <- df %>%
                    arrange(id, abc, date) %>%
                    group_by(id) %>%
                    tk_augment_fourier(date, .periods = fourier_terms, .K = fourier_k) %>%
                    ungroup() %>%
                    rowid_to_column(var = "rowid")

            } else {
                full_data_tbl <- df %>%
                    arrange(id, date) %>%
                    group_by(id) %>%
                    tk_augment_fourier(date, .periods = fourier_terms, .K = fourier_k) %>%
                    ungroup() %>%
                    rowid_to_column(var = "rowid")
            }

        }

    }


    # Fix trailing zeros
    if(!trailing_zero) {
        full_data_tbl <- full_data_tbl %>%
            filter(!id %in% all_zeros_id) %>%
            group_by(id) %>%
            mutate(cumsum_outcome = cumsum(outcome),
                   cumsum_outcome = ifelse(is.na(cumsum_outcome), 1, cumsum_outcome)) %>%
            ungroup() %>%
            filter(cumsum_outcome > 0) %>%
            select(-cumsum_outcome) %>%
            bind_rows(
                full_data_tbl %>%
                    filter(id %in% all_zeros_id)
            )
    }



    # data prepared
    # drop_na
    if (drop_na_values) {
        data_prepared_tbl <- full_data_tbl %>%
            filter(!is.na(outcome)) %>%
            drop_na()
    } else if (fill_na_with_zero) {
        data_prepared_tbl <- full_data_tbl %>%
            filter(!is.na(outcome)) %>%
            mutate(across(.cols = contains("_lag"),
                          .fns  = ~ replace_na(.x, 0)))
    } else {
        data_prepared_tbl <- full_data_tbl %>%
            filter(!is.na(outcome))
    }


    # Future data
    future_data <- full_data_tbl %>%
        filter(is.na(outcome))


    # address nan and na in variables _lag
    future_data <- future_data %>%
        mutate(across(.cols = contains("_lag"),
                      .fns = ~ ifelse(is.nan(.x), NA, .x))) %>%
        mutate(across(.cols = contains("_lag"),
                      .fns  = ~ replace_na(.x, 0)))


    # Anomaly detection

    if(anomaly) {
        short_time_series <- data_prepared_tbl %>%
            group_by(id) %>%
            summarise(n = n_distinct(date)) %>%
            filter(n < seasonal_frequency)


        anomaly_tbl <- data_prepared_tbl %>%
            filter(!id %in% short_time_series$id) %>%
            group_by(id) %>%
            tk_anomaly_diagnostics(date, outcome) %>%
            select(id, date, anomaly)


        # add anomaly to data_prepared_tbl and future_data
        data_prepared_tbl <- data_prepared_tbl %>%
            left_join(anomaly_tbl) %>%
            mutate(anomaly = ifelse(is.na(anomaly), "No", anomaly))

        future_data <- future_data %>%
            add_column(anomaly = "No")
    }

    # Some series are intermittent and all the spikes are marked as anomaly
    # I'll fix that here. If 0.3 of series is zero, then intermittent and all anomaly
    # labels will be removed
    data_prepared_tbl <- data_prepared_tbl %>%
        group_by(id) %>%
        mutate(inter = mean(outcome == 0)) %>%
        mutate(anomaly = ifelse(inter >= intermittent, "No", anomaly)) %>%
        select(-inter)


    # Split
    # splits <- data_prepared_tbl %>%
    #     time_series_split(date, assess = horizon, cumulative = TRUE)


    # Train data

    freq_name <- case_when(seasonal_frequency == 12 ~ "month",
                           seasonal_frequency == 52 ~ "week",
                           seasonal_frequency >= 360 ~ "year",
                           TRUE ~ "day")

    if(clean) {
        if (use_holiday_to_clean) {

            # Create holiday variable to "unclean" series if there is a holiday
            date_variable <- seq.Date(from = as.Date("1990-01-01"),
                                      to   = as.Date("2040-12-31"),
                                      by   = "day")

            date_tbl <- tibble(date = date_variable)

            if (freq_name == "week") {

                holiday_clean_date <- date_tbl %>%
                    mutate(date = floor_date(date, "week", week_start = 1)) %>%
                    distinct() %>%
                    left_join(holiday_for_clean) %>%
                    #left_join(holidays_to_use_2) %>%
                    mutate(holiday = ifelse(is.na(value), 0, 1)) %>%
                    mutate(lead_holiday = lead(holiday),
                           lag_holiday  = lag(holiday)) %>%
                    replace(is.na(.), 0) %>%
                    mutate(value = rowSums(. [2:4])) %>%
                    mutate(value = ifelse(value == 0, 0, 1)) %>%
                    select(-contains("holiday")) %>%
                    rename("holiday" = "value")

            } else {

                holiday_clean_date <- date_tbl %>%
                    mutate(date = floor_date(date, freq_name)) %>%
                    distinct() %>%
                    left_join(holiday_for_clean) %>%
                    #left_join(holidays_to_use_2) %>%
                    mutate(holiday = ifelse(is.na(value), 0, 1)) %>%
                    mutate(lead_holiday = lead(holiday),
                           lag_holiday  = lag(holiday)) %>%
                    replace(is.na(.), 0) %>%
                    mutate(value = rowSums(. [2:4])) %>%
                    mutate(value = ifelse(value == 0, 0, 1)) %>%
                    select(-contains("holiday")) %>%
                    rename("holiday" = "value")

            }


            if (transformation == "log") {
                data_prepared_tbl <- data_prepared_tbl %>%
                    filter(!id %in% all_zeros_id) %>%
                    arrange(id, date) %>%
                    group_by(id) %>%
                    mutate(cumsum_zero = cumsum(outcome)) %>%
                    filter(cumsum_zero > 0) %>%
                    mutate(outcome = exp(outcome)) %>%
                    mutate(outcome_clean = ts_clean_vec(outcome)) %>%
                    left_join(holiday_clean_date) %>%
                    mutate(holiday = ifelse(is.na(holiday), 0, 1)) %>%
                    mutate(outcome = case_when(holiday == 1 ~ outcome,
                                               TRUE ~ outcome_clean)) %>%
                    select(-c(outcome_clean, cumsum_zero, holiday)) %>%
                    mutate(outcome = log(outcome)) %>%
                    bind_rows(data_prepared_tbl %>%
                                  filter(id %in% all_zeros_id))



            } else if (transformation == "log1p") {
                data_prepared_tbl <- data_prepared_tbl %>%
                    filter(!id %in% all_zeros_id) %>%
                    arrange(id, date) %>%
                    group_by(id) %>%
                    mutate(cumsum_zero = cumsum(outcome)) %>%
                    filter(cumsum_zero > 0) %>%
                    mutate(outcome = expm1(outcome)) %>%
                    mutate(outcome_clean = ts_clean_vec(outcome)) %>%
                    left_join(holiday_clean_date) %>%
                    mutate(holiday = ifelse(is.na(holiday), 0, 1)) %>%
                    mutate(outcome = case_when(holiday == 1 ~ outcome,
                                               TRUE ~ outcome_clean)) %>%
                    select(-c(outcome_clean, cumsum_zero, holiday)) %>%
                    mutate(outcome = log1p(outcome)) %>%
                    bind_rows(data_prepared_tbl %>%
                                  filter(id %in% all_zeros_id))



            } else {
                data_prepared_tbl <- data_prepared_tbl %>%
                    filter(!id %in% all_zeros_id) %>%
                    arrange(id, date) %>%
                    group_by(id) %>%
                    mutate(cumsum_zero = cumsum(outcome)) %>%
                    filter(cumsum_zero > 0) %>%
                    mutate(outcome_clean = ts_clean_vec(outcome)) %>%
                    left_join(holiday_clean_date) %>%
                    mutate(holiday = ifelse(is.na(holiday), 0, 1)) %>%
                    mutate(outcome = case_when(holiday == 1 ~ outcome,
                                               TRUE ~ outcome_clean)) %>%
                    select(-c(outcome_clean, cumsum_zero, holiday)) %>%
                    bind_rows(data_prepared_tbl %>%
                                  filter(id %in% all_zeros_id))


            }

        } else {

            if (transformation == "log") {
                data_prepared_tbl <- data_prepared_tbl %>%
                    filter(!id %in% all_zeros_id) %>%
                    arrange(id, date) %>%
                    group_by(id) %>%
                    mutate(cumsum_zero = cumsum(outcome)) %>%
                    filter(cumsum_zero > 0) %>%
                    mutate(outcome = exp(outcome)) %>%
                    mutate(outcome = ts_clean_vec(outcome)) %>%
                    select(-cumsum_zero) %>%
                    mutate(outcome = log(outcome)) %>%
                    bind_rows(data_prepared_tbl %>%
                                  filter(id %in% all_zeros_id))

            } else if (transformation == "log1p") {
                data_prepared_tbl <- data_prepared_tbl %>%
                    filter(!id %in% all_zeros_id) %>%
                    arrange(id, date) %>%
                    group_by(id) %>%
                    mutate(cumsum_zero = cumsum(outcome)) %>%
                    filter(cumsum_zero > 0) %>%
                    mutate(outcome = expm1(outcome)) %>%
                    mutate(outcome = ts_clean_vec(outcome)) %>%
                    select(-cumsum_zero) %>%
                    mutate(outcome = log1p(outcome)) %>%
                    bind_rows(data_prepared_tbl %>%
                                  filter(id %in% all_zeros_id))
            } else {
                data_prepared_tbl <- data_prepared_tbl %>%
                    filter(!id %in% all_zeros_id) %>%
                    arrange(id, date) %>%
                    group_by(id) %>%
                    mutate(cumsum_zero = cumsum(outcome)) %>%
                    filter(cumsum_zero > 0) %>%
                    mutate(outcome = ts_clean_vec(outcome)) %>%
                    select(-cumsum_zero) %>%
                    bind_rows(data_prepared_tbl %>%
                                  filter(id %in% all_zeros_id))
            }
        }

        splits <- data_prepared_tbl %>%
            time_series_split(date, assess = horizon, cumulative = TRUE)

    } else {
        data_prepared_tbl <- data_prepared_tbl

        splits <- data_prepared_tbl %>%
            time_series_split(date, assess = horizon, cumulative = TRUE)
    }


    return_list$full_data     <- full_data_tbl
    return_list$data_prepared <- data_prepared_tbl
    return_list$future_data   <- future_data
    return_list$splits        <- splits
    return_list$horizon       <- horizon
    return_list$fourier_terms <- fourier_terms

    if (exists("one_obs_id_tbl")) {
        return_list$one_obs_id <- one_obs_id_tbl
    }

    return(return_list)

}
vidarsumo/sumots documentation built on June 29, 2021, 4:23 a.m.