R/best_model_per_id.R

Defines functions accuracy_by_id

Documented in accuracy_by_id

#' Find best model per id and create forecast out of sample
#'
#' @param forecast_tbl Data frame with all the forecasts by all the models
#' @param first_date_in_test_set The first date in the test set
#' @param one_obs_id Vector of id's with one observation
#' @param transformation_used What transformation, if any, was used. Possible values are 'none', 'log' or 'log1p'
#' @param model_used Character vector of models used.
#' @param unnest Logical. Should the widened forecast columns be unnested? Defaults to `FALSE`.



accuracy_by_id <- function(forecast_tbl, one_obs_id, first_date_in_test_set, transformation_used, model_used, unnest = FALSE) {

    return_list <- list() # list with values to return

    # Accuracy table: drop ids that only have a single observation (no
    # hold-out accuracy can be computed for them) and keep only the
    # columns the accuracy pipeline needs.
    accuracy_tbl <- forecast_tbl %>%
        filter(!id %in% one_obs_id) %>%
        select(.model_desc, .index, .value, id)

    # Back-transform forecasts to the original level so accuracy metrics
    # are comparable. Any value other than "log"/"log1p" (e.g. "none")
    # leaves the values untouched.
    if (transformation_used == "log") {
        accuracy_tbl <- accuracy_tbl %>%
            mutate(.value = exp(.value))
    } else if (transformation_used == "log1p") {
        accuracy_tbl <- accuracy_tbl %>%
            mutate(.value = expm1(.value))
    }

    # Column names expected after widening: the actuals, every model
    # (upper-cased to match .model_desc) and the ensemble.
    model_used <- c("ACTUAL", stringr::str_to_upper(model_used), "ENSEMBLE")

    # One column per model, one row per (id, .index)
    accuracy_tbl <- accuracy_tbl %>%
        pivot_wider(names_from = .model_desc, values_from = .value)

    if (unnest) {
        # all_of() makes the tidyselect use of the external character
        # vector explicit: it avoids the ambiguity warning and never
        # silently matches a column literally named "model_used".
        accuracy_tbl <- accuracy_tbl %>%
            unnest(cols = all_of(model_used))
    }

    accuracy_tbl <- accuracy_tbl %>%
        filter(.index >= first_date_in_test_set) %>%
        pivot_longer(cols = -c(.index, id, ACTUAL)) %>%

        # necessary if some .value is empty
        drop_na(value) %>%

        group_by(id, name) %>%
        summarize_accuracy_metrics(
            truth = ACTUAL,
            estimate = value,
            metric_set = default_forecast_accuracy_metric_set()
        ) %>%

        # Find best model per id by counting number of times it has the
        # lowest error across metrics
        pivot_longer(cols = mae:rsq,
                     names_to = "metric_name",
                     values_to = "metric_value") %>%

        # Skip rsq (higher is better, unlike the error metrics)
        filter(metric_name != "rsq") %>%

        # Rank the models within each (metric, id); the model with the
        # lowest rank sum across metrics wins the majority vote.
        group_by(metric_name, id) %>%
        arrange(metric_value) %>%
        mutate(index = row_number()) %>%
        group_by(name, id) %>%
        mutate(sum = sum(index)) %>%
        group_by(id) %>%

        # Best model based on majority vote
        select(id, name, sum, metric_name, metric_value) %>%
        distinct() %>%
        filter(sum == min(sum)) %>%

        # If it's a tie, filter by rmse, then mase, then mae. If still a
        # tie, pick the first row.
        pivot_wider(names_from = metric_name, values_from = metric_value) %>%
        filter(rmse == min(rmse)) %>%
        filter(mase == min(mase)) %>%
        filter(mae  == min(mae)) %>%
        slice_head(n = 1)

    # Best model per id.
    # NOTE(review): after slice_head(n = 1) above there is exactly one row
    # per id, so n_models is always 1 and the CATBOOST tie-break below is
    # effectively a no-op — kept to preserve existing behavior.
    best_metric_by_id <- accuracy_tbl %>%
        select(id, name) %>%
        set_names("id", "best_model") %>%
        group_by(id) %>%
        mutate(n_models = n(),
               choose = case_when(
                   n_models > 1 & str_detect(best_model, "CATBOOST") ~ "choose",
                   n_models > 1 & !str_detect(best_model, "CATBOOST") ~ "not_choose",
                   TRUE ~ "choose"
               )
        ) %>%
        filter(choose == "choose") %>%
        select(-c(n_models, choose)) %>%
        ungroup()

    # Ids with a single observation cannot be evaluated on a hold-out set;
    # they get CATBOOST by default.
    one_sample_id_tbl <- tibble(
        id         = one_obs_id,
        best_model = "CATBOOST"
        )

    # Final best model by id
    best_metric_by_id <- best_metric_by_id %>%
        bind_rows(one_sample_id_tbl)

    # Return both the full accuracy table and the winner per id
    return_list$accuracy_tbl <- accuracy_tbl
    return_list$best_model_per_id <- best_metric_by_id

    return(return_list)

}
vidarsumo/sumots documentation built on June 29, 2021, 4:23 a.m.