inst/analysis/accuracy-bootstrap-intervals.R

# Setup ------------------------------------------------------------------------
fs <- FeatureStore$new()
model_name <- c("arithmetic-mean", "rpart", "ranger", "catboost")[1]
output_dir <- file.path(getOption("path_archive"), model_name)
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

# Get the Data -----------------------------------------------------------------
tidy_data <- fs$tidy_data
historical_data <- tidy_data %>% dplyr::filter(source %in% "historical_data") %>% dplyr::select(-source)
new_data <- tidy_data %>% dplyr::filter(source %in% "new_data") %>% dplyr::select(-source)

# Saple the Data ---------------------------------------------------------------
set.seed(1936)
K <- 10
rset_obj <- historical_data %>% dplyr::sample_n(6e4) %>% rsample::bootstraps(times = K, strata = "damage_grade")

# Formualte the Model ----------------------------------------------------------
role_pk <- "building_id"
role_none <- NULL
role_input <- match_columns(historical_data, "^geo_|^has_superstructure_mud_mortar_stone$|^age$|_type$")
role_target <- "damage_grade"

# Run model ---------------------------------------------------------------
model_performance <- tibble::tibble()
pb <- progress::progress_bar$new(total = K, format = "Training Models [:bar] :current/:total (:percent) eta: :eta")
for(k in seq_len(get_rsample_num_of_splits(rset_obj))){
    pb$tick()

    train_set <- get_rsample_training_set(rset_obj, split = k)
    test_set <- get_rsample_test_set(rset_obj, split = k)

    pm <- PentaModel$new(path = file.path(.Options$path_models, model_name))
    pm$set_historical_data(train_set)
    pm$set_new_data(test_set)
    pm$set_role_pk(role_pk)
    pm$set_role_input(role_input)
    pm$set_role_target(role_target)

    pm$model_init()$model_fit()$model_predict()$model_store()

    metadata <- test_set %>% dplyr::select(role_pk, dplyr::starts_with("geo_"))
    truth.numeric <- test_set %>% dplyr::select_at(c(role_pk, role_target)) %>% dplyr::rename("truth.numeric" = !!role_target)
    estimate.numeric <- pm$response %>% dplyr::select(role_pk, fit) %>% dplyr::rename("estimate.numeric" = "fit")
    data <-
        metadata %>%
        dplyr::right_join(truth.numeric, by = role_pk) %>%
        dplyr::right_join(estimate.numeric, by = role_pk) %>%
        dplyr::mutate(truth.class = as_earthquake_damage(truth.numeric), estimate.class = as_earthquake_damage(estimate.numeric))
    data <- data %>% dplyr::group_by(geo_level_1_id)

    new_entry <-
        Yardstick$
        new(data, truth = "truth.class", estimate = "estimate.class")$
        # delete_label(".estimator")$
        insert_label(".sample", k)$
        all_class_metrics

    model_performance <- dplyr::bind_rows(model_performance, new_entry)
}

# Visualisation -----------------------------------------------------------
boxplot_data <-
    model_performance %>%
    dplyr::mutate(geo_level_1_id = forcats::fct_reorder(geo_level_1_id, .estimate, median))

grand_accuracy <-
    boxplot_data %>%
    dplyr::group_by(.sample, geo_level_1_id) %>%
    dplyr::mutate(.estimate = .n * .estimate) %>%
    dplyr::group_by(.sample) %>%
    dplyr::summarise(.estimate = sum(.estimate) / sum(.n))

boxplot(.estimate ~ geo_level_1_id, boxplot_data %>% as.data.frame())
title(main = ".estimate ~ geo_level_1_id")
abline(h = quantile(grand_accuracy$.estimate, probs = c(0.5)))
data-science-competitions/Modeling-Earthquake-Damage documentation built on Dec. 25, 2019, 12:02 p.m.