In KWB-R/dwc.wells: A Package for Condition Predictions for Drinking Water Wells

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Install R Package

# Enable KWB-R universe

options(repos = c(
  kwbr = 'https://kwb-r.r-universe.dev',
  CRAN = 'https://cloud.r-project.org'))

# Install R package "dwc.wells"
install.packages('dwc.wells', dependencies = TRUE)

Input Dataset

library(dwc.wells)
df <- dwc.wells::model_data_reduced
str(df)

Resampling

resampling <- "random"
#resampling <- "by_well"
set.seed(1)

if (resampling == "random") {

  # for regression
  data_split <- rsample::initial_split(df %>% dplyr::select(-well_id),
                                       prop = 0.8,
                                       strata = Qs_rel)
  df_training <- data_split %>% rsample::training()
  df_test <- data_split %>% rsample::testing()

}


# version 2: splitting per well ids

if (resampling == "by_well") {

  well_ids <- unique(df$well_id)
  train_ids <- sample(well_ids, 0.8 * length(well_ids))
  test_ids <- setdiff(well_ids, train_ids)
  df_training <- df %>%
    dplyr::filter(well_id %in% train_ids) %>%
    dplyr::select(-well_id)
  df_test <- df %>%
    dplyr::filter(well_id %in% test_ids) %>%
    dplyr::select(-well_id)

}

tibble::as_tibble(df_training)
tibble::as_tibble(df_test)

Regression

Hyperparameter Tuning

# Specify model ----------------------------------------------------------------

  # specify model
rf_reg_tune_model <- parsnip::rand_forest(trees = 500, 
                                          mtry = tune::tune(), 
                                          min_n = tune::tune()) %>%
  # Specify the engine
  parsnip::set_engine('randomForest') %>%
  # Specify the mode
  parsnip::set_mode('regression')

# specify recipe
rec <- recipes::recipe(Qs_rel ~ ., data = df_training)

# setup  workflow
rf_reg_tune_wflow <- workflows::workflow() %>%
  workflows::add_recipe(rec) %>%
  workflows::add_model(rf_reg_tune_model)

# define cross validation procedure
cv_folds <- rsample::vfold_cv(df_training, v = 5)

# define hyperparameter grid
#rf_reg_grid <- grid_random(parameters(rf_reg_tune_model), size = 100)
#rf_reg_grid <- grid_regular(parameters(rf_reg_tune_model), c(5,5))
rf_reg_grid <- dials::grid_regular(dials::mtry(range = c(3, 15)),
                                   dials::min_n(range = c(5, 15)),
                                   levels = 13)

# parallelisation and tuning
doParallel::registerDoParallel()

set.seed(345)
rf_reg_tuning <- tune::tune_grid(
  rf_reg_tune_wflow,
  resamples = cv_folds,
  grid = rf_reg_grid
)

# visualise results
metrics <- rf_reg_tuning %>% tune::collect_metrics()
dwc.wells::save_data(metrics, 
                     path = getwd(), 
                     filename = sprintf("rf-regression_tuning-grid_resampling-%s_metric",
                                        resampling))

# visualise results
metrics %>%
  dplyr::filter(.metric == "rmse") %>%
  dplyr::select(mean, min_n, mtry) %>%
  tidyr::pivot_longer(min_n:mtry,
                      values_to = "value",
                      names_to = "parameter") %>%
  ggplot2::ggplot(ggplot2::aes(value, mean, color = parameter)) +
  ggplot2::geom_point(show.legend = FALSE) +
  ggplot2::scale_x_continuous(breaks = seq.int(1, 15, 2)) +
  ggplot2::facet_wrap(~parameter, scales = "free_x") +
  ggplot2::labs(x = NULL, y = "RMSE [%]") +
  sema.berlin.utils::my_theme()

ggplot2::ggsave("rf_reg_regression_hyperparameter_tuning_plot_regular_random_resampling.png",
                width = 6, 
                height = 3, 
                dpi = 600)

# raster heatmap plot
metrics %>%
  dplyr::filter(.metric == "rmse") %>%
  dplyr::select(mean, min_n, mtry) %>%
  ggplot2::ggplot(aes(x = min_n, y = mtry, fill = mean)) +
  ggplot2::geom_raster() +
  ggplot2::scale_x_continuous(breaks = seq.int(1, 15, 2)) +
  ggplot2::scale_y_continuous(breaks = seq.int(1, 15, 2)) +
  ggplot2::labs(fill = "RMSE [%]") +
  sema.berlin.utils::my_theme()

ggplot2::ggsave("rf_reg_regression_hyperparameter_tuning_plot_regular_random_resampling_heatmap.png",
                width = 5, 
                height = 3, 
                dpi = 600)



# determine best model
best_rmse <- tune::select_best(rf_reg_tuning, "rmse")
dwc.wells::save_data(best_rmse, 
                     path = getwd(), 
                     filename = "rf_reg_regression_best_model_regular", 
                     "RData")
final_rf_reg <- tune::finalize_model(rf_reg_tune_model, best_rmse)

# update workflow
rf_reg_final_wflow <- workflows::workflow() %>%
   workflows::add_recipe(rec) %>%
   workflows::add_model(final_rf_reg)

# train and test model / workflow
rf_reg_final_fit <- rf_reg_final_wflow %>% tune::last_fit(data_split)

#get metrics
rf_reg_final_fit %>%  tune::collect_metrics()

# get predictions
df_pred <- rf_reg_final_fit %>%  tune::collect_predictions()

# Evaluate model performance ---
ggplot2::scatterplot(df_pred)
ggplot2::ggsave("random_forest_regression_tuned_regular.png", width = 3.5, height = 3)

Best-Fit Model

rf_reg_model <- parsnip::rand_forest(trees = 500,
                                     mtry = 6, 
                                     min_n = 10) %>%
  # Specify the engine
  parsnip::set_engine('randomForest') %>%
  # Specify the mode
  parsnip::set_mode('regression')

# Model training and assessment (regression) -----------------------------------

# Train model
set.seed(26)
rf_reg_fit <- rf_reg_model %>% parsnip::fit(Qs_rel ~ ., data = df_training)

# Make predictions
predictions <- predict(rf_reg_fit, df_test)

# Evaluate model performance
df_pred <- df_test %>% 
  dplyr::select(Qs_rel) %>% 
  dplyr::bind_cols(predictions)

yardstick::rmse(df_pred, truth = Qs_rel, estimate = .pred)
yardstick::rsq(df_pred, truth = Qs_rel, estimate = .pred)

# scatter plot
dwc.wells::scatterplot(df_pred, 
                       lines_80perc = FALSE, 
                       alpha = 1, 
                       pointsize = 0.9)
ggplot2::ggsave("scatterplot_rf-regression_numeric.png",
                dpi = 600,
                width = 3.5,
                height = 3)

Classification

# classification performance ---------------------------------------------------

# classify Qs data
df_pred <- df_pred %>%
  dplyr::mutate(Qs_rel_class = dwc.wells::classify_Qs(Qs_rel),
                .pred_class = dwc.wells::classify_Qs(.pred))

# confusion matrix
matrix <- yardstick::conf_mat(df_pred, 
                              truth = Qs_rel_class, 
                              estimate = .pred_class)
matrix 

# performance metrics
metrics <- summary(matrix)
metrics

dwc.wells::save_data(matrix, 
                     path = getwd(), 
                     filename = "rf_numeric_to_class_matrix_split80",
                     formats = "RData")