Geoscience BC Study Code - Machine Learning Wrapper

library(geosciencebc2019008)
run_date = Sys.Date() %>% str_replace_all(., "-", "_")

model_prefix = 'regr_model_xgboost_wcfd/xgboost_wcfd'
input_data = "../wcfd_data/final_wcfd_mldf_regr.rds"
model_type = "regr.xgboost"

pset = makeParamSet(
  makeNumericParam("eta", lower = 0.1, upper = 0.6),
  makeNumericParam("gamma", lower = 0.1, upper = 10),
  makeNumericParam("lambda", lower = -1, upper = 2, trafo = function(x) 10^x),
  makeIntegerParam("nrounds", lower = 50, upper = 150),
  makeIntegerParam("max_depth", lower = 1, upper = 6),
  makeNumericParam("colsample_bytree", lower = 0.3, upper = 0.7),
  makeNumericParam("alpha", lower = 0, upper =1),
  makeIntegerParam("min_child_weight", lower = 1, upper = 7)
)

wa_nums = c(32768, 32950, 33857, 33665, 32582, 33161, 29429, 32301, 33113,
            24035, 33894, 33162, 32574, 32575, 33550,   37346, 30376, 30940, 
            28640, 29976, 30735)

shap_wells = which((read_rds(input_data) %>%
  dplyr::filter(seismogenic == 1) %>% 
  pull(wa_num)) %in% wa_nums)

target = "max_mag"

regr_comp_feats  = c(
  "calc_total_fluid_m3", "calc_total_proppant_t", "calc_completed_length_m",
  "n_stages", "min_midpoint_dist"
  )

regr_geo_feats = c(
  "pressure_depth_ratio_kpa_m", "top_montney_structure_mss", "third_order_residual_m",
  "geothermal_gradient_degc_km", "distance_listric_faults_berger_m",
  "distance_normal_faults_berger_m"
  )

final_feats = c(regr_comp_feats, regr_geo_feats)

ml_df <- read_rds(input_data) %>%
  dplyr::filter(seismogenic == 1) %>%
  dplyr::select(all_of(target), all_of(final_feats))

set.seed(2019008)
train_rows = sample(nrow(ml_df)*0.9)

train = ml_df[train_rows,]
test = ml_df[-train_rows,]

learner = makeLearner(model_type)

train_task = makeRegrTask(data = train, target = target)
test_task = makeRegrTask(data = test, target = target)
all_task = makeRegrTask(data = ml_df, target = target)
meas = list(mae, rmse)

# hyperparameter tuning with 5-fold cross-validation & MBO optimization
tune_res = tuneParams(
  learner, train_task, resampling=cv5, par.set=pset, 
  control=makeTuneControlMBO(budget = 100L),
  measures=meas
)

write_rds(tune_res, paste0('../output/',model_prefix,"_tune_res.rds"))

# set hyperparameters
tuned_learner = setHyperPars(learner = learner, par.vals = tune_res$x)

# train final model for performance and interpretation
model = train(tuned_learner, train_task)
test_predict = predict(model, newdata=test)
train_predict = predict(model, newdata=train)

predictor = Predictor$new(model, data = ml_df)

# monte-carlo performance measures
resample = mlr::resample(
  tuned_learner, all_task, 
  makeResampleDesc("Subsample", iters=500, split=4/5, predict='both'),  
  measures = meas,
  show.info = FALSE
  )

model_res = get_resample_regr_res(resample)
model_res$train_perf = performance(predict(model, newdata=train), measures = meas)
model_res$test_perf = performance(predict(model, newdata=test), measures = meas)
model_res$model = model
model_res$tuned_learner = tuned_learner

sink(paste0('../output/',model_prefix,"_model_results.txt"))
print(model_res)
sink()

write_rds(model_res, paste0('../output/',model_prefix,"_resample.rds"))

p1 = plotResiduals(train_predict) +
  geom_abline(slope=1,linetype = "dashed") +
  ggtitle("Training Set") +
  theme_minimal()

p2 = plotResiduals(test_predict) +
  geom_abline(slope=1,linetype = "dashed") +
  ggtitle("Test Set") +
  theme_minimal()

ggsave(file = paste0('../output/',model_prefix,"_resplot.jpg"), 
       arrangeGrob(grobs = list(p1,p2), nrow=1),
       width = 12, height = 8)

p1 = plotLearnerPrediction(
  tuned_learner, task = all_task, 
  features = c("calc_total_fluid_m3", "calc_total_proppant_t"),
  measures = mae
  )

p2 = plotLearnerPrediction(
  tuned_learner, task = all_task, 
  features = c("min_midpoint_dist", "n_stages"),
  measures = mae
  )

p3 = plotLearnerPrediction(
  tuned_learner, task = all_task, 
  features = c("third_order_residual_m", "geothermal_gradient_degc_km"),
  measures = mae
  )

p4 = plotLearnerPrediction(
  tuned_learner, task = all_task, 
  features = c("distance_listric_faults_berger_m", "distance_normal_faults_berger_m"),
  measures = mae
  )

ggsave(file = paste0('../output/',model_prefix,"_learnpreds.jpg"), 
       arrangeGrob(grobs = list(p1,p2,p3,p4), nrow=2),
       width = 12, height = 8)

# importance plot
imp = FeatureImp$new(predictor, loss='mae', n.repetitions = 50)

p1 = ggplot(imp$results) +
  geom_col(aes(x = feature, y = importance)) +
  labs(x = 'Feature', y = 'Permutation Importance') +
  coord_flip() +
  theme_minimal()

interact = Interaction$new(predictor)

p2 = ggplot(as.data.frame(interact$results)) +
  geom_col(aes(x = .feature, y = .interaction)) +
  labs(x = '', y = 'Interaction') +
  coord_flip() +
  theme_minimal()

ggsave(file = paste0('../output/',model_prefix,"_impintplot.jpg"), 
       arrangeGrob(grobs = list(p1,p2), nrow=1),
       width = 12, height = 8, dpi=600)

# partial dependence plot
make_pdp_plot <- function(i, predictor, feats){
  pdp <- FeatureEffect$new(predictor, method = 'pdp+ice', 
                           feature = feats[i], center.at = 2)

  plot(pdp) + theme_minimal() + theme(axis.title.y=element_blank())
}

plist <- map(.x = 1:length(final_feats), .f = make_pdp_plot,
             predictor = predictor, feats = final_feats)

ggsave(file = paste0('../output/',model_prefix,"_pdpplot.jpg"), 
       arrangeGrob(grobs = plist, ncol = 3, left = 'Magnitude'),
       width = 12, height = 8, dpi=600)

for (well in shap_wells){
  lime.explain = LocalModel$new(predictor, 
                                x.interest = ml_df[well,],
                                k = length(final_feats))

  p1 = ggplot(lime.explain$results) +
    geom_col(aes(x = feature, y = effect)) +
    labs(x = 'Feature', y = 'LIME Effect') +
    coord_flip() +
    theme_minimal()

  shapley = Shapley$new(predictor,
                        x.interest = ml_df[well,] %>% 
                          dplyr::select(all_of(final_feats)),
                        sample.size = 100)

  p2 = ggplot(shapley$results) +
    geom_col(aes(x = feature, y = phi)) +
    labs(x = 'Feature', y = 'SHAP Phi') +
    coord_flip() +
    theme_minimal()

  ggsave(file = paste0('../output/',model_prefix,"_",well,"_limeshap.jpg"),
         arrangeGrob(grobs = list(p1,p2), nrow=1),
         width = 12, height = 4, dpi=600)
}


Enlightengeo/GeoscienceBC_2019-008 documentation built on Feb. 4, 2021, 12:43 p.m.