library(geosciencebc2019008)
run_date = Sys.Date() %>% str_replace_all(., "-", "_")
knitr::opts_chunk$set(eval = FALSE)

model_prefix = 'classif_model_glm_wcfd/glm_wcfd'
input_data = "../wcfd_data/final_wcfd_mldf_class.rds"
model_type = "classif.glmnet"

pset= makeParamSet(
  makeNumericParam('alpha',lower = 0, upper = 1),
  makeIntegerParam('lambda', lower = -4, upper = 1, trafo = function(x) 10^x)
)

wa_nums = c(32768, 32950, 33857, 33665, 32582, 33161, 29429, 32301, 33113,
            24035, 33894, 33162, 32574, 32575, 33550,   37346, 30376, 30940, 
            28640, 29976, 30735)

shap_wells = which(read_rds(input_data) %>% pull(wa_num) %in% wa_nums)

target = "seismogenic"

comp_feats = c(
  "calc_total_fluid_m3", "mean_rate_m3_min", "mean_proppant_per_stage_t",
  "proppant_cat.hybrid", "horiz_wells_in_5km", "min_midpoint_dist",
  "calc_completed_length_m"
)

geo_feats = c(
  "paleozoic_structure_mss", "geothermal_gradient_degc_km", "shmin_grasby",
  "distance_listric_faults_berger_m", "distance_normal_faults_berger_m"
)

final_feats = c(comp_feats, geo_feats)

ml_df <- read_rds(input_data) %>%
  dplyr::select(all_of(target), all_of(final_feats))

set.seed(2019008)
train_rows = sample(nrow(ml_df)*0.9)

train = ml_df[train_rows,]
test = ml_df[-train_rows,]

## Model Tuning
train_task = makeClassifTask(data = train, target = target)
test_task = makeClassifTask(data = test, target = target)
all_task = makeClassifTask(data = ml_df, target = target)

meas = list(logloss, f1, mmce)
learner = makeLearner(model_type, predict.type = "prob")

# hyperparameter tuning with 5-fold cross-validation & MBO optimization
tune_res = tuneParams(
  learner, train_task, resampling=cv5, par.set=pset, 
  control=makeTuneControlMBO(budget = 100L),
  measures=meas
)

write_rds(tune_res, paste0('../output/',model_prefix,"_tune_res.rds"))

# set hyperparameters
tuned_learner = setHyperPars(learner = learner, par.vals = tune_res$x)

# train final model for performance and interpretation
model = train(tuned_learner, train_task)
predictor = Predictor$new(model, data = ml_df)

# monte-carlo performance measures
resample = mlr::resample(
  tuned_learner, all_task, 
  makeResampleDesc("Subsample", iters=500, split=4/5, predict='both'),  
  measures = meas,
  show.info = FALSE
  )

model_res = get_resample_class_res(resample)

model_res$train_perf = performance(predict(model, newdata=train), measures = meas)
model_res$test_perf = performance(predict(model, newdata=test), measures = meas)
model_res$model = model
model_res$tuned_learner = tuned_learner
model_res$train_conf = calculateConfusionMatrix(
  predict(model, newdata=train), relative = TRUE, sums = TRUE
  )
model_res$test_conf = calculateConfusionMatrix(
  predict(model, newdata=test), relative = TRUE, sums = TRUE
  )

sink(paste0('../output/',model_prefix,"_model_results.txt"))
print(model_res)
sink()

write_rds(model_res, paste0('../output/',model_prefix,"_model_results.rds"))

## Threshold vs Performance
pvs = generateThreshVsPerfData(
  predict(model, newdata=ml_df), 
  measures = list(fpr, tpr, mmce))

pvs_plot = plotThreshVsPerf(pvs)

pvs_plot +
  theme_minimal() +
  ggsave(paste0('../output/',model_prefix,"_pvsplot.jpg"), width=12, height=8)

plotROCCurves(pvs) +
  theme_minimal() +
  ggsave(paste0('../output/',model_prefix,"_roccurve.jpg"), width=6, height=6)

## Feature Importance and Interaction
coef = data.frame(
  'feature' = rownames(coef(getLearnerModel(model))),
  'coefficient' = matrix(coef(getLearnerModel(model)))
  ) %>% mutate(coefficient = coefficient) %>% 
  filter(feature != '(Intercept)') %>%
  filter(feature != 'proppant_cat.hybrid') %>%
  mutate(coefficient = (coefficient-mean(coefficient))/sd(coefficient))

p1 = ggplot(coef) +
  geom_col(aes(x = feature, y = coefficient)) +
  labs(x = 'Feature', y = 'Coefficient') +
  coord_flip() +
  theme_minimal()

interact = Interaction$new(predictor)

p2 = ggplot(as.data.frame(interact$results) %>% filter(.class == 'X1')) +
  geom_col(aes(x = .feature, y = .interaction)) +
  labs(x = '', y = 'Interaction') +
  coord_flip() +
  theme_minimal()

ggsave(file = paste0('../output/',model_prefix,"_impintplot.jpg"), 
       arrangeGrob(grobs = list(p1,p2), nrow=1),
       width = 12, height = 8, dpi=600)

## Confusion Matrix and Threshold
p1 = plotLearnerPrediction(
  learner, task = all_task, 
  features = c("distance_listric_faults_berger_m", "distance_normal_faults_berger_m"),
  measures = logloss
  )

p2 = plotLearnerPrediction(
  learner, task = all_task, 
  features = c("paleozoic_structure_mss", "shmin_grasby"),
  measures = logloss
  )

p3 = plotLearnerPrediction(
  learner, task = all_task, 
  features = c("calc_total_fluid_m3", "mean_proppant_per_stage_t"),
  measures = logloss
  )

p4 = plotLearnerPrediction(
  learner, task = all_task, 
  features = c("horiz_wells_in_5km", "min_midpoint_dist"),
  measures = logloss
  )

ggsave(file = paste0('../output/',model_prefix,"_learnpreds.jpg"), 
       arrangeGrob(grobs = list(p1,p2,p3,p4), nrow=2),
       width = 12, height = 8, dpi=600)

## Partial Dependence & ICE Plots

make_pdp_plot <- function(i, predictor, feats){
  pdp <- FeatureEffect$new(predictor, method = 'pdp+ice', 
                           feature = feats[i], center.at = 0.5)

  plot(pdp) + theme_minimal() + theme(axis.title.y=element_blank())
}

plist <- map(.x = 1:length(final_feats), .f = make_pdp_plot,
             predictor = predictor, feats = final_feats)

ggsave(file = paste0('../output/',model_prefix,"_pdpplot.jpg"), 
       arrangeGrob(grobs = plist, ncol = 3, left = 'Seismogenic'),
       width = 12, height = 8, dpi=600)

## LIME & SHAP

for (well in shap_wells){
  lime.explain = LocalModel$new(predictor, 
                                x.interest = ml_df[well,],
                                k = length(final_feats))

  p1 = ggplot(lime.explain$results %>% filter(.class == 'X1')) +
    geom_col(aes(x = feature, y = effect)) +
    labs(x = 'Feature', y = 'LIME Effect') +
    coord_flip() +
    theme_minimal()

  shapley = Shapley$new(predictor,
                        x.interest = ml_df[well,] %>% 
                          dplyr::select(final_feats),
                        sample.size = 100)

  p2 = ggplot(shapley$results %>% filter(class == 'X1')) +
    geom_col(aes(x = feature, y = phi)) +
    labs(x = 'Feature', y = 'SHAP Phi') +
    coord_flip() +
    theme_minimal()

  ggsave(file = paste0('../output/',model_prefix,"_",well,"_limeshap.jpg"),
         arrangeGrob(grobs = list(p1,p2), nrow=1),
         width = 12, height = 4, dpi=600)
}


Enlightengeo/GeoscienceBC_2019-008 documentation built on Feb. 4, 2021, 12:43 p.m.