library(geosciencebc2019008) run_date = Sys.Date() %>% str_replace_all(., "-", "_") knitr::opts_chunk$set(eval = FALSE) model_prefix = 'classif_model_glm_wcfd/glm_wcfd' input_data = "../wcfd_data/final_wcfd_mldf_class.rds" model_type = "classif.glmnet" pset= makeParamSet( makeNumericParam('alpha',lower = 0, upper = 1), makeIntegerParam('lambda', lower = -4, upper = 1, trafo = function(x) 10^x) ) wa_nums = c(32768, 32950, 33857, 33665, 32582, 33161, 29429, 32301, 33113, 24035, 33894, 33162, 32574, 32575, 33550, 37346, 30376, 30940, 28640, 29976, 30735) shap_wells = which(read_rds(input_data) %>% pull(wa_num) %in% wa_nums) target = "seismogenic" comp_feats = c( "calc_total_fluid_m3", "mean_rate_m3_min", "mean_proppant_per_stage_t", "proppant_cat.hybrid", "horiz_wells_in_5km", "min_midpoint_dist", "calc_completed_length_m" ) geo_feats = c( "paleozoic_structure_mss", "geothermal_gradient_degc_km", "shmin_grasby", "distance_listric_faults_berger_m", "distance_normal_faults_berger_m" ) final_feats = c(comp_feats, geo_feats) ml_df <- read_rds(input_data) %>% dplyr::select(all_of(target), all_of(final_feats)) set.seed(2019008) train_rows = sample(nrow(ml_df)*0.9) train = ml_df[train_rows,] test = ml_df[-train_rows,] ## Model Tuning train_task = makeClassifTask(data = train, target = target) test_task = makeClassifTask(data = test, target = target) all_task = makeClassifTask(data = ml_df, target = target) meas = list(logloss, f1, mmce) learner = makeLearner(model_type, predict.type = "prob") # hyperparameter tuning with 5-fold cross-validation & MBO optimization tune_res = tuneParams( learner, train_task, resampling=cv5, par.set=pset, control=makeTuneControlMBO(budget = 100L), measures=meas ) write_rds(tune_res, paste0('../output/',model_prefix,"_tune_res.rds")) # set hyperparameters tuned_learner = setHyperPars(learner = learner, par.vals = tune_res$x) # train final model for performance and interpretation model = train(tuned_learner, train_task) predictor = Predictor$new(model, data = ml_df) # monte-carlo performance measures resample = mlr::resample( tuned_learner, all_task, makeResampleDesc("Subsample", iters=500, split=4/5, predict='both'), measures = meas, show.info = FALSE ) model_res = get_resample_class_res(resample) model_res$train_perf = performance(predict(model, newdata=train), measures = meas) model_res$test_perf = performance(predict(model, newdata=test), measures = meas) model_res$model = model model_res$tuned_learner = tuned_learner model_res$train_conf = calculateConfusionMatrix( predict(model, newdata=train), relative = TRUE, sums = TRUE ) model_res$test_conf = calculateConfusionMatrix( predict(model, newdata=test), relative = TRUE, sums = TRUE ) sink(paste0('../output/',model_prefix,"_model_results.txt")) print(model_res) sink() write_rds(model_res, paste0('../output/',model_prefix,"_model_results.rds")) ## Threshold vs Performance pvs = generateThreshVsPerfData( predict(model, newdata=ml_df), measures = list(fpr, tpr, mmce)) pvs_plot = plotThreshVsPerf(pvs) pvs_plot + theme_minimal() + ggsave(paste0('../output/',model_prefix,"_pvsplot.jpg"), width=12, height=8) plotROCCurves(pvs) + theme_minimal() + ggsave(paste0('../output/',model_prefix,"_roccurve.jpg"), width=6, height=6) ## Feature Importance and Interaction coef = data.frame( 'feature' = rownames(coef(getLearnerModel(model))), 'coefficient' = matrix(coef(getLearnerModel(model))) ) %>% mutate(coefficient = coefficient) %>% filter(feature != '(Intercept)') %>% filter(feature != 'proppant_cat.hybrid') %>% mutate(coefficient = (coefficient-mean(coefficient))/sd(coefficient)) p1 = ggplot(coef) + geom_col(aes(x = feature, y = coefficient)) + labs(x = 'Feature', y = 'Coefficient') + coord_flip() + theme_minimal() interact = Interaction$new(predictor) p2 = ggplot(as.data.frame(interact$results) %>% filter(.class == 'X1')) + geom_col(aes(x = .feature, y = .interaction)) + labs(x = '', y = 'Interaction') + coord_flip() + theme_minimal() ggsave(file = paste0('../output/',model_prefix,"_impintplot.jpg"), arrangeGrob(grobs = list(p1,p2), nrow=1), width = 12, height = 8, dpi=600) ## Confusion Matrix and Threshold p1 = plotLearnerPrediction( learner, task = all_task, features = c("distance_listric_faults_berger_m", "distance_normal_faults_berger_m"), measures = logloss ) p2 = plotLearnerPrediction( learner, task = all_task, features = c("paleozoic_structure_mss", "shmin_grasby"), measures = logloss ) p3 = plotLearnerPrediction( learner, task = all_task, features = c("calc_total_fluid_m3", "mean_proppant_per_stage_t"), measures = logloss ) p4 = plotLearnerPrediction( learner, task = all_task, features = c("horiz_wells_in_5km", "min_midpoint_dist"), measures = logloss ) ggsave(file = paste0('../output/',model_prefix,"_learnpreds.jpg"), arrangeGrob(grobs = list(p1,p2,p3,p4), nrow=2), width = 12, height = 8, dpi=600) ## Partial Dependence & ICE Plots make_pdp_plot <- function(i, predictor, feats){ pdp <- FeatureEffect$new(predictor, method = 'pdp+ice', feature = feats[i], center.at = 0.5) plot(pdp) + theme_minimal() + theme(axis.title.y=element_blank()) } plist <- map(.x = 1:length(final_feats), .f = make_pdp_plot, predictor = predictor, feats = final_feats) ggsave(file = paste0('../output/',model_prefix,"_pdpplot.jpg"), arrangeGrob(grobs = plist, ncol = 3, left = 'Seismogenic'), width = 12, height = 8, dpi=600) ## LIME & SHAP for (well in shap_wells){ lime.explain = LocalModel$new(predictor, x.interest = ml_df[well,], k = length(final_feats)) p1 = ggplot(lime.explain$results %>% filter(.class == 'X1')) + geom_col(aes(x = feature, y = effect)) + labs(x = 'Feature', y = 'LIME Effect') + coord_flip() + theme_minimal() shapley = Shapley$new(predictor, x.interest = ml_df[well,] %>% dplyr::select(final_feats), sample.size = 100) p2 = ggplot(shapley$results %>% filter(class == 'X1')) + geom_col(aes(x = feature, y = phi)) + labs(x = 'Feature', y = 'SHAP Phi') + coord_flip() + theme_minimal() ggsave(file = paste0('../output/',model_prefix,"_",well,"_limeshap.jpg"), arrangeGrob(grobs = list(p1,p2), nrow=1), width = 12, height = 4, dpi=600) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.