Geoscience BC Study Code - Machine Learning Wrapper

library(geosciencebc2019008)
library(mlr)        # makeRegrTask, makeLearner, selectFeatures, cv5, ...
library(tidyverse)  # %>%, stringr, readr, dplyr helpers used throughout

# run date, formatted for use in file names
run_date = Sys.Date() %>% str_replace_all(., "-", "_")

model_prefix = 'glm_wcfd_geo'

ML Dataframe Prep

This chunk prepares our dataframe for clustering, statistical modelling, and other data science techniques. It specifies auxiliary columns (those kept for information only), predictor columns, and our target column(s): seismogenic (T/F) and maximum magnitude (numeric).

For the regression problem, we focus exclusively on wells that have generated earthquakes. The dataset is therefore considerably smaller, but non-seismogenic wells provide no information about the maximum magnitude of earthquake a well can generate.

target = "max_mag"

aux_cols <- c(
  "unique_surv_id", "wa_num", "drilling_event", "ground_elevtn",
  "mean_easting", "mean_northing", "on_prod_date",
  "last_reported_date", "cum_gas_to_date_e3m3", "cum_oil_to_date_m3",
  "cum_water_to_date_m3", "cum_cond_to_date_m3", "frac_start_date",
  "frac_end_date", "min_dist_well", "calc_completed_length_m")

geology_input_cols <- c("distance_all_faults_berger_m",
"distance_divergent_faults_berger_m", "distance_listric_faults_berger_m", 
"distance_normal_faults_berger_m", "distance_strike_slip_faults_berger_m", 
"distance_thrust_faults_berger_m", "geothermal_gradient_degc_km", 
"paleozoic_structure_mss", "pressure_depth_ratio_kpa_m", "shmin_grasby", 
"third_order_residual_m", "top_montney_isotherm_degc", "top_montney_structure_mss", 
"top_montney_tvd_mss")

Load and subset data

# load prepared data, keep only seismogenic wells, and drop the target flag
# plus the auxiliary and geology columns from the predictor set
ml_df <- read_rds('../wcfd_data/wcfd_mldf.rds') %>%
  dplyr::filter(seismogenic == 1) %>%
  dplyr::select(-seismogenic, -all_of(aux_cols), -all_of(geology_input_cols))
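
As a quick sanity check (not part of the original workflow), the size of the seismogenic-only subset and the spread of the target could be inspected before modelling:

# hypothetical spot-check of the regression subset
dim(ml_df)
summary(ml_df$max_mag)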

Feature Selection

# normalize features to speed model fitting
norm_ml_df <- normalizeFeatures(ml_df, target = target)
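# note: normalizeFeatures() standardizes numeric predictors by default
# (method = "standardize"); the target column named above is excluded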

# task
regr_tsk = makeRegrTask(data = norm_ml_df, target = target)

# learner
regr_lrn = makeLearner("regr.glm")

Filter-based feature importance

plot_regr_filt_importance(regr_tsk,
                          paste0('../output/', model_prefix, "_"))


# filter scores from two information-theoretic methods
regr_fv = generateFilterValuesData(regr_tsk,
    method = c("FSelector_information.gain",
               "FSelector_symmetrical.uncertainty"))

regr_fv$data %>% write_csv(., paste0('../output/regr_', model_prefix, '_filter_feats.csv'))
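
For a quick visual check of these scores, mlr's built-in plotFilterValues() could be used; a minimal sketch, not part of the original script:

# optional: bar chart of the top-scoring features per filter method
plotFilterValues(regr_fv, n.show = 20)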

Wrapper-based feature importance - SFBS loop

We also run wrapper-based feature selection using sequential floating backward search (sfbs), sweeping the improvement threshold alpha over a narrow range and saving each run to disk, since the selected subset can depend on alpha.

## wrapper-based feature importance - sequential floating backward search (sfbs)
# create the output directory if it does not already exist
dir.create(paste0('../output/regr_feats_', model_prefix),
           showWarnings = FALSE, recursive = TRUE)

alphas = seq(9.45E-4, 9.9E-4, length.out = 20)
for (i in seq_along(alphas)){
  regr_feats_seq = suppressWarnings(selectFeatures(
    learner = regr_lrn, task = regr_tsk, resampling = cv5,
    control = makeFeatSelControlSequential(
      method = 'sfbs', alpha = alphas[i], maxit = 4000),
    measures = list(mae, rmse), show.info = FALSE
    ))

  print(i)
  saveRDS(
    regr_feats_seq,
    paste0('../output/regr_feats_', model_prefix, '/seq_', i, '.rds')
    )
}
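
Each saved object is an mlr FeatSelResult; as a quick illustration (not part of the original script), the selected subset and its cross-validated error can be read off directly:

# selected features and resampled performance for the most recent run
regr_feats_seq$x  # character vector of chosen features
regr_feats_seq$y  # named vector: mae.test.mean, rmse.test.rmse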

Wrapper-based feature importance - SFBS parsing

This chunk parses the wrapper-based sequential feature selection results to calculate the most important features and produce a plot.

regr_feats_seq_files = list.files(
  paste0('../output/regr_feats_',model_prefix), 
  pattern = "seq", full.names = TRUE
  )

# combine the optimization paths from all saved sfbs runs into one data frame
path = purrr::map_dfr(
  regr_feats_seq_files,
  function(file) read_rds(file)$opt.path$env$path
  )

plot_regr_feat_seq(path, paste0('../output/',model_prefix,"_"), n_best_val = 1000)

# keep the 1000 feature sets with the lowest mean error (average of MAE and
# RMSE), then compute the fraction of those sets that includes each feature
top_df = path %>%
  dplyr::mutate(mean_error = (mae.test.mean + rmse.test.rmse)/2) %>%
  dplyr::top_n(., -1000, mean_error) %>%
  dplyr::select(-mean_error, -mae.test.mean, -rmse.test.rmse) %>%
  dplyr::summarize_all(., sum) %>%
  dplyr::mutate_all(., function(x, n) {x/n}, n = 1000) %>%
  # transpose: one row per feature
  rownames_to_column() %>%
  gather(var, value, -rowname) %>%
  spread(rowname, value)

colnames(top_df) <- c('var', paste0('regr_seq_', model_prefix))

top_df %>% write_csv(., paste0('../output/regr_', model_prefix, '_seq_feats.csv'))
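
The resulting table holds one selection frequency per feature; a hypothetical follow-up, ranking features by that frequency:

# hypothetical: rank features by how often they appear in the 1000 best subsets
top_df %>%
  dplyr::arrange(dplyr::desc(.data[[paste0('regr_seq_', model_prefix)]]))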

Wrapper-based feature importance - random loop

We also use random sampling for feature selection. This is more robust to non-convergence, but requires many trials: here, 100 runs of 1,000 iterations each, for 100,000 candidate feature sets in total.

## wrapper-based feature importance - random sampling
for (i in 1:100){
  regr_feats_rand = suppressWarnings(selectFeatures(
    learner = regr_lrn, task = regr_tsk, resampling = cv5,
    control = makeFeatSelControlRandom(maxit = 1000), 
    measures = list(mae, rmse), show.info = FALSE
    ))

  print(i)
  saveRDS(
    regr_feats_rand, 
    paste0('../output/regr_feats_',model_prefix,'/rand_',i,'.rds')
    )
}

Wrapper-based feature importance - random parsing

This chunk parses the random sampling results.

regr_feats_rand_files = list.files(
  paste0('../output/regr_feats_',model_prefix), 
  pattern = "rand",
  full.names = TRUE
  )

# combine the optimization paths from all saved random-sampling runs
path = purrr::map_dfr(
  regr_feats_rand_files,
  function(file) read_rds(file)$opt.path$env$path
  )

plot_regr_feat_rand(path, paste0('../output/',model_prefix,"_"), n_best_val = 1000)

# as above: keep the 1000 best feature sets (lowest mean of MAE and RMSE)
# and compute per-feature selection frequencies
top_df = path %>%
  dplyr::mutate(mean_error = (mae.test.mean + rmse.test.rmse)/2) %>%
  dplyr::top_n(., -1000, mean_error) %>%
  dplyr::select(-mean_error, -mae.test.mean, -rmse.test.rmse) %>%
  dplyr::summarize_all(., sum) %>%
  dplyr::mutate_all(., function(x, n) {x/n}, n = 1000) %>%
  rownames_to_column() %>%
  gather(var, value, -rowname) %>%
  spread(rowname, value)

colnames(top_df) <- c('var',paste0('regr_rand_',model_prefix))

top_df %>% write_csv(., paste0('../output/regr_', model_prefix, '_rand_feats.csv'))
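
The SFBS and random-sampling tables share the var column, so they could be joined for a side-by-side comparison of selection frequencies; a hypothetical sketch, not part of the original script:

# hypothetical: compare sfbs and random selection frequencies per feature
seq_freq  <- read_csv(paste0('../output/regr_', model_prefix, '_seq_feats.csv'))
rand_freq <- read_csv(paste0('../output/regr_', model_prefix, '_rand_feats.csv'))
dplyr::full_join(seq_freq, rand_freq, by = 'var')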

