
library(geosciencebc2019008)
library(mlr)        # makeClassifTask, makeLearner, selectFeatures, etc.
library(tidyverse)

run_date = Sys.Date() %>% as.character() %>% str_replace_all("-", "_")
knitr::opts_chunk$set(eval = FALSE)

model_prefix = 'glm_wcfd_geo'

Load prepared + cleaned data

Load the cleaned data (ml_df) from its saved RDS object.

target = 'seismogenic'

aux_cols <- c(
  "unique_surv_id", "wa_num", "drilling_event", "ground_elevtn",
  "mean_easting", "mean_northing", "on_prod_date", "last_reported_date",
  "cum_gas_to_date_e3m3", "cum_oil_to_date_m3", "cum_water_to_date_m3",
  "cum_cond_to_date_m3", "frac_start_date", "frac_end_date", "min_dist_well", "calc_completed_length_m")

geology_input_cols <- c(
  "distance_all_faults_berger_m", "distance_divergent_faults_berger_m",
  "distance_listric_faults_berger_m", "distance_normal_faults_berger_m",
  "distance_strike_slip_faults_berger_m", "distance_thrust_faults_berger_m",
  "geothermal_gradient_degc_km", "paleozoic_structure_mss",
  "pressure_depth_ratio_kpa_m", "shmin_grasby", "third_order_residual_m",
  "top_montney_isotherm_degc", "top_montney_structure_mss", "top_montney_tvd_mss"
)

Subset data

ml_df <- read_rds('../wcfd_data/wcfd_mldf.rds') %>%
  dplyr::select(-max_mag, -all_of(aux_cols), -all_of(geology_input_cols))
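
As a quick, optional sanity check (an addition for illustration, not part of the original workflow), the dimensions and class balance of the subset can be confirmed before modelling:

# illustrative check: target column retained, aux/geology columns dropped
dim(ml_df)
table(ml_df[[target]])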

Model Setup

We normalize features to speed up and improve model training, and keep the default mlr learner settings so that feature selection runs against a consistent baseline model.

# normalize features to speed model fitting
norm_ml_df <- normalizeFeatures(ml_df, target = target)

# task (undersample the majority class, keeping 25% of it, to reduce class imbalance)
classif_tsk = makeClassifTask(data = norm_ml_df, target = target) %>%
  undersample(rate = 0.25)

# learner
classif_lrn = makeLearner("classif.logreg", predict.type = "prob")
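
Before running feature selection, it can be useful to record a baseline score for the full feature set. A minimal sketch using mlr's resample() (an addition for illustration, not part of the original workflow):

# baseline 5-fold CV log loss with all remaining features (illustrative)
baseline_res = resample(classif_lrn, classif_tsk, cv5,
                        measures = list(logloss), show.info = FALSE)
baseline_res$aggr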

Filter Based Feature Selection

plot_classif_filt_importance(classif_tsk, paste0('../output/',model_prefix,"_"))


classif_fv = generateFilterValuesData(classif_tsk, 
    method = c("FSelector_information.gain",
               "FSelector_symmetrical.uncertainty"))

classif_fv$data %>% write_csv(paste0('../output/classif_', model_prefix, '_filter_feats.csv'))
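
plot_classif_filt_importance() above is a helper from this package; if it is unavailable, mlr's built-in plotFilterValues() produces a comparable bar chart of the same scores (a sketch, not the author's plot):

# alternative view of the filter scores using mlr's built-in plotting helper
plotFilterValues(classif_fv, n.show = 20)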

Wrapper-based feature importance - SFBS loop

We use sequential floating backward selection (SFBS) for wrapper-based feature selection. This is set up as a loop because the algorithm's convergence is quite sensitive to alpha, the minimum improvement a step must achieve for the search to continue. Five-fold cross-validation, log loss, and default hyperparameters are used, and we iterate over a range of alpha values to make the procedure robust to non-convergence at any single setting.

## wrapper-based feature importance - sequential floating backward selection
alphas = seq(3E-3, 8E-3, length.out = 50)
for (i in seq_along(alphas)){
  alpha = alphas[i]

  classif_feats_seq = suppressWarnings(selectFeatures(
  learner = classif_lrn, task = classif_tsk, resampling = cv5,
  control = makeFeatSelControlSequential(method = 'sfbs',alpha = alpha), 
  measures = list(logloss), show.info = FALSE
  ))

  print(i)
  saveRDS(classif_feats_seq,
          paste0('../output/classif_feats_',model_prefix,'/seq_',i,'.rds')
          )
}
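
Each saved object is an mlr FeatSelResult, so a single run can be inspected directly; a minimal sketch, assuming the first run's file exists:

# inspect one SFBS run (illustrative): $x is the selected feature set,
# $y its cross-validated log loss
res = read_rds(paste0('../output/classif_feats_', model_prefix, '/seq_1.rds'))
res$x
res$y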

Wrapper-based feature importance - SFBS parsing

This chunk parses the wrapper-based sequential feature selection results to identify the most important features and produces a plot.

classif_feats_seq_files = list.files(
  paste0('../output/classif_feats_',model_prefix), 
  pattern = "seq", full.names = TRUE
  )

# stack the optimization paths from every SFBS run into one data frame
for (file in classif_feats_seq_files){
  feat_res = read_rds(file)

  if (file == classif_feats_seq_files[1]){
    path = feat_res$opt.path$env$path
  } else {
    path = rbind(path, feat_res$opt.path$env$path)
  }
}

plot_classif_feat_seq(path, paste0('../output/',model_prefix,"_"), n_best_val = 1000)

# selection frequency: among the 1000 best feature sets (lowest CV log loss),
# the fraction of sets that include each feature
top_df = path %>% 
  dplyr::top_n(-1000, logloss.test.mean) %>%
  dplyr::select(-logloss.test.mean) %>%
  dplyr::summarize_all(sum) %>%
  dplyr::mutate_all(~ . / 1000) %>%
  rownames_to_column() %>% 
  gather(var, value, -rowname) %>%
  spread(rowname, value)

colnames(top_df) <- c('var',paste0('classif_seq_',model_prefix))

top_df %>% write_csv(paste0('../output/classif_', model_prefix, '_seq_feats.csv'))
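
The values written above are selection frequencies: for each feature, the fraction of the 1000 best-scoring feature sets (by cross-validated log loss) that included it. A toy illustration of the same computation on hypothetical data:

# toy example (hypothetical data): 0/1 membership of two features across
# four candidate sets; column means are the selection frequencies
toy_path = tibble(feat_a = c(1, 1, 0, 1), feat_b = c(0, 1, 0, 0))
colMeans(toy_path)  # feat_a: 0.75, feat_b: 0.25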

Wrapper-based feature importance - random loop

We also use random sampling for feature selection. This is more robust to non-convergence than SFBS, but requires many trials: 100 runs of 1,000 random feature sets each, for 100,000 evaluations in total.

## wrapper-based feature importance - random sampling
for (i in 1:100){
  classif_feats_rand = suppressWarnings(selectFeatures(
    learner = classif_lrn, task = classif_tsk, resampling = cv5,
    control = makeFeatSelControlRandom(maxit = 1000), 
    measures = list(logloss), show.info = FALSE
    ))

  print(i)
  saveRDS(
    classif_feats_rand,
    paste0('../output/classif_feats_',model_prefix,'/rand_',i,'.rds')
    )
}

Wrapper-based feature importance - random parsing

This chunk parses the random sampling results.

classif_feats_rand_files = list.files(
  paste0('../output/classif_feats_',model_prefix), pattern = "rand", full.names = TRUE
  )

for (file in classif_feats_rand_files){
  feat_res = read_rds(file)

  if (file == classif_feats_rand_files[1]){
    path = feat_res$opt.path$env$path
  } else {
    path = rbind(path, feat_res$opt.path$env$path)
  }
}

plot_classif_feat_rand(path, paste0('../output/',model_prefix,"_"), n_best_val = 1000)

# selection frequencies among the 1000 best feature sets, as above
top_df = path %>% 
  dplyr::top_n(-1000, logloss.test.mean) %>%
  dplyr::select(-logloss.test.mean) %>%
  dplyr::summarize_all(sum) %>%
  dplyr::mutate_all(~ . / 1000) %>%
  rownames_to_column() %>% 
  gather(var, value, -rowname) %>%
  spread(rowname, value)

colnames(top_df) <- c('var',paste0('classif_rand_',model_prefix))

top_df %>% write_csv(paste0('../output/classif_', model_prefix, '_rand_feats.csv'))
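
To compare the two wrapper-based rankings side by side, the saved CSVs can be joined on the variable name; a sketch assuming the column layouts written above:

# illustrative comparison of the SFBS and random-sampling selection frequencies
seq_feats  = read_csv(paste0('../output/classif_', model_prefix, '_seq_feats.csv'))
rand_feats = read_csv(paste0('../output/classif_', model_prefix, '_rand_feats.csv'))
dplyr::inner_join(seq_feats, rand_feats, by = 'var') %>%
  dplyr::arrange(dplyr::desc(.data[[paste0('classif_seq_', model_prefix)]]))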

