```r
library(geosciencebc2019008)
library(tidyverse)  # %>%, stringr, readr; attached here since it is used below
library(mlr)        # task/learner/feature-selection functions used throughout

run_date     <- Sys.Date() %>% str_replace_all("-", "_")
model_prefix <- 'glm_wcfd_geo'
```
This chunk prepares our dataframe for clustering, statistical modelling, and other data science techniques. It specifies auxiliary columns (kept for reference only), predictor columns, and our target column(s): seismogenic (TRUE/FALSE) and maximum magnitude (numeric).
For the regression problem, we focus exclusively on wells that have generated earthquakes. The dataset is therefore considerably smaller, but non-seismogenic wells provide no information about the maximum magnitude of earthquake a well can generate.
target = "max_mag" aux_cols <- c( "unique_surv_id", "wa_num", "drilling_event", "ground_elevtn", "mean_easting", "mean_northing", "on_prod_date", "last_reported_date", "cum_gas_to_date_e3m3", "cum_oil_to_date_m3", "cum_water_to_date_m3", "cum_cond_to_date_m3", "frac_start_date", "frac_end_date", "min_dist_well", "calc_completed_length_m") geology_input_cols <- c("distance_all_faults_berger_m", "distance_divergent_faults_berger_m", "distance_listric_faults_berger_m", "distance_normal_faults_berger_m", "distance_strike_slip_faults_berger_m", "distance_thrust_faults_berger_m", "geothermal_gradient_degc_km", "paleozoic_structure_mss", "pressure_depth_ratio_kpa_m", "shmin_grasby", "third_order_residual_m", "top_montney_isotherm_degc", "top_montney_structure_mss", "top_montney_tvd_mss")
```r
# load prepared data, keep only seismogenic wells, and drop the
# classification target plus the auxiliary and geology columns
ml_df <- read_rds('../wcfd_data/wcfd_mldf.rds') %>%
  dplyr::filter(seismogenic == 1) %>%
  dplyr::select(-seismogenic, -all_of(aux_cols), -all_of(geology_input_cols))
```
```r
# normalize features to speed model fitting
norm_ml_df <- normalizeFeatures(ml_df, target = target)

# task
regr_tsk <- makeRegrTask(data = norm_ml_df, target = target)

# learner
regr_lrn <- makeLearner("regr.glm")
```
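Before running feature selection, it can be worth confirming the task looks as expected. This quick check is not part of the original workflow, just a minimal sketch using standard mlr accessors:

```r
# Optional sanity check (our addition, not in the original pipeline):
# confirm the task has the intended target and feature count.
print(regr_tsk)                      # task summary: type, target, size
getTaskNFeats(regr_tsk)              # number of predictor columns
head(getTaskFeatureNames(regr_tsk))  # first few feature names
```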
```r
plot_regr_filt_importance(regr_tsk, paste0('../output/', model_prefix, "_"))

regr_fv <- generateFilterValuesData(
  regr_tsk,
  method = c("FSelector_information.gain", "FSelector_symmetrical.uncertainty"))

regr_fv$data %>%
  write_csv(paste0('../output/regr_', model_prefix, "_", 'filter_feats.csv'))
```
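mlr also ships a built-in plot for filter values, which can be handy for a quick look before the CSV export. A minimal sketch (the `n.show = 20` cutoff is an arbitrary illustrative choice, not a value from the original analysis):

```r
# Quick built-in visualization of the filter scores computed above.
plotFilterValues(regr_fv, sort = "dec", n.show = 20)
```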
```r
## wrapper-based feature importance - sequential floating backward search (sfbs)
alphas <- seq(9.45e-4, 9.9e-4, length.out = 20)
for (i in seq_along(alphas)) {
  regr_feats_seq <- suppressWarnings(selectFeatures(
    learner    = regr_lrn,
    task       = regr_tsk,
    resampling = cv5,
    control    = makeFeatSelControlSequential(method = 'sfbs',
                                              alpha = alphas[i],
                                              maxit = 4000),
    measures   = list(mae, rmse),
    show.info  = FALSE
  ))
  print(i)
  saveRDS(regr_feats_seq,
          paste0('../output/regr_feats_', model_prefix, '/seq_', i, '.rds'))
}
```
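Each saved object is an mlr `FeatSelResult`, so a single run can be inspected directly before the aggregation below. A sketch (not part of the original script; the file index `1` is illustrative):

```r
# Inspect one saved feature-selection run.
res <- readRDS(paste0('../output/regr_feats_', model_prefix, '/seq_1.rds'))
res$x  # character vector of the selected features
res$y  # aggregated mae / rmse of the selected feature set
```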
This chunk parses the wrapper-based sequential feature selection results to rank the most important features and produce a plot.
```r
regr_feats_seq_files <- list.files(
  paste0('../output/regr_feats_', model_prefix),
  pattern = "seq", full.names = TRUE)

# stack the optimization paths from every run
for (file in regr_feats_seq_files) {
  feat_res <- read_rds(file)
  if (file == regr_feats_seq_files[1]) {
    path <- feat_res$opt.path$env$path
  } else {
    path <- rbind(path, feat_res$opt.path$env$path)
  }
}

plot_regr_feat_seq(path, paste0('../output/', model_prefix, "_"),
                   n_best_val = 1000)

# selection frequency of each feature among the 1000 best feature sets
top_df <- path %>%
  dplyr::mutate(mean_error = (mae.test.mean + rmse.test.rmse) / 2) %>%
  dplyr::top_n(-1000, mean_error) %>%
  dplyr::select(-mean_error, -mae.test.mean, -rmse.test.rmse) %>%
  dplyr::summarize_all(sum) %>%
  dplyr::mutate_all(function(x, n) {x / n}, n = 1000) %>%
  rownames_to_column() %>%
  gather(var, value, -rowname) %>%
  spread(rowname, value)

colnames(top_df) <- c('var', paste0('regr_seq_', model_prefix))

top_df %>%
  write_csv(paste0('../output/regr_', model_prefix, "_", 'seq_feats.csv'))
```
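On newer tidyverse versions the `gather()`/`spread()` transposition above can be written more directly as a column-wise mean followed by a pivot. A minimal sketch that should yield the same selection frequencies (the `freq_df` and `selection_freq` names are ours, for illustration):

```r
# Equivalent, more direct computation: the mean of each 0/1 feature
# indicator over the 1000 lowest-error rows is its selection frequency.
freq_df <- path %>%
  dplyr::mutate(mean_error = (mae.test.mean + rmse.test.rmse) / 2) %>%
  dplyr::slice_min(mean_error, n = 1000, with_ties = FALSE) %>%
  dplyr::select(-mean_error, -mae.test.mean, -rmse.test.rmse) %>%
  dplyr::summarize(dplyr::across(dplyr::everything(), mean)) %>%
  tidyr::pivot_longer(dplyr::everything(),
                      names_to = "var", values_to = "selection_freq")
```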
We also use random sampling for feature selection. This approach is more robust to non-convergence but requires many trials: we run 100 searches of 1,000 iterations each, for 100,000 evaluated feature sets in total.
```r
## wrapper-based feature importance - random sampling
for (i in 1:100) {
  regr_feats_rand <- suppressWarnings(selectFeatures(
    learner    = regr_lrn,
    task       = regr_tsk,
    resampling = cv5,
    control    = makeFeatSelControlRandom(maxit = 1000),
    measures   = list(mae, rmse),
    show.info  = FALSE
  ))
  print(i)
  saveRDS(regr_feats_rand,
          paste0('../output/regr_feats_', model_prefix, '/rand_', i, '.rds'))
}
```
This chunk parses the random sampling results.
```r
regr_feats_rand_files <- list.files(
  paste0('../output/regr_feats_', model_prefix),
  pattern = "rand", full.names = TRUE)

# stack the optimization paths from every run
for (file in regr_feats_rand_files) {
  feat_res <- read_rds(file)
  if (file == regr_feats_rand_files[1]) {
    path <- feat_res$opt.path$env$path
  } else {
    path <- rbind(path, feat_res$opt.path$env$path)
  }
}

plot_regr_feat_rand(path, paste0('../output/', model_prefix, "_"),
                    n_best_val = 1000)

# selection frequency of each feature among the 1000 best feature sets
top_df <- path %>%
  dplyr::mutate(mean_error = (mae.test.mean + rmse.test.rmse) / 2) %>%
  dplyr::top_n(-1000, mean_error) %>%
  dplyr::select(-mean_error, -mae.test.mean, -rmse.test.rmse) %>%
  dplyr::summarize_all(sum) %>%
  dplyr::mutate_all(function(x, n) {x / n}, n = 1000) %>%
  rownames_to_column() %>%
  gather(var, value, -rowname) %>%
  spread(rowname, value)

colnames(top_df) <- c('var', paste0('regr_rand_', model_prefix))

top_df %>%
  write_csv(paste0('../output/regr_', model_prefix, "_", 'rand_feats.csv'))
```
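A natural next step, once the selection frequencies are in hand, is to refit the GLM on only the frequently selected features and cross-validate it. This is a sketch, not part of the original script: the 0.5 selection-frequency threshold is an assumed illustrative cutoff.

```r
# Refit on frequently selected features and estimate error with 5-fold CV.
# The 0.5 cutoff on selection frequency is an assumption for illustration.
top_feats <- top_df$var[top_df[[2]] > 0.5]
sub_tsk   <- subsetTask(regr_tsk, features = top_feats)
cv_res    <- resample(regr_lrn, sub_tsk, resampling = cv5,
                      measures = list(mae, rmse))
cv_res$aggr  # aggregated mae / rmse over the folds
```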