```r
library(geosciencebc2019008)
library(tidyverse)  # pipe plus the stringr/dplyr/tidyr verbs used below

# date stamp for output file names
run_date <- Sys.Date() %>% str_replace_all("-", "_")

# chunks in this document are not evaluated when it is knit
knitr::opts_chunk$set(eval = FALSE)

model_prefix <- "glm_bcogc_geo"
```
Load the cleaned data (`ml_df`) from an RDS object, dropping the auxiliary identifier columns and the geological input columns.
```r
target <- "seismogenic"

# identifier / bookkeeping columns that should not enter the model
aux_cols <- c(
  "unique_surv_id", "wa_num", "drilling_event", "ground_elevtn",
  "mean_easting", "mean_northing", "on_prod_date", "last_reported_date",
  "cum_gas_to_date_e3m3", "cum_oil_to_date_m3", "cum_water_to_date_m3",
  "cum_cond_to_date_m3", "frac_start_date", "frac_end_date", "min_dist_well"
)

# geological input columns dropped from this model
geology_input_cols <- c(
  "distance_all_faults_berger_m", "distance_divergent_faults_berger_m",
  "distance_listric_faults_berger_m", "distance_normal_faults_berger_m",
  "distance_strike_slip_faults_berger_m", "distance_thrust_faults_berger_m",
  "geothermal_gradient_degc_km", "paleozoic_structure_mss",
  "pressure_depth_ratio_kpa_m", "shmin_grasby", "third_order_residual_m",
  "top_montney_isotherm_degc", "top_montney_structure_mss",
  "top_montney_tvd_mss"
)

ml_df <- read_rds("../output/bcogc_mldf.rds") %>%
  dplyr::select(-max_mag, -all_of(aux_cols), -all_of(geology_input_cols))
```
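Before building the task, it can help to look at the class balance of the target, since the undersampling rate chosen in the next chunk depends on it. A minimal sketch, not part of the original workflow (`.data[[target]]` simply resolves the target column defined above):

```r
# tally the seismogenic / non-seismogenic classes in the cleaned data
ml_df %>% dplyr::count(.data[[target]])
```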
We normalize the features to speed up and improve model training, and undersample the majority class (rate = 0.25) to reduce the class imbalance. We use the default mlr learner parameters in order to provide a consistent model for feature selection.
```r
# normalize features to speed model fitting
norm_ml_df <- normalizeFeatures(ml_df, target = target)

# classification task, undersampled to reduce class imbalance
classif_tsk <- makeClassifTask(data = norm_ml_df, target = target) %>%
  undersample(rate = 0.25)

# logistic regression learner with probability predictions
classif_lrn <- makeLearner("classif.logreg", predict.type = "prob")
```
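Before running the expensive wrapper searches, a quick resample can confirm the task and learner are wired up correctly. This is an optional sketch, not part of the original workflow; `cv5` and `logloss`/`auc` are mlr's built-in resampling description and measures:

```r
# baseline five-fold CV performance of the full-feature logistic regression
baseline <- resample(classif_lrn, classif_tsk, cv5,
                     measures = list(logloss, auc), show.info = FALSE)
baseline$aggr  # aggregated logloss.test.mean and auc.test.mean
```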
Filter-based feature importance is computed first, using information gain and symmetrical uncertainty scores.

```r
plot_classif_filt_importance(classif_tsk, paste0("../output/", model_prefix, "_"))

classif_fv <- generateFilterValuesData(
  classif_tsk,
  method = c("FSelector_information.gain", "FSelector_symmetrical.uncertainty")
)

classif_fv$data %>%
  write_csv(paste0("../output/classif_", model_prefix, "_filter_feats.csv"))
```
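mlr can also plot the filter scores directly; a minimal sketch using the filter values computed above (the `n.show` value is illustrative):

```r
# bar chart of the top-ranked features for each filter method
plotFilterValues(classif_fv, n.show = 15)
```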
We use sequential floating backward search (`sfbs`) for feature selection. It is set up as a loop because the algorithm's convergence is quite sensitive to the alpha value, so we iterate over a range of alpha values to make the procedure more robust to non-convergence. Five-fold cross-validation, log loss, and default hyperparameters are used throughout.
```r
## wrapper-based feature importance - sequential floating backward search
alphas <- seq(3e-3, 8e-3, length.out = 50)
for (i in seq_along(alphas)) {
  classif_feats_seq <- suppressWarnings(selectFeatures(
    learner = classif_lrn,
    task = classif_tsk,
    resampling = cv5,
    control = makeFeatSelControlSequential(method = "sfbs", alpha = alphas[i]),
    measures = list(logloss),
    show.info = FALSE
  ))
  print(i)
  saveRDS(classif_feats_seq,
          paste0("../output/classif_feats_", model_prefix, "/seq_", i, ".rds"))
}
```
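Each saved object is an mlr `FeatSelResult`. A quick way to inspect a single run (a sketch using the first file written above):

```r
# `x` holds the selected feature names, `y` the optimal cross-validated log loss
feat_res <- readRDS(paste0("../output/classif_feats_", model_prefix, "/seq_1.rds"))
feat_res$x
feat_res$y
```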
This chunk parses the wrapper-based sequential feature-selection results, computes how often each feature appears among the 1,000 best-scoring feature sets, and produces a plot.
```r
classif_feats_seq_files <- list.files(
  paste0("../output/classif_feats_", model_prefix),
  pattern = "seq", full.names = TRUE
)

# bind the optimization paths of all runs into one data frame
for (file in classif_feats_seq_files) {
  feat_res <- read_rds(file)
  if (file == classif_feats_seq_files[1]) {
    path <- feat_res$opt.path$env$path
  } else {
    path <- rbind(path, feat_res$opt.path$env$path)
  }
}

plot_classif_feat_seq(path, paste0("../output/", model_prefix, "_"), n_best_val = 1000)

# fraction of the 1,000 best-scoring feature sets that include each feature
top_df <- path %>%
  dplyr::top_n(-1000, logloss.test.mean) %>%
  dplyr::select(-logloss.test.mean) %>%
  dplyr::summarize_all(sum) %>%
  dplyr::mutate_all(function(x, n) x / n, n = 1000) %>%
  rownames_to_column() %>%
  gather(var, value, -rowname) %>%
  spread(rowname, value)
colnames(top_df) <- c("var", paste0("classif_seq_", model_prefix))

top_df %>%
  write_csv(paste0("../output/classif_", model_prefix, "_seq_feats.csv"))
```
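Note that `top_n()`, `summarize_all()`/`mutate_all()`, and `gather()`/`spread()` are superseded in current dplyr/tidyr; a sketch of the same aggregation with their replacements (`n_best` mirrors the 1,000 used above):

```r
n_best <- 1000
path %>%
  dplyr::slice_min(logloss.test.mean, n = n_best, with_ties = FALSE) %>%
  dplyr::select(-logloss.test.mean) %>%
  dplyr::summarise(dplyr::across(dplyr::everything(), ~ sum(.x) / n_best)) %>%
  tidyr::pivot_longer(dplyr::everything(), names_to = "var",
                      values_to = paste0("classif_seq_", model_prefix))
```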
We also use random search for feature selection. This is more robust to non-convergence but requires many trials: 100 runs of 1,000 iterations each, i.e. 100,000 candidate feature sets in total.
```r
## wrapper-based feature importance - random search
for (i in 1:100) {
  classif_feats_rand <- suppressWarnings(selectFeatures(
    learner = classif_lrn,
    task = classif_tsk,
    resampling = cv5,
    control = makeFeatSelControlRandom(maxit = 1000),
    measures = list(logloss),
    show.info = FALSE
  ))
  print(i)
  saveRDS(classif_feats_rand,
          paste0("../output/classif_feats_", model_prefix, "/rand_", i, ".rds"))
}
```
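Because the runs are independent and each one repeats a five-fold resample many times, the search parallelises well. One option is mlr's parallelMap integration; a sketch for a single run, where the package, level, and core count are assumptions rather than part of the original workflow:

```r
library(parallelMap)

# parallelise the inner cross-validation of each selectFeatures() call
parallelStartSocket(cpus = 4, level = "mlr.resample")  # core count is illustrative
classif_feats_rand <- suppressWarnings(selectFeatures(
  learner = classif_lrn,
  task = classif_tsk,
  resampling = cv5,
  control = makeFeatSelControlRandom(maxit = 1000),
  measures = list(logloss),
  show.info = FALSE
))
parallelStop()
```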
This chunk parses the random-search results in the same way as the sequential results above.
```r
classif_feats_rand_files <- list.files(
  paste0("../output/classif_feats_", model_prefix),
  pattern = "rand", full.names = TRUE
)

# bind the optimization paths of all runs into one data frame
for (file in classif_feats_rand_files) {
  feat_res <- read_rds(file)
  if (file == classif_feats_rand_files[1]) {
    path <- feat_res$opt.path$env$path
  } else {
    path <- rbind(path, feat_res$opt.path$env$path)
  }
}

plot_classif_feat_rand(path, paste0("../output/", model_prefix, "_"), n_best_val = 1000)

# fraction of the 1,000 best-scoring feature sets that include each feature
top_df <- path %>%
  dplyr::top_n(-1000, logloss.test.mean) %>%
  dplyr::select(-logloss.test.mean) %>%
  dplyr::summarize_all(sum) %>%
  dplyr::mutate_all(function(x, n) x / n, n = 1000) %>%
  rownames_to_column() %>%
  gather(var, value, -rowname) %>%
  spread(rowname, value)
colnames(top_df) <- c("var", paste0("classif_rand_", model_prefix))

top_df %>%
  write_csv(paste0("../output/classif_", model_prefix, "_rand_feats.csv"))
```
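With both summary CSVs written, the sequential and random-search selection frequencies can be compared directly. A sketch joining the two outputs produced above:

```r
seq_feats  <- read_csv(paste0("../output/classif_", model_prefix, "_seq_feats.csv"))
rand_feats <- read_csv(paste0("../output/classif_", model_prefix, "_rand_feats.csv"))

# features most consistently retained by the sequential search, with the
# random-search frequency alongside for comparison
seq_feats %>%
  dplyr::left_join(rand_feats, by = "var") %>%
  dplyr::arrange(dplyr::desc(.data[[paste0("classif_seq_", model_prefix)]]))
```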