library(geosciencebc2019008) run_date = Sys.Date() %>% str_replace_all(., "-", "_") knitr::opts_chunk$set(eval = FALSE)
This chunk loads our prepared data (see prepare_data.Rmd
)
saf_wells_sf <- read_rds('../output/bcogc_saf_well_data.rds') geologic_input_data = FALSE
This chunk prepares our dataframe for the data science workflows (exploratory data analysis, feature selection, model generation, and model interpretation). It specifies auxillary columns (those useful for information), predictor columns, and our target column(s), which for classificatio is the seimogenic (T/F) association. Some of the columns, filtering, and imputation are specific for the classification problem, although the bits of code are very similar.
aux_cols <- c( "unique_surv_id", "wa_num", "drilling_event", "ground_elevtn", "mean_easting", "mean_northing", "survey_well_type", "on_prod_date", "last_reported_date", "cum_gas_to_date_e3m3", "cum_oil_to_date_m3", "cum_water_to_date_m3", "cum_cond_to_date_m3", "frac_start_date", "frac_end_date", "min_dist_well") removed_inputs <- c( "no_prior_wells", "no_surrounding_wells", "interval_clusters_bool", "base_fluid_cat", "hybrid_frac_bool", "skipped_stages", "stim_company") numeric_input_cols <- c( "mean_tvd", "n_stages", "calc_completed_length_m", "mean_proppant_per_stage_t", "sd_proppant_per_stage_t", "max_proppant_per_stage_t", "calc_total_proppant_t", "mean_fluid_per_stage_m3", "sd_fluid_per_stage_m3", "max_fluid_per_stage_m3", "calc_total_fluid_m3", "avg_stage_length", "avg_stage_spacing", "max_rate_m3_min", "mean_rate_m3_min", "max_stage_duration_min", "mean_stage_duration_min", "max_isip_mpa", "sd_isip_mpa", "mean_isip_mpa", "frac_duration_days", "calc_proppant_intensity_kg_m", "calc_fluid_intensity_m3_m", "min_midpoint_dist", "horiz_wells_in_1km", "horiz_wells_in_5km", "horiz_wells_in_10km", "horiz_wells_in_25km") if (geologic_input_data == TRUE){ numeric_input_cols = c( numeric_input_cols, "max_treating_mpa", "sd_treating_mpa", "mean_treating_mpa", "number_sand_off_screen_out_stages", "mean_intervals_per_stage", "max_breakdown_mpa", "sd_breakdown_mpa", "mean_breakdown_mpa", "total_gas_injected_m3", "breakdown_isip_ratio" ) } geology_input_cols <- c("distance_all_faults_berger_m", "distance_divergent_faults_berger_m", "distance_listric_faults_berger_m", "distance_normal_faults_berger_m", "distance_strike_slip_faults_berger_m", "distance_thrust_faults_berger_m", "geothermal_gradient_degc_km", "paleozoic_structure_mss", "pressure_depth_ratio_kpa_m", "shmin_grasby", "third_order_residual_m", "top_montney_isotherm_degc", "top_montney_structure_mss", "top_montney_tvd_mss") fact_input_cols <- c( "viscosity_cat", "energizer_bool", "proppant_cat") if (geologic_input_data == TRUE){ fact_input_cols = c( fact_input_cols, "cased_well_bool", "interference_bool" ) } targets <- c("seismogenic", "max_mag") ml_df <- saf_wells_sf %>% st_drop_geometry() %>% dplyr::select(aux_cols, numeric_input_cols, fact_input_cols, geology_input_cols, targets) %>% dplyr::mutate_if(is.logical, as.numeric) %>% dplyr::filter( max_stage_duration_min < 10000, min_midpoint_dist < 25000, max_isip_mpa < 200, shmin_grasby > 0) %>% dplyr::filter( !is.na(mean_proppant_per_stage_t), !is.na(mean_fluid_per_stage_m3), !is.na(mean_isip_mpa), mean_tvd < 0, min_midpoint_dist >0) %>% dplyr::mutate(seismogenic = as.factor(seismogenic)) if (geologic_input_data == TRUE){ ml_df = ml_df %>% dplyr::filter(max_treating_mpa < 200) %>% dplyr::filter(max_breakdown_mpa < 200) %>% dplyr::filter(!is.na(mean_breakdown_mpa)) ml_df = impute(ml_df, cols = list( sd_treating_mpa = imputeMean(), mean_treating_mpa = imputeMean(), sd_treating_mpa = imputeMean(), mean_intervals_per_stage = imputeMean(), sd_breakdown_mpa = imputeMean() )) ml_df[!is.finite(ml_df$max_treating_mpa),'max_treating_mpa'] <- 0 } # impute non-causal features ml_df = impute(ml_df, cols = list( sd_proppant_per_stage_t = imputeMean(), sd_fluid_per_stage_m3 = imputeMean(), avg_stage_length = imputeMean(), avg_stage_spacing = imputeMean(), mean_rate_m3_min = imputeMean(), mean_stage_duration_min = imputeMean(), sd_isip_mpa = imputeMean() ))$data # one hot encode categorical variables ml_df <- createDummyFeatures( ml_df, target = "seismogenic", cols = c("viscosity_cat","proppant_cat") ) %>% removeConstantFeatures(.) ml_df[!is.finite(ml_df$max_rate_m3_min),'max_rate_m3_min'] <- 0 ml_df[!is.finite(ml_df$max_stage_duration_min),'max_stage_duration_min'] <- 0
We use a boxplot, correlation plot, and PCA analysis to explore our data prior to feature selection. These functions are located in plot_functions.R.
# scatter plot of variables with seismogenic target target_boxplot(ml_df, output_path = '../output/') # pairs plot of variables with max magnitude target ml_corrplot(ml_df, '../output/', numeric_input_cols, target) # pca plot pca_scree_plot(ml_df, '../output/') pca_unit_circle_plot(ml_df, '../output/')
write_csv(ml_df, paste0("../output/","bcgoc_mldf.csv")) write_rds(ml_df, paste0("../output/","bcogc_mldf.rds"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.