Geoscience BC Study Code - Machine Learning Wrapper

library(geosciencebc2019008)
run_date = Sys.Date() %>% str_replace_all(., "-", "_")
knitr::opts_chunk$set(eval = FALSE)

Load prepared data

This chunk loads our prepared data (see prepare_data.Rmd)

saf_wells_sf <- read_rds('../output/bcogc_saf_well_data.rds')
geologic_input_data = FALSE

ML Dataframe Prep

This chunk prepares our dataframe for the data science workflows (exploratory data analysis, feature selection, model generation, and model interpretation). It specifies auxillary columns (those useful for information), predictor columns, and our target column(s), which for classificatio is the seimogenic (T/F) association. Some of the columns, filtering, and imputation are specific for the classification problem, although the bits of code are very similar.

aux_cols <- c(
  "unique_surv_id", "wa_num", "drilling_event", "ground_elevtn",
  "mean_easting", "mean_northing", "survey_well_type", 
  "on_prod_date", "last_reported_date", "cum_gas_to_date_e3m3", 
  "cum_oil_to_date_m3", "cum_water_to_date_m3", "cum_cond_to_date_m3",
  "frac_start_date", "frac_end_date", "min_dist_well")

removed_inputs <- c(
  "no_prior_wells", "no_surrounding_wells",
  "interval_clusters_bool", "base_fluid_cat",
  "hybrid_frac_bool", "skipped_stages", "stim_company")

numeric_input_cols <- c(
  "mean_tvd", "n_stages", "calc_completed_length_m",
  "mean_proppant_per_stage_t", "sd_proppant_per_stage_t", 
  "max_proppant_per_stage_t", "calc_total_proppant_t", 
  "mean_fluid_per_stage_m3",   "sd_fluid_per_stage_m3", 
  "max_fluid_per_stage_m3", "calc_total_fluid_m3",
  "avg_stage_length", "avg_stage_spacing", "max_rate_m3_min", 
  "mean_rate_m3_min", "max_stage_duration_min", "mean_stage_duration_min",
  "max_isip_mpa", "sd_isip_mpa", "mean_isip_mpa", "frac_duration_days", 
  "calc_proppant_intensity_kg_m", "calc_fluid_intensity_m3_m", 
  "min_midpoint_dist",  "horiz_wells_in_1km", 
  "horiz_wells_in_5km", "horiz_wells_in_10km", "horiz_wells_in_25km")

if (geologic_input_data == TRUE){
  numeric_input_cols = c(
    numeric_input_cols, 
    "max_treating_mpa", "sd_treating_mpa", "mean_treating_mpa",
    "number_sand_off_screen_out_stages", "mean_intervals_per_stage",
    "max_breakdown_mpa", "sd_breakdown_mpa", "mean_breakdown_mpa", 
    "total_gas_injected_m3", "breakdown_isip_ratio"
    )
}

geology_input_cols <- c("distance_all_faults_berger_m",
"distance_divergent_faults_berger_m", "distance_listric_faults_berger_m", 
"distance_normal_faults_berger_m", "distance_strike_slip_faults_berger_m", 
"distance_thrust_faults_berger_m", "geothermal_gradient_degc_km", 
"paleozoic_structure_mss", "pressure_depth_ratio_kpa_m", "shmin_grasby", 
"third_order_residual_m", "top_montney_isotherm_degc", "top_montney_structure_mss", 
"top_montney_tvd_mss")

fact_input_cols <- c(
  "viscosity_cat", "energizer_bool", "proppant_cat")

if (geologic_input_data == TRUE){
  fact_input_cols = c(
    fact_input_cols, 
    "cased_well_bool", "interference_bool"
    )
}

targets <- c("seismogenic", "max_mag")

ml_df <- saf_wells_sf %>%
  st_drop_geometry() %>% 
  dplyr::select(aux_cols, numeric_input_cols, fact_input_cols, geology_input_cols, targets) %>%
  dplyr::mutate_if(is.logical, as.numeric) %>%
  dplyr::filter(
    max_stage_duration_min < 10000,
    min_midpoint_dist < 25000,
    max_isip_mpa < 200,
    shmin_grasby > 0) %>%
  dplyr::filter(
    !is.na(mean_proppant_per_stage_t),
    !is.na(mean_fluid_per_stage_m3),
    !is.na(mean_isip_mpa),
    mean_tvd < 0,
    min_midpoint_dist >0) %>%
  dplyr::mutate(seismogenic = as.factor(seismogenic))

if (geologic_input_data == TRUE){
  ml_df = ml_df %>%
    dplyr::filter(max_treating_mpa < 200) %>%
    dplyr::filter(max_breakdown_mpa < 200) %>%
    dplyr::filter(!is.na(mean_breakdown_mpa))

  ml_df = impute(ml_df, cols = list(
    sd_treating_mpa = imputeMean(),
    mean_treating_mpa = imputeMean(),
    sd_treating_mpa = imputeMean(),
    mean_intervals_per_stage = imputeMean(),
    sd_breakdown_mpa = imputeMean()
  ))

  ml_df[!is.finite(ml_df$max_treating_mpa),'max_treating_mpa'] <- 0
}

# impute non-causal features
ml_df = impute(ml_df, cols = list(
  sd_proppant_per_stage_t = imputeMean(),
  sd_fluid_per_stage_m3 = imputeMean(),
  avg_stage_length = imputeMean(),
  avg_stage_spacing = imputeMean(),
  mean_rate_m3_min = imputeMean(),
  mean_stage_duration_min = imputeMean(),
  sd_isip_mpa = imputeMean()
  ))$data

# one hot encode categorical variables
ml_df <- createDummyFeatures(
  ml_df, target = "seismogenic", cols = c("viscosity_cat","proppant_cat")
  ) %>% removeConstantFeatures(.)

ml_df[!is.finite(ml_df$max_rate_m3_min),'max_rate_m3_min'] <- 0
ml_df[!is.finite(ml_df$max_stage_duration_min),'max_stage_duration_min'] <- 0

Exploratory Data Analysis

We use a boxplot, correlation plot, and PCA analysis to explore our data prior to feature selection. These functions are located in plot_functions.R.

# scatter plot of variables with seismogenic target
target_boxplot(ml_df, output_path = '../output/')

# pairs plot of variables with max magnitude target
ml_corrplot(ml_df, '../output/', numeric_input_cols, target)

# pca plot
pca_scree_plot(ml_df, '../output/')
pca_unit_circle_plot(ml_df, '../output/')

Write csv and rds for future use

write_csv(ml_df, paste0("../output/","bcgoc_mldf.csv"))
write_rds(ml_df, paste0("../output/","bcogc_mldf.rds"))


Enlightengeo/GeoscienceBC_2019-008 documentation built on Feb. 4, 2021, 12:43 p.m.