In Enlightengeo/GeoscienceBC_2019-008: Enlighten Geoscience BC 2019-008 Induced Seismicity Statistical Study Code

Geoscience BC Study Code - Machine Learning Wrapper

library(geosciencebc2019008)
run_date = Sys.Date() %>% str_replace_all(., "-", "_")
knitr::opts_chunk$set(eval = FALSE)

Load prepared data

This chunk loads our prepared data (see prepare_data.Rmd)

saf_wells_sf <- read_rds('../output/bcogc_saf_well_data.rds')
geologic_input_data = FALSE

ML Dataframe Prep

This chunk prepares our dataframe for the data science workflows (exploratory data analysis, feature selection, model generation, and model interpretation). It specifies auxillary columns (those useful for information), predictor columns, and our target column(s), which for classificatio is the seimogenic (T/F) association. Some of the columns, filtering, and imputation are specific for the classification problem, although the bits of code are very similar.

aux_cols <- c(
  "unique_surv_id", "wa_num", "drilling_event", "ground_elevtn",
  "mean_easting", "mean_northing", "survey_well_type", 
  "on_prod_date", "last_reported_date", "cum_gas_to_date_e3m3", 
  "cum_oil_to_date_m3", "cum_water_to_date_m3", "cum_cond_to_date_m3",
  "frac_start_date", "frac_end_date", "min_dist_well")

removed_inputs <- c(
  "no_prior_wells", "no_surrounding_wells",
  "interval_clusters_bool", "base_fluid_cat",
  "hybrid_frac_bool", "skipped_stages", "stim_company")

numeric_input_cols <- c(
  "mean_tvd", "n_stages", "calc_completed_length_m",
  "mean_proppant_per_stage_t", "sd_proppant_per_stage_t", 
  "max_proppant_per_stage_t", "calc_total_proppant_t", 
  "mean_fluid_per_stage_m3",   "sd_fluid_per_stage_m3", 
  "max_fluid_per_stage_m3", "calc_total_fluid_m3",
  "avg_stage_length", "avg_stage_spacing", "max_rate_m3_min", 
  "mean_rate_m3_min", "max_stage_duration_min", "mean_stage_duration_min",
  "max_isip_mpa", "sd_isip_mpa", "mean_isip_mpa", "frac_duration_days", 
  "calc_proppant_intensity_kg_m", "calc_fluid_intensity_m3_m", 
  "min_midpoint_dist",  "horiz_wells_in_1km", 
  "horiz_wells_in_5km", "horiz_wells_in_10km", "horiz_wells_in_25km")

if (geologic_input_data == TRUE){
  numeric_input_cols = c(
    numeric_input_cols, 
    "max_treating_mpa", "sd_treating_mpa", "mean_treating_mpa",
    "number_sand_off_screen_out_stages", "mean_intervals_per_stage",
    "max_breakdown_mpa", "sd_breakdown_mpa", "mean_breakdown_mpa", 
    "total_gas_injected_m3", "breakdown_isip_ratio"
    )
}

geology_input_cols <- c("distance_all_faults_berger_m",
"distance_divergent_faults_berger_m", "distance_listric_faults_berger_m", 
"distance_normal_faults_berger_m", "distance_strike_slip_faults_berger_m", 
"distance_thrust_faults_berger_m", "geothermal_gradient_degc_km", 
"paleozoic_structure_mss", "pressure_depth_ratio_kpa_m", "shmin_grasby", 
"third_order_residual_m", "top_montney_isotherm_degc", "top_montney_structure_mss", 
"top_montney_tvd_mss")

fact_input_cols <- c(
  "viscosity_cat", "energizer_bool", "proppant_cat")

if (geologic_input_data == TRUE){
  fact_input_cols = c(
    fact_input_cols, 
    "cased_well_bool", "interference_bool"
    )
}

targets <- c("seismogenic", "max_mag")

ml_df <- saf_wells_sf %>%
  st_drop_geometry() %>% 
  dplyr::select(aux_cols, numeric_input_cols, fact_input_cols, geology_input_cols, targets) %>%
  dplyr::mutate_if(is.logical, as.numeric) %>%
  dplyr::filter(
    max_stage_duration_min < 10000,
    min_midpoint_dist < 25000,
    max_isip_mpa < 200,
    shmin_grasby > 0) %>%
  dplyr::filter(
    !is.na(mean_proppant_per_stage_t),
    !is.na(mean_fluid_per_stage_m3),
    !is.na(mean_isip_mpa),
    mean_tvd < 0,
    min_midpoint_dist >0) %>%
  dplyr::mutate(seismogenic = as.factor(seismogenic))

if (geologic_input_data == TRUE){
  ml_df = ml_df %>%
    dplyr::filter(max_treating_mpa < 200) %>%
    dplyr::filter(max_breakdown_mpa < 200) %>%
    dplyr::filter(!is.na(mean_breakdown_mpa))

  ml_df = impute(ml_df, cols = list(
    sd_treating_mpa = imputeMean(),
    mean_treating_mpa = imputeMean(),
    sd_treating_mpa = imputeMean(),
    mean_intervals_per_stage = imputeMean(),
    sd_breakdown_mpa = imputeMean()
  ))

  ml_df[!is.finite(ml_df$max_treating_mpa),'max_treating_mpa'] <- 0
}

# impute non-causal features
ml_df = impute(ml_df, cols = list(
  sd_proppant_per_stage_t = imputeMean(),
  sd_fluid_per_stage_m3 = imputeMean(),
  avg_stage_length = imputeMean(),
  avg_stage_spacing = imputeMean(),
  mean_rate_m3_min = imputeMean(),
  mean_stage_duration_min = imputeMean(),
  sd_isip_mpa = imputeMean()
  ))$data

# one hot encode categorical variables
ml_df <- createDummyFeatures(
  ml_df, target = "seismogenic", cols = c("viscosity_cat","proppant_cat")
  ) %>% removeConstantFeatures(.)

ml_df[!is.finite(ml_df$max_rate_m3_min),'max_rate_m3_min'] <- 0
ml_df[!is.finite(ml_df$max_stage_duration_min),'max_stage_duration_min'] <- 0

Exploratory Data Analysis

We use a boxplot, correlation plot, and PCA analysis to explore our data prior to feature selection. These functions are located in plot_functions.R.

# scatter plot of variables with seismogenic target
target_boxplot(ml_df, output_path = '../output/')

# pairs plot of variables with max magnitude target
ml_corrplot(ml_df, '../output/', numeric_input_cols, target)

# pca plot
pca_scree_plot(ml_df, '../output/')
pca_unit_circle_plot(ml_df, '../output/')

Write csv and rds for future use

write_csv(ml_df, paste0("../output/","bcgoc_mldf.csv"))
write_rds(ml_df, paste0("../output/","bcogc_mldf.rds"))

Enlightengeo/GeoscienceBC_2019-008 documentation built on Feb. 4, 2021, 12:43 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Enlightengeo/GeoscienceBC_2019-008
Enlighten Geoscience BC 2019-008 Induced Seismicity Statistical Study Code

In Enlightengeo/GeoscienceBC_2019-008: Enlighten Geoscience BC 2019-008 Induced Seismicity Statistical Study Code

Geoscience BC Study Code - Machine Learning Wrapper

Load prepared data

ML Dataframe Prep

Exploratory Data Analysis

Write csv and rds for future use

R Package Documentation

Browse R Packages

We want your feedback!

Enlightengeo/GeoscienceBC_2019-008 Enlighten Geoscience BC 2019-008 Induced Seismicity Statistical Study Code

In Enlightengeo/GeoscienceBC_2019-008: Enlighten Geoscience BC 2019-008 Induced Seismicity Statistical Study Code

Geoscience BC Study Code - Machine Learning Wrapper

Load prepared data

ML Dataframe Prep

Exploratory Data Analysis

Write csv and rds for future use

R Package Documentation

Browse R Packages

We want your feedback!

Enlightengeo/GeoscienceBC_2019-008
Enlighten Geoscience BC 2019-008 Induced Seismicity Statistical Study Code