# load package, paths and variable sets from global.R --------------------------
source("inst/extdata/scripts/global.R")
# MAIN 1: get and prepare individual data sets ---------------------------------
# 1. well characteristics
df_wells <- dwc.wells:::prepare_well_data(paths$data_wells, renamings)
save_data(df_wells, paths$data_prep_out, "well_data")
# 2. surface water information
df_drilling <- dwc.wells:::prepare_drilling_data(paths$data_drilling, renamings)
save_data(df_drilling, paths$data_prep_out, "drilling_data")
# 3. drilling method information
df_drilling_tech <- dwc.wells:::prepare_drilling_tech_data(
paths$data_drilling_tech, renamings
)
save_data(df_drilling_tech, paths$data_prep_out, "drilling_tech_data")
# 4. water quality data
df_quality_agg <- dwc.wells:::prepare_quality_data(paths$db, renamings)
save_data(df_quality_agg, paths$data_prep_out, "quality_data")
# 5. abstraction volumes
df_volumes_agg <- dwc.wells:::prepare_volume_data(paths$db, renamings, df_wells)
save_data(df_volumes_agg, paths$data_prep_out, "volume_data")
# 6. capacity measurements (virtual pump tests)
df_Q_monitoring <- dwc.wells:::prepare_Q_monitoring_data(
df_wells, paths$data_quantity, paths$data_W_static, renamings
)
save_data(df_Q_monitoring, paths$data_prep_out, "Q_monitoring_data")
# 7. pump test and rehab data
df_pump_tests_tidy <- prepare_pump_test_data(
paths$data_pump_tests, renamings, df_wells, pump_test_vars
)
save_data(df_pump_tests_tidy, paths$data_prep_out, "pump_test_data")
# 8. get standard deviation in static water level measurements
df_W_static_sd <- get_W_static_data(paths$data_W_static, renamings, df_wells) %>%
group_by(well_id) %>%
summarise(W_static.sd = sd(W_static, na.rm = TRUE))
# MAIN 2: combine individual data sets -----------------------------------------
# 1. well data + surface water + drilling + quality (-> feature table)
well_feature_data <- df_wells %>%
dplyr::left_join(df_drilling, by = "drilling_id") %>%
dplyr::left_join(df_drilling_tech, by = "drilling_id") %>%
dplyr::left_join(df_quality_agg, by = "well_id") %>%
dplyr::left_join(df_volumes_agg, by = "well_id") %>%
dplyr::left_join(df_W_static_sd, by = "well_id") %>%
dplyr::select(dplyr::all_of(well_features)) %>%
dplyr::mutate(drilling_method = tidyr::replace_na(drilling_method, "Unbekannt"),
W_static.sd = replace_na_with_median(W_static.sd),
volume_m3_d.mean = replace_na_with_median(volume_m3_d.mean),
volume_m3_d.sd = replace_na_with_median(volume_m3_d.sd),
volume_m3_d.cv = replace_na_with_median(volume_m3_d.cv)) %>%
droplevels()
# 2. combine pump test data and capacity measurements (virtual pump_tests)
df_pump_test_Q_monitoring <-
dwc.wells::combine_pump_test_and_Q_monitoring_data(
df_pump_tests_tidy,
df_Q_monitoring,
pump_test_vars
)
# 3. combine Qs and well_features
model_data <- dwc.wells:::combine_to_model_data(df_pump_tests_tidy, well_feature_data)
save_data(model_data, paths$data_prep_out, "model_data")
### Check different compression formats (as recommended by Rcmdcheck):
### https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Data-in-packages
#usethis::use_data(model_data, compress = "gzip", overwrite = TRUE) # 247 kB
#usethis::use_data(model_data, compress = "bzip2", overwrite = TRUE) # 261 kB
usethis::use_data(model_data, compress = "xz", overwrite = TRUE) # 180 kB
# load data --------------------------------------------------------------------
# load(file.path(paths$data_prep_out, "model_data.RData"))
model_data_reduced <- model_data %>%
dplyr::select(.data$Qs_rel,
.data$well_id,
tidyselect::all_of(model_features))
# refine variable selection ----------------------------------------------------
# remove correlated and unimportant variables (see "inst/extdata/scripts/variable_importance.R")
if (TRUE) {
# remove correlated variables
model_data_reduced <- model_data_reduced %>%
dplyr::select(-c(well_depth, quality.DR, quality.P_tot,
volume_m3_d.sd, waterworks, surface_water))
# remove unimportant variables
model_data_reduced <- model_data_reduced %>%
dplyr::select(-c(n_screens, filter_length, quality.Cu, inliner))
# remove well gallery (local variable which makes models not applicable
# to new sites or well galleries)
model_data_reduced <- model_data_reduced %>%
dplyr::select(-well_gallery)
### remove non ASCII character
is_spuehlbohrung <- grepl(pattern = "^Sp.*bohrung$",
x = model_data_reduced$drilling_method)
model_data_reduced$drilling_method <- as.character(model_data_reduced$drilling_method)
model_data_reduced$drilling_method[is_spuehlbohrung] <- "Spuehlbohrung"
model_data_reduced$drilling_method <- as.factor(model_data_reduced$drilling_method)
model_data_reduced$aquifer_coverage <- kwb.utils::multiSubstitute(
strings = model_data_reduced$aquifer_coverage,
replacements = list(`bedeckt` = "confined",
`unbedeckt` = "unconfined",
`Unbekannt` = "unknown",
`bedeckt` = "confined",
`randlich` = "edges",
`teilweise` = "partly")
) %>%
as.factor()
anonymize_hash <- function(x, algo="crc32"){
unq_hashes <- vapply(unique(x),
function(object) digest::digest(object, algo=algo),
FUN.VALUE="",
USE.NAMES=TRUE)
unname(unq_hashes[x])
}
anonymize_int <- function(x, n_values = 1000000){
values <- x
unique_values <- unique(values)
n_unique_values <- as.integer(length(unique_values))
random_values <- sample.int(n = n_values,
size = n_unique_values,
replace = FALSE)
### https://stackoverflow.com/a/50898859
as.integer(as.character(factor(values, unique_values, random_values)))
}
set.seed(1)
model_data_reduced$well_id <- anonymize_int(model_data_reduced$well_id)
model_data_reduced$screen_material <- as.factor(anonymize_hash(model_data_reduced$screen_material))
model_data_reduced$drilling_method <- as.factor(anonymize_hash(model_data_reduced$drilling_method))
}
usethis::use_data(model_data_reduced, compress = "xz", overwrite = TRUE) # 180 kB
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.