source("inst/extdata/scripts/global.R")
source("inst/extdata/scripts/ml_resampling.R")
# filter only Betriebsbrunnen
df <- well_feature_data %>% dplyr::filter(well_function == "Betriebsbrunnen" &
operational_state == "betriebsbereit")
# fill up NA values
df <- fill_up_na_with_median_from_lookup(df, df)
# drop unused factor levels
df <- df %>% droplevels()
# prepare prediction data ---
# make predictions for other data ------
names(df_test)[!names(df_test) %in% names(df)]
df_wells <- dwc.wells:::prepare_well_data(paths$data_wells, renamings)
rehab_data <- prepare_pump_test_data_1(paths$data_pump_tests, renamings, df_wells) %>%
dplyr::filter(pump_test_2.well_rehab) %>%
dplyr::group_by(well_id) %>%
dplyr::mutate(n_rehab = as.factor(dwc.wells:::cumsum_no_na(pump_test_2.well_rehab))) %>%
dplyr::select(well_id, operational_start.date, action_date, n_rehab) %>%
dplyr::ungroup()
# get well id and operational start date
well_start_dates <- df %>% select(well_id, operational_start.date) %>% unique()
# get well id and age combinations
well_ages <- df %>% group_by(well_id) %>% tidyr::expand(well_age_years = seq.int(0, 60, 1))
# combine both and calculate sim dates
library(lubridate)
well_ages_dates <- well_start_dates %>%
left_join(well_ages) %>%
mutate(sim_date = operational_start.date %m+% years(well_age_years)) %>%
select(-operational_start.date)
# tmp data 1
sim_data_tmp1 <- rehab_data %>%
rename(sim_date = action_date) %>%
mutate(well_age_years = NA) %>%
select(well_id, sim_date, well_age_years, n_rehab)
# tmp data 2
sim_data_tmp2 <- well_ages_dates %>%
mutate(n_rehab = NA) %>%
select(well_id, sim_date, well_age_years, n_rehab)
# create sim data
sim_data_base <- rbind(sim_data_tmp1, sim_data_tmp2) %>%
arrange(well_id, sim_date) %>%
left_join(rehab_data %>% select(well_id, action_date, n_rehab)) %>%
group_by(well_id) %>%
tidyr::fill(c(n_rehab, action_date), .direction = "down") %>%
mutate(time_since_rehab_years = lubridate::time_length(sim_date - action_date, "years")) %>%
mutate(n_rehab = ifelse(is.na(n_rehab), 0, n_rehab),
time_since_rehab_years = ifelse(is.na(time_since_rehab_years),
well_age_years,
time_since_rehab_years)) %>%
mutate(type = ifelse(sim_date < as.Date("2021-05-01"), "past", "future")) %>%
filter(!is.na(well_age_years)) %>%
select(-action_date) %>%
ungroup()
# create sim data
sim_data <- sim_data_base %>%
left_join(df, by = "well_id") %>%
data.frame() %>%
select(well_id, all_of(model_features), type) %>%
# remove correlated variables
select(-c(well_depth, quality.DR, quality.P_tot,
volume_m3_d.sd, waterworks, surface_water)) %>%
# remove unimportant variables
select(-c(n_screens, filter_length, quality.Cu, inliner)) %>%
# set inliner in screen_material to "Unbekannt
dplyr::mutate(screen_material = replace(
screen_material, screen_material == "Inliner", "Unbekannt"
) %>% forcats::fct_drop())
sim_data_pred <- predict(xgb_fit, sim_data) %>%
mutate(.pred = ifelse(.pred < 0, 0, .pred))
names(sim_data_pred) <- "Qs_rel"
predictions <- cbind(sim_data, sim_data_pred)
length(unique(predictions$well_id))
# plot 1: all predictions as points, one plot per well -------------------------
p <- ggplot(predictions, aes(x = well_age_years, y = Qs_rel)) +
scale_x_continuous(limits = c(0, 40)) +
scale_y_continuous(limits = c(0, 100), oob = scales::rescale_none) +
geom_point(alpha = 0.5) +
facet_wrap(~well_id)
ggsave("xgb_plots_points_betriebsbereit.png", p, dpi = 600, width = 30, height = 20)
# plot 2: all predictions as individual lines in one plot ----------------------
p2 <- ggplot(predictions, aes(x = well_age_years, y = Qs_rel, col = type, group = well_id)) +
geom_line(alpha = 0.05) +
scale_x_continuous(limits = c(0, 40)) +
scale_y_continuous(limits = c(0, 100)) +
labs(x = "Well age [yrs]", y = "Specific capacity [%]") +
sema.berlin.utils::my_theme(legend.position = "none")
p2
getwd()
ggsave("xgb_plot_multi_line_betriebsbereit_bis_40_past_future.png", dpi = 600, width = 4.5, height = 3)
# plot mean prediction vs. confidence interval ---------------------------------
p3 <- ggplot(predictions, aes(x = well_age_years, y = Qs_rel)) +
scale_x_continuous(limits = c(0, 40)) +
scale_y_continuous(limits = c(0, 100)) +
labs(x = "Well age [yrs]", y = "Specific capacity [%]") +
# stat_summary(geom = "ribbon", fun.data = mean_cl_normal,
# fun.args = list(conf.int = 0.9999), fill = "lightblue") +
stat_summary(geom = "ribbon", fun.data = "median_hilow", fill = "lightblue") +
stat_summary(geom = "line", fun = mean) +
sema.berlin.utils::my_theme()
p3
ggsave("xgb_plot_mean_line_and_95_conf_int_betriebsbereit_bis_40.png", p3, dpi = 600, width = 4.5, height = 3)
# plot 4: predictions for two selected wells with colors for past / future -----
p4 <- ggplot(filter(predictions, well_id %in% c(1161, 5837)),
aes(x = well_age_years, y = Qs_rel, col = type)) +
geom_line(alpha = 0.5) +
scale_x_continuous(limits = c(0, 40)) +
scale_y_continuous(limits = c(NA, 100), breaks = scales::pretty_breaks(6)) +
facet_wrap(~well_id) +
labs(x = "Well age [yrs]", y = "Specific capacity [%]") +
sema.berlin.utils::my_theme(legend.position = "none")
p4
ggsave("xgb_plot_multi_line_betriebsbereit_bis_40_past_future_2wells.png",
dpi = 600, width = 6, height = 3)
# plot predictions vs. observations for two selected wells ---------------------
df <- model_data %>% filter(well_id %in% c(1161, 5837)) %>%
mutate(n_rehab = as.factor(n_rehab))
df_pred <- predictions %>% filter(well_id %in% c(1161, 5837))
p5 <- ggplot2::ggplot(df, ggplot2::aes(x = well_age_years,
y = Qs_rel, col = n_rehab)) +
ggplot2::geom_point() +
#ggplot2::geom_line(lty = 2) +
#ggplot2::geom_line(ggplot2::aes(group = "all")) +
ggplot2::geom_line(data = df_pred, aes(x = well_age_years,
y = Qs_rel,
col = NULL,
lty = 'unity line'
), alpha = 0.5) +
scale_x_continuous(limits = c(0, 40)) +
scale_y_continuous(limits = c(NA, 120), breaks = scales::pretty_breaks(6), oob = scales::rescale_none) +
ggplot2::scale_color_manual(values = rev(RColorBrewer::brewer.pal(length(levels(df$n_rehab)), "RdYlGn"))) +
#ggplot2::scale_color_manual(values = rev(scales::hue_pal()(length(levels(df$n_rehab))))) +
sema.berlin.utils::my_theme(legend.position = "top") +
ggplot2::guides(color = ggplot2::guide_legend(nrow = 1)) +
ggplot2::labs(color = "observations (by number of rehabs):",
x = "Well age [yrs]",
y = "Specific capacity [%]",
linetype = "predictions") +
facet_wrap(~well_id, labeller = purrr::partial(label_both, sep = ": "))
p5
ggsave("plot_observations_vs_predictions_xgboost_v3.png", dpi = 600, width = 9, height = 4)
# plot predictions vs. observations for one selected well ----------------------
df <- model_data %>% filter(well_id %in% c(5837)) %>%
mutate(n_rehab = as.factor(n_rehab))
df_pred <- predictions %>% filter(well_id %in% c(5837))
p5 <- ggplot2::ggplot(df, ggplot2::aes(x = well_age_years,
y = Qs_rel, col = n_rehab)) +
ggplot2::geom_point() +
ggplot2::geom_line(ggplot2::aes(group = "all")) +
ggplot2::geom_line(data = df_pred, aes(x = well_age_years,
y = Qs_rel,
col = NULL,
lty = 'unity line'
), alpha = 0.5) +
scale_x_continuous(limits = c(0, 40)) +
scale_y_continuous(limits = c(NA, 120), breaks = scales::pretty_breaks(6), oob = scales::rescale_none) +
ggplot2::scale_color_manual(values = rev(RColorBrewer::brewer.pal(length(levels(df$n_rehab)), "RdYlGn"))) +
#ggplot2::scale_color_manual(values = rev(scales::hue_pal()(length(levels(df$n_rehab))))) +
sema.berlin.utils::my_theme() +
#ggplot2::guides(color = ggplot2::guide_legend(nrow = 1)) +
ggplot2::labs(color = "observations\n(by number of rehabs):",
x = "Well age [yrs]",
y = "Specific capacity [%]",
linetype = "predictions") +
facet_wrap(~well_id, labeller = purrr::partial(label_both, sep = ": "))
p5
ggsave("plot_observations_vs_predictions_xgboost_v3b.png", dpi = 600, width = 6, height = 3.5)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.