In KWB-R/wasserportal: R Package with Functions for Scraping Data of Wasserportal Berlin

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

is_ghactions <- identical(Sys.getenv("CI"), "true")

Install R Package

# Enable this universe
options(repos = c(
  kwbr = 'https://kwb-r.r-universe.dev',
  CRAN = 'https://cloud.r-project.org'
))

# Install R package
install.packages('wasserportal')

Load R Package and Define Helper Functions

library(wasserportal)

download_file <- function(url, tdir = tempdir())
{
  filename <- basename(url)
  target_path <- file.path(tdir, filename)
  download.file(url, dest = target_path, mode = "wb") 
  target_path
}

yes_if_in_cas_number_of <- function(x, data) {
  dplyr::if_else(x %in% unique(data$cas_number), "yes", NA_character_)
}

top_filter_datatable <- function(data) {
  DT::datatable(data, filter = "top", rownames = FALSE)
}

Define URLs

urls <- kwb.utils::resolve(list(
  record = "https://zenodo.org/record",
  norman_s0 = "<record>/5873975/files/susdat_2022-01-18-104316.csv",
  norman_s36_ubamp = "<record>/4323239/files/S36_UBAPMT_Dec2020.csv",
  norman_s90_zeropm = "<record>/5854252/files/ZeroPM_Box1.csv",
  norman_lists = "https://www.norman-network.com/nds/SLE/",
  gh_wasserportal = "https://kwb-r.github.io/wasserportal",
  stations_gwq_meta = "<gh_wasserportal>/stations_gwq_master.json",
  stations_gwq_data = "<gh_wasserportal>/stations_gwq_data.json"
))

Get Norman Lists

### Download S0 | SUSDAT | Merged NORMAN Suspect List: SusDat
### Version: NORMAN-SLE-S0.0.4.1 (2021-01-18)
### DOI: 10.5281/zenodo.5873975

norman_s0_path <- download_file(urls$norman_s0)
norman_s0 <- readr::read_csv(norman_s0_path)

### Download S36 | UBAPMT | Potential Persistent, Mobile and Toxic (PMT) substances
### Version: NORMAN-SLE-S36.0.2.1 (2020-12-15)
### DOI: "10.5281/zenodo.4323239"

norman_s36_ubapmt_path  <- download_file(urls$norman_s36_ubamp)
norman_s36_ubapmt <- readr::read_csv(norman_s36_ubapmt_path)

### Download S90 | ZEROPMBOX1 | ZeroPM Box 1 Substances
### Version: Version NORMAN-SLE-S90.0.1.0 (2021-01-15)
### DOI: 10.5281/zenodo.5854252

norman_s90_zeropm_path  <- download_file(urls$norman_s90_zeropm)
norman_s90_zeropm <- readr::read_csv(norman_s90_zeropm_path)

cas_wasserportal <- wasserportal::readPackageFile(
  file = "cas_wasserportal.csv",
  encoding = "UTF-8"
)

cas_reach <- wasserportal::readPackageFile(
  file = "cas_reach.csv"
)

wasserportal_cas_numbers <- unique(cas_wasserportal$cas_number)

ubapmt_publication <- cas_reach %>%
  dplyr::filter(.data$cas_number %in% wasserportal_cas_numbers)

ubapmt_zenodo <- norman_s36_ubapmt %>%
  dplyr::filter(.data$CAS_Number %in% wasserportal_cas_numbers) %>%
  dplyr::rename(cas_number = .data$CAS_Number)

missing_on_zenodo <- cas_reach %>%
  dplyr::mutate(
    zenodo = yes_if_in_cas_number_of(.data$cas_number, ubapmt_zenodo),
    publication = yes_if_in_cas_number_of(.data$cas_number, ubapmt_publication)
  ) %>%
  dplyr::filter(
    publication == "yes" | zenodo == "yes"
  ) %>%
  dplyr::relocate(
    tidyselect::all_of(c("publication", "zenodo")), 
    .before = .data$emission_likelihood
  )

top_filter_datatable(missing_on_zenodo)

norman_s0_in_wasserportal <- norman_s0 %>%
  dplyr::mutate(
    cas_number = stringr::str_remove(
      .data$CAS_RN_Dashboard, 
      pattern = "^CAS_RN:\\s"
    )
  ) %>%
  dplyr::filter(
    !is.na(.data$cas_number),
    .data$cas_number %in% wasserportal_cas_numbers
  )

#top_filter_datatable(norman_s0_in_wasserportal)

norman_lists_table <- xml2::read_html(urls$norman_lists) %>%
  rvest::html_element(css = ".table") %>%
  rvest::html_table() %>%
  janitor::clean_names()

DT::datatable(norman_lists_table)

Get GW Quality from Wasserportal

# Load R package

### For details see:
### https://kwb-r.github.io/wasserportal/articles/groundwater.html
### JSON files (see below) are build every day automatically at 5 a.m. with
### continuous integration, for build status, see here:
### https://github.com/KWB-R/wasserportal/actions/workflows/pkgdown.yaml

### GW quality (all available parameters!)
gwq_master <- jsonlite::fromJSON(urls$stations_gwq_meta)
gwq_data <- jsonlite::fromJSON(urls$stations_gwq_data) %>%
  dplyr::filter(.data$Parameter %in% cas_wasserportal$Parameter) %>%
  dplyr::inner_join(cas_wasserportal, by = "Parameter") %>%
  dplyr::inner_join(norman_s0_in_wasserportal, by = "cas_number") %>% 
  dplyr::filter(
    .data$logKow_EPISuite <= 4.5,
    !is.na(.data$`LC50_48_hr_ug/L`)
  ) %>% 
  dplyr::mutate(
    Messstellennummer = as.character(Messstellennummer),
    ## CensorCode: either "below" (less than) for concentration below detection
    ## limit (value is detection limit) or "nc" (not censored) for concentration
    ## above detection limit
    CensorCode = dplyr::case_when(
      Messwert <= 0 ~ "lt", 
      TRUE ~ "nc"
    ),
    Messwert = dplyr::case_when(
      Messwert < 0 ~ abs(Messwert),
      ### Only two decimal numbers are exported by Wasserportal, but some
      ### sustances have lower detection limit, e.g. 0.002 which results in
      ### -0.00 export, thus the dummy detection limit 0.00999 was introduced
      ### (until fixed by Senate: Christoph will sent a email to Matthias
      ### Schroeder)
      Messwert == 0 ~ 0.009999,
      TRUE ~ Messwert
    )
  ) %>%
  dplyr::left_join(gwq_master, by = c("Messstellennummer" = "Nummer"))

gwq_subs <- gwq_data %>%
  dplyr::count(.data$cas_number, .data$CensorCode) %>%
  tidyr::pivot_wider(names_from = CensorCode, values_from = n) %>%
  dplyr::mutate(
    lt = ifelse(is.na(lt), 0, lt),
    nc = ifelse(is.na(nc), 0, nc),
    n_total = lt + nc,
    percent_nc = 100 * nc / n_total
  ) %>%
  dplyr::rename(
    n_lt = lt,
    n_nc = nc
  ) %>%
  dplyr::left_join(norman_s0_in_wasserportal) %>%
  dplyr::rename(name_norman = .data$Name_Dashboard)

readr::write_csv(gwq_subs, "gwq_subs.csv")

top_filter_datatable(gwq_subs)

samples <- gwq_data %>%
  dplyr::rename(name_norman = .data$Name_Dashboard) %>%
  dplyr::select(
    name_norman,
    cas_number,
    Messstellennummer,
    Datum,
    CensorCode,
    Messwert,
    Einheit
  )

samples_by_para_and_station <- gwq_data %>%
  dplyr::count(
    .data$cas_number,
    .data$Messstellennummer,
    .data$CensorCode
  ) %>%
  tidyr::pivot_wider(names_from = CensorCode, values_from = n) %>%
  dplyr::mutate(
    lt = ifelse(is.na(lt), 0, lt),
    nc = ifelse(is.na(nc), 0, nc),
    n_total = lt + nc,
    percent_nc = 100 * nc / n_total
  ) %>%
  dplyr::rename(
    n_lt = lt,
    n_nc = nc
  ) %>%
  dplyr::left_join(norman_s0_in_wasserportal) %>%
  dplyr::rename(name_norman = .data$Name_Dashboard) %>%
  dplyr::left_join(gwq_master, by = c(Messstellennummer = "Nummer")) %>%
  dplyr::arrange(dplyr::desc(percent_nc))

# samples_by_category_and_station <- samples_by_para_and_station  %>%
#   dplyr::group_by(.data$category,
#                   .data$category_name,
#                   .data$Messstellennummer) %>%
#   dplyr::summarise(n_lt = sum(n_lt),
#                    n_nc = sum(n_nc),
#                    n_total = sum(n_total)) %>%
#   dplyr::mutate(percent_nc = 100*n_nc/n_total) %>%
#   dplyr::arrange(dplyr::desc(percent_nc))

gwq_subs_stations_n_abovedetection <- samples_by_para_and_station  %>%
  dplyr::filter(n_nc > 0) %>%
  dplyr::group_by(.data$cas_number) %>%
  dplyr::summarise(n_stations_abovedetection = dplyr::n())

gwq_subs_stations_n_paras_abovedetection <- samples_by_para_and_station  %>%
  dplyr::filter(n_nc > 0) %>%
  dplyr::group_by(
    #.data$category,
    #.data$category_name,
    .data$Messstellennummer
  ) %>%
  dplyr::summarise(n_paras_abovedetection = dplyr::n()) %>%
  dplyr::left_join(gwq_master, by = c("Messstellennummer" = "Nummer"))

# gwq_subs_stations_n_paras_abovedetection_wide <- gwq_subs_stations_n_paras_abovedetection %>%
#   dplyr::ungroup() %>%
#   dplyr::select(Messstellennummer, category, n_paras_abovedetection) %>%
#   tidyr::pivot_wider(names_from = "category",
#                      names_prefix = "cat_",
#                      values_from = "n_paras_abovedetection") %>%
#   dplyr::left_join(gwq_master, by = c("Messstellennummer" = "Nummer"))

samples_by_para_and_station_n <- samples_by_para_and_station %>%
  dplyr::group_by(name_norman, cas_number) %>%
  dplyr::summarise(
    n_stations_sampled = dplyr::n(),
    n_stations_total = length(unique(gwq_master$Nummer)),
    n_lt = sum(n_lt),
    n_nc = sum(n_nc),
    n_total = sum(n_total)
  ) %>%
  dplyr::left_join(gwq_subs_stations_n_abovedetection) %>%
  dplyr::mutate(
    n_stations_abovedetection = ifelse(
      is.na(n_stations_abovedetection),
      0,
      n_stations_abovedetection
    ),
    n_abovedetection = ifelse(is.na(n_nc), 0, n_nc),
    n_belowdetection = ifelse(is.na(n_lt), 0, n_lt),
    percent_samples_abovedetection = 100 * n_nc / n_total,
    percent_stations_abovedetection = 100 * n_stations_abovedetection / n_stations_total,
    percent_stations_sampled = 100 * n_stations_sampled / n_stations_total
  ) %>%
  dplyr::select(
    name_norman,
    cas_number,
    n_stations_abovedetection,
    n_stations_sampled,
    n_stations_total,
    percent_stations_abovedetection,
    percent_stations_sampled,
    n_belowdetection,
    n_abovedetection,
    n_total,
    percent_samples_abovedetection
  ) %>%
  dplyr::arrange(
    dplyr::desc(percent_stations_abovedetection),
    dplyr::desc(percent_samples_abovedetection)
  )

### Export data to EXCEL
gwq_data_list <- list(
  norman_lists_table = norman_lists_table, 
  cas_wasserportal = cas_wasserportal,
  cas_reach = cas_reach, 
  ubapmt_missing_on_zenodo = missing_on_zenodo,
  norman_s0_in_wasserportal = norman_s0_in_wasserportal,
  samples = samples,
  samples_by_para = gwq_subs %>%
    dplyr::arrange(dplyr::desc(percent_nc)),
  samples_by_para_and_station = samples_by_para_and_station,
  samples_by_para_and_station_n = samples_by_para_and_station_n
)

#samples_by_stations_para_above = gwq_subs_stations_n_paras_abovedetection_wide)
#samples_by_category_and_station = samples_by_category_and_station)

openxlsx::write.xlsx(
  x = gwq_data_list,
  file = "wasserportal_gwq_norman-list_s0_v1.0.0.xlsx",
  overwrite = TRUE
)

Norman Substances in Wasserportal

Filter criteria:

log Kow (column logKow_EPISuite <= 4.5)
toxicity value (column LC50_48_hr_ug/ not NA)

gwq_subs_plot <- samples_by_para_and_station_n %>% 
  dplyr::filter(.data$percent_samples_abovedetection > 0) %>% 
  dplyr::left_join(norman_s0_in_wasserportal) %>%
  dplyr::arrange(.data$`LC50_48_hr_ug/L`) %>% 
  dplyr::mutate(label = sprintf(
    "%s (%s, LC50_48_hr_ug/L: %.1f, logKow_EPISuite: %.1f : ", 
    .data$name_norman, 
    .data$cas_number,
    .data$`LC50_48_hr_ug/L`,
    .data$logKow_EPISuite
  ))

gwq_subs_plot$label <- as.factor(gwq_subs_plot$label)

g1 <- gwq_subs_plot %>% 
  ggplot2::ggplot(ggplot2::aes(
    x = .data$percent_samples_abovedetection, 
    y = forcats::fct_reorder(
      .data$label, 
      .data$percent_samples_abovedetection, 
      .desc = TRUE
    ),
    label = sprintf(
      "%2.2f %% (n_samples = %d, n_stations = %d)",
      .data$percent_samples_abovedetection,
      .data$n_total, 
      .data$n_stations_sampled
    ),
    fill = .data$logKow_EPISuite
  )) +                             
  #ggplot2::scale_fill_brewer(palette="RdYlGn") +      
  ggplot2::scale_y_discrete(limits = rev) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::geom_text(size = 1.8, hjust = -0.01) +
  ggplot2::xlim(c(0,120)) +
  ggplot2::theme_bw() +
  ggplot2::theme(legend.position = "top") +
  ggplot2::labs(
    # subtitle = sprintf(
    #   "%d / %d substances (>= 1 value above detection limit)",
    #   sum(gwq_subs_plot$n_abovedetection > 0),
    #   nrow(gwq_subs_plot)
    # ),
    y = "", 
    x = "Percent of Samples above Detection Limit (%)"
  )

g1

ggplot2::ggsave(
  filename = "wasserportal_norman-s0-substances_only-above-detection-limit.jpeg", 
  plot = g1,
  width = 40, 
  height = 20,
  units = "cm"
)

KWB-R/wasserportal documentation built on June 6, 2024, 10:26 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

KWB-R/wasserportal
R Package with Functions for Scraping Data of Wasserportal Berlin

In KWB-R/wasserportal: R Package with Functions for Scraping Data of Wasserportal Berlin

Install R Package

Load R Package and Define Helper Functions

Define URLs

Get Norman Lists

Get GW Quality from Wasserportal

Norman Substances in Wasserportal

R Package Documentation

Browse R Packages

We want your feedback!

KWB-R/wasserportal R Package with Functions for Scraping Data of Wasserportal Berlin

In KWB-R/wasserportal: R Package with Functions for Scraping Data of Wasserportal Berlin

Install R Package

Load R Package and Define Helper Functions

Define URLs

Get Norman Lists

Get GW Quality from Wasserportal

Norman Substances in Wasserportal

R Package Documentation

Browse R Packages

We want your feedback!

KWB-R/wasserportal
R Package with Functions for Scraping Data of Wasserportal Berlin