The Referendum Database (RDB) currently offers a privately documented API available under
`services.c2d.ch` with the following HTTP GET "endpoints":

`/referendums`
: get data on multiple referendums

It supports the following URL parameters:

- `mode`: the operation mode; one of `stream`, `count` or ~~`search`~~; if not provided, paginated (instead of streamed) data is returned
- ~~`term`: the search term used for `mode=search`; note that only the `title$en` field will be searched~~ (support for `mode=search` was removed)
- `format`: the data format to be returned when `mode=stream`; one of `json` or `csv`
- `filter`: a valid JSON query filter document to customize queries in the underlying MongoDB collection; needs to be Base64-encoded; there are various query operators available to build complex queries

The following values are used to denote empty values in the returned data:

- in integer fields:
  - `-1L` (means "information not available or nonexistent", an explicit `NA`)
  - `-2L` (means "information not given", an implicit `NA`)
- in character fields:
  - `""`
  - `"-1"`
  - `"Unknown"` (only for certain fields)
  - `"-2"`
  - `"Not provided"` (only for certain fields)
- in array fields:
  - `list()`
`/referendums/ID`
: get data on an individual referendum

`ID` stands for the referendum's unique identifier `oid` (which corresponds to the column `id` in the data returned by the `rfrnds()` function of this package).

The returned JSON data has between 22 and 26 fields. Fields that are not always present include:

- `archive`
- `canton`
- `municipality`
- `number`
`/referendums/stats`
: get the number of referendums

It accepts the following URL parameter:

- `filter`: a valid JSON query filter document to customize queries in the underlying MongoDB collection; needs to be Base64-encoded; there are various query operators available to build complex queries

`/s3_objects`
: retrieve a file stored on RDB's Amazon S3 bucket

It's not a real REST API endpoint but rather (presumably) a URL redirection. It expects an object key to be appended like this:

``` http
GET https://services.c2d.ch/s3_objects/referendum_5f859fafd1291cc3961f1bc2_0002.pdf
```
`/states`
: get the first 10 subnational entities that match the query

It accepts the following URL parameters to customize the query:

- `country`: limits the results to a specific country code
- `term`: limits the results to subnational entities matching this search term

`/health`
: check API status
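A minimal sketch of how these endpoints and parameters might be combined from R, using `httr` and `jsonlite`. The filter fields `country_code` and `level` follow the naming used by `assemble_query_filter()` further below; treat the concrete values and field names as assumptions, not a documented contract:

``` r
library(magrittr)

# hypothetical request: stream all Swiss national referendums as JSON
query_filter <- list(country_code = "CH", level = "national") %>%
  jsonlite::toJSON(auto_unbox = TRUE) %>%
  jsonlite::base64_enc()

rfrnds_raw <- httr::GET(url = "https://services.c2d.ch/referendums",
                        query = list(mode = "stream",
                                     format = "json",
                                     filter = query_filter)) %>%
  httr::content(as = "parsed")
```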
Further notes:

Fields that are stored as dates in MongoDB (`date_time_created`) adhere to the JavaScript date specification, i.e. they are actually datetimes, counted as milliseconds since 1970-01-01T00:00:00 UTC. Note that with a recent change in the API code (only deployed to the testing instance as of 2022-08-16), the fields that are supposed to be date(time)s are returned as RFC 3339-compliant strings instead.
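A minimal base R conversion sketch for both representations mentioned above (the package's own `parse_datetime()` further below covers the same cases via clock):

``` r
# legacy representation: milliseconds since the Unix epoch
ms_since_epoch <- -3280780800000
as.POSIXct(ms_since_epoch / 1000, origin = "1970-01-01", tz = "UTC")

# newer representation (testing instance): RFC 3339-compliant string
as.POSIXct("2022-03-16T13:52:10.907Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC")
```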
`R CMD check` notes

Cf. https://github.com/tidyverse/magrittr/issues/29#issuecomment-74313262
utils::globalVariables(names = c(".", # tidyselect fns "all_of", "any_of", "everything", "matches", "starts_with", "where", ".x", # other "Alpha_2", "Alpha_3", "Alpha_4", "applicability_constraint", "archive", "Children", "children_tier_1", "children_tier_2", "children_tier_3", "Code", "committee_name", "Common_name", "content-disposition", "count", "country_code", "country_code_long", "country_code_continual", "country_name", "country_name_de", "country_name_long", "date_last_edited", "date_time_created", "date_time_last_active", "date_time_last_edited", "Date_withdrawn", "day", "electorate_abroad", "electorate_total", "files", "id", "id_official", "id_sudd", "id_sudd_prefix", "inst_has_precondition", "inst_has_urgent_legal_basis", "inst_is_assembly", "inst_is_binding", "inst_is_counter_proposal", "inst_legal_basis_type", "inst_object_author", "inst_object_legal_level", "inst_object_revision_extent", "inst_object_revision_modes", "inst_object_type", "inst_precondition_actor", "inst_precondition_decision", "inst_quorum_approval", "inst_quorum_turnout", "inst_topics_excluded", "inst_topics_only", "inst_trigger_actor", "inst_trigger_actor_level", "inst_trigger_threshold", "inst_trigger_time_limit", "inst_trigger_type", "is_draft", "is_former_country", "is_multi_valued", "is_opt", "is_testing_server", "ISO_Alpha_3", "items", "level", "lower_house_abstentions", "lower_house_no", "lower_house_yes", "month", "municipality", "n", "Name", "name_long", "number", "Official_name", "Parent", "parent_topic", "position_government", "ptype", "question", "question_en", "referendum_text_options", "remarks", "result", "rowid", "sources", "subnational_entity_code", "subnational_entity_name", "subterritories_no", "subterritories_yes", "sudd_prefix", "tags", "topic", "topic_tier_1", "topic_tier_2", "topic_tier_3", "topics_tier_1", "topics_tier_2", "topics_tier_3", "territory_name_de", "territory_name_de_short", "title_de", "title_fr", "title_en", "turnout", "type", "Type", "un_country_code", "un_region_tier_1_code", "un_region_tier_2_name", "un_region_tier_3_name", "upper_house_abstentions", "upper_house_no", "upper_house_yes", "url_sudd", "url_swissvotes", "value_labels", "value_scale", "variable_name", "variable_name_unnested", "variable_name_print", "variable_values", "value", "value_total", "votes", "votes_empty", "votes_invalid", "votes_no", "votes_per_subterritory", "votes_yes", "year"))
Define dummy functions that reference objects from imported packages' namespaces.
# katex is used in documentation dynamically via `\Sexpr`, so it has to be imported to avoid an error when rendering the corresponding help page # TODO: since the `R CMD check` warning can be considered a bug in this case, investigate the situation and possibly submit bug report has_katex <- function() { nchar(katex::example_math()) > 0L }
.onLoad <- function(libname, pkgname) { # clear pkgpins cache tryCatch(expr = pkgpins::clear_cache(board = pkgpins::board(pkg = pkgname), max_age = pal::pkg_config_val(key = "global_max_cache_age", pkg = pkgname)), error = function(e) cli::cli_alert_warning(text = "Failed to clear pkgpins cache on load of {.pkg {pkgname}}. Error message: {e$message}")) }
Assembles a custom CSS file by combining rmarkdown's default lightweight vignette stylesheet with our custom pkgdown CSS.
The R code chunk below is not included in the source package (`purl = FALSE`) and thus has to be executed manually in order to regenerate the custom CSS file.
rlang::check_installed("brio") fs::path_package("rmarkdown/templates/html_vignette/resources/vignette.css", package = "rmarkdown") %>% brio::read_file() %>% paste0("/* BEGIN custom CSS */\n", brio::read_file("pkgdown/extra.css"), "/* END custom CSS */\n") %>% brio::write_file(path = "vignettes/custom.css")
api_failure
api_failure <- function(parsed, raw = NULL, prefix = "") { env <- parent.frame(n = 2L) assign(x = "parsed", value = parsed, pos = env) msg_part_val <- ifelse(utils::hasName(parsed$error, "value"), paste0(": ", paste0("{.var ", names(parsed$error$value), "}: {.warn ", parsed$error$value, "}", collapse = ", ")), "") msg_part_error <- ifelse(utils::hasName(parsed, "error"), paste0("error {.err {parsed$error$id}}", msg_part_val), "{.err {.y}}.") cli_div_id <- cli::cli_div(theme = cli_theme) cli::cli_alert_warning(paste0(prefix, "The API server responded with ", msg_part_error), .envir = env) if (!is.null(raw)) { cli::cli_alert_info("The following JSON payload was sent: {.content {raw}}") } cli::cli_end(id = cli_div_id) }
as_fm_list
Convert to formula-list (mainly to be fed to `dplyr::case_match()`).
as_fm_list <- function(x) { purrr::imap(x, ~ rlang::new_formula(lhs = .y, rhs = .x, env = emptyenv())) }
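A hypothetical usage sketch: a named vector becomes a list of two-sided formulas that can be spliced into `dplyr::case_match()` (which accepts dynamic dots); the example values are illustrative only:

``` r
# hypothetical usage: turn a named vector into a recoding map for dplyr::case_match()
recode_map <- as_fm_list(c("Acceptance" = "yes",
                           "Rejection" = "no"))
# recode_map is roughly list("Acceptance" ~ "yes", "Rejection" ~ "no")

dplyr::case_match(c("Acceptance", "Rejection"), !!!recode_map)
# -> c("yes", "no")
```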
assemble_query_filter
NOTE: `date` is not stored as an actual MongoDB `Date` type in the database, so we can't filter on it. It has been requested to change this upstream.
#' Assemble MongoDB query filter document #' #' @param country_code The `country_code`(s) to be included. A character vector. #' @param subnational_entity_name The `subnational_entity_name`(s) to be included. A character vector. #' @param municipality The `municipality`(s) to be included. A character vector. #' @param level The `level`(s) to be included. A character vector. #' @param type The `type`(s) to be included. A character vector. #' @param date_min The minimum `date` to be included. A [date][Date] or something coercible to. #' @param date_max The maximum `date` to be included. A [date][Date] or something coercible to. #' @param is_draft `TRUE` means to include only referendum entries with _draft_ status, `FALSE` to include only normal entries. Set to `NULL` in order to #' include both draft and normal entries. #' @param date_time_created_min The minimum `date_time_created` to be included. A [datetime][base::DateTimeClasses], or something coercible to (like #' `"2006-01-02"` or `"2006-01-02T15:04:05Z"`; assumed to be in UTC if no timezone is given). #' @param date_time_created_max The maximum `date_time_created` to be included. A [datetime][base::DateTimeClasses], or something coercible to (like #' `"2006-01-02"` or `"2006-01-02T15:04:05Z"`; assumed to be in UTC if no timezone is given). #' @param date_time_last_edited_min The minimum `date_time_last_edited` to be included. A [datetime][base::DateTimeClasses], or something coercible to (like #' `"2006-01-02"` or `"2006-01-02T15:04:05Z"`; assumed to be in UTC if no timezone is given). #' @param date_time_last_edited_max The maximum `date_time_last_edited` to be included. A [datetime][base::DateTimeClasses], or something coercible to (like #' `"2006-01-02"` or `"2006-01-02T15:04:05Z"`; assumed to be in UTC if no timezone is given). #' @param query_filter A valid [MongoDB JSON query filter document](https://docs.mongodb.com/manual/core/document/#query-filter-documents) which allows for #' maximum control over what data is included. This takes precedence over all of the above listed parameters, i.e. if `query_filter` is provided, the #' parameters `r formals(assemble_query_filter) |> names() |> setdiff(c("query_filter", "base64_encode")) |> pal::enum_str(wrap = "\x60")` are ignored. #' @param base64_encode Whether or not to [Base64](https://en.wikipedia.org/wiki/Base64)-encode the resulting query filter document. Note that the #' `query_filter` argument provided to other functions of this package must be Base64-encoded. #' #' @return A character scalar containing a valid [MongoDB JSON query filter document](https://docs.mongodb.com/manual/core/document/#query-filter-documents), #' [Base64](https://en.wikipedia.org/wiki/Base64)-encoded if `base64_encode = TRUE`. 
#' @keywords internal assemble_query_filter <- function(country_code = NULL, subnational_entity_name = NULL, municipality = NULL, level = NULL, type = NULL, date_min = NULL, date_max = NULL, is_draft = NULL, date_time_created_min = NULL, date_time_created_max = NULL, date_time_last_edited_min = NULL, date_time_last_edited_max = NULL, query_filter = NULL, base64_encode = TRUE) { checkmate::assert_string(query_filter, null.ok = TRUE) checkmate::assert_flag(base64_encode) # assemble JSON query filter document if `query_filter` is not provided if (is.null(query_filter)) { purrr::map_chr(.x = country_code, .f = checkmate::assert_choice, choices = val_set$country_code, null.ok = TRUE, .var.name = "country_code") checkmate::assert_character(subnational_entity_name, any.missing = FALSE, null.ok = TRUE) checkmate::assert_character(municipality, any.missing = FALSE, null.ok = TRUE) purrr::map_chr(.x = level, .f = checkmate::assert_choice, choices = var_vals("level"), null.ok = TRUE, .var.name = "level") purrr::map_chr(.x = type, .f = checkmate::assert_choice, choices = var_vals("type"), null.ok = TRUE, .var.name = "type") checkmate::assert_flag(is_draft, null.ok = TRUE) date_min %<>% lubridate::as_date() date_max %<>% lubridate::as_date() date_time_created_min %<>% lubridate::as_datetime(tz = "UTC") date_time_created_max %<>% lubridate::as_datetime(tz = "UTC") date_time_last_edited_min %<>% lubridate::as_datetime(tz = "UTC") date_time_last_edited_max %<>% lubridate::as_datetime(tz = "UTC") query_filter <- list(country_code = query_filter_in(country_code), canton = query_filter_in(subnational_entity_name), municipality = query_filter_in(municipality), level = query_filter_in(level), institution = type %>% pal::when(length(.) == 0L ~ ., ~ dplyr::case_match(.x = ., "citizens' assembly" ~ "citizen assembly", .default = .) %>% stringr::str_to_sentence()) %>% query_filter_in(), date = query_filter_date(min = date_min, max = date_max), draft = is_draft, created_on = query_filter_datetime(min = date_time_created_min, max = date_time_created_max), date_time_last_edited = query_filter_datetime(min = date_time_last_edited_min, max = date_time_last_edited_max)) %>% # remove `NULL` elements purrr::compact() %>% # convert to JSON jsonlite::toJSON(POSIXt = "ISO8601", auto_unbox = TRUE, digits = NA, pretty = FALSE) } if (base64_encode) { query_filter %<>% jsonlite::base64_enc() } query_filter }
assert_api_success
assert_api_success <- function(x) { if (!is.null(x$error$id)) { cli_div_id <- cli::cli_div(theme = cli_theme) cli::cli_abort("API server responded with error {.err {x$error$id}}") cli::cli_end(id = cli_div_id) } invisible(x) }
assert_cols_absent
assert_cols_absent <- function(data, type) { type <- rlang::arg_match0(arg = type, values = unique(unlist(data_cols_absent$type))) cols <- data_cols_absent %>% dplyr::filter(purrr::map_lgl(type, ~ !!type %in% .x)) %$% col col_names <- colnames(data) purrr::walk(cols, ~ { if (.x %in% col_names) { data_cols_absent %>% dplyr::filter(col == !!.x & purrr::map_lgl(type, ~ !!type %in% .x)) %$% msg %>% cli::cli_abort() } }) invisible(data) }
assert_cols_valid
assert_cols_valid <- function(data, type = c("validate", "add", "edit"), action = cli::cli_abort, cli_progress_id = NULL) { type <- rlang::arg_match(type) non_na_col_names <- c("id", "country_code", "date", "level") na_col_names <- data %>% dplyr::select(any_of(non_na_col_names)) %>% dplyr::filter(dplyr::if_any(.cols = everything(), .fns = is.na)) %>% dplyr::select(where(~ anyNA(.x))) %>% colnames() n_na_col_names <- length(na_col_names) if (n_na_col_names) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action("Detected {n_na_col_names} column{?s} in {.arg data} that contain forbidden {.val NA}s: {.var {na_col_names}}") } ## check `id` if ("id" %in% colnames(data) && anyDuplicated(data$id)) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action("Duplicated {.var id}s detected. IDs must be unique.") } ## check `date` if ("date" %in% colnames(data)) { check <- checkmate::check_date(data$date, any.missing = FALSE) if (!isTRUE(check)) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action("Failed to validate {.var data$date}. {check}") } } ## check `level` if ("level" %in% colnames(data)) { check <- checkmate::check_subset(as.character(data$level), choices = var_vals("level")) if (!isTRUE(check)) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action("Failed to validate {.var data$level}. {check}") } } ## check `country_code` if ("country_code" %in% colnames(data)) { check <- checkmate::check_subset(as.character(data$country_code), choices = val_set$country_code) if (!isTRUE(check)) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action("Failed to validate {.var data$country_code}. {check}") } ## ensure `position_government` is present for additions when `country_code = "CH" & level = "national"` if (type == "add" && (data %>% dplyr::filter(country_code == "CH" & level == "national") %>% nrow() %>% magrittr::is_greater_than(0L)) && !("position_government" %in% colnames(data))) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action(paste0("Referendums with {.code country_code = \"CH\" & level = \"national\"} present in {.arg data} but column {.var ", "position_government} is missing.")) } } ## check `subnational_entity_name` ## TODO: check `subnational_entity_code` instead once it's available if (any(data[["level"]] != "national")) { if (!("subnational_entity_name" %in% colnames(data))) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action(paste0("Referendums of {.var level} below {.val national} present in {.arg data} but column {.var subnational_entity_name} is missing.")) } ix_missing_subnational_entities <- data %>% tibble::rowid_to_column() %>% dplyr::filter(level != "national" & is.na(subnational_entity_name)) %$% rowid n_missing_subnational_entities <- length(ix_missing_subnational_entities) if (n_missing_subnational_entities) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action(paste0("{n_missing_subnational_entities} row{?s} in {.arg data} {?is/are} missing a {.var subnational_entity_name}. 
Affected {?is/are} ", "the row{?s} with ind{?ex/ices} {.val {ix_missing_subnational_entities}}.")) } } if ("subnational_entity_name" %in% colnames(data)) { ix_illegal_subnational_entities <- data %>% tibble::rowid_to_column() %>% dplyr::filter(level == "national" & !is.na(subnational_entity_name)) %$% rowid n_illegal_subnational_entities <- length(ix_illegal_subnational_entities) if (n_illegal_subnational_entities) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action(paste0("{n_illegal_subnational_entities} row{?s} in {.arg data} {?has/have} a {.var subnational_entity_name} set although they are on the ", "national level. Affected {?is/are} the row{?s} with ind{?ex/ices} {.val {ix_illegal_subnational_entities}}.")) } } ## check `municipality` if (any(data[["level"]] == "local")) { if (!("municipality" %in% colnames(data))) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action(paste0("Referendums of {.var level = \"local\"} present in {.arg data} but column {.var municipality} is missing.")) } ix_missing_municipalities <- data %>% tibble::rowid_to_column() %>% dplyr::filter(level == "local" & is.na(municipality)) %$% rowid n_missing_municipalities <- length(ix_missing_municipalities) if (n_missing_municipalities) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action(paste0("{n_missing_municipalities} row{?s} in {.arg data} {?is/are} missing a {.var municipality}. Affected {?is/are} the row{?s} with ", "ind{?ex/ices} {.val {ix_missing_subnational_entities}}.")) } } if ("municipality" %in% colnames(data)) { ix_illegal_municipalities <- data %>% tibble::rowid_to_column() %>% dplyr::filter(level != "local" & !is.na(municipality)) %$% rowid n_illegal_municipalities <- length(ix_illegal_municipalities) if (n_illegal_municipalities) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action(paste0("{n_illegal_municipalities} row{?s} in {.arg data} {?has/have} a {.var municipality} set although they are not on the local level. ", "Affected {?is/are} the row{?s} with ind{?ex/ices} {.val {ix_illegal_municipalities}}.")) } } ## check variables that are only meant to be set for Swiss national referendums ## TODO: Remove this as soon as [issue #52](https://github.com/zdaarau/c2d-app/issues/52) is resolved. ### `votes_per_subterritory` if (all(c("votes_per_subterritory", "level", "country_code") %in% colnames(data))) { ix_illegal_votes_per_subterritory <- data %>% tibble::rowid_to_column() %>% dplyr::filter((level != "national" | country_code != "CH") & !purrr::map_lgl(votes_per_subterritory, is.null)) %$% rowid n_illegal_votes_per_subterritory <- length(ix_illegal_votes_per_subterritory) if (n_illegal_votes_per_subterritory) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action(paste0("{n_illegal_votes_per_subterritory} row{?s} in {.arg data} {?has/have} a {.var position_government} set although they are not Swiss ", "national referendums. 
Affected {?is/are} the row{?s} with ind{?ex/ices} {.val {ix_illegal_votes_per_subterritory}}.")) } } ### non-list vars c("lower_house_yes", "lower_house_no", "lower_house_abstentions", "upper_house_yes", "upper_house_no", "upper_house_abstentions", "position_government") %>% purrr::walk(function(var_name) { if (all(c(var_name, "level", "country_code") %in% colnames(data))) { ix_illegal <- data %>% tibble::rowid_to_column() %>% dplyr::filter((level != "national" | country_code != "CH") & !is.na(!!as.symbol(var_name))) %$% rowid n_illegal <- length(ix_illegal) if (n_illegal) { cli::cli_progress_done(id = cli_progress_id, result = "failed") action(paste0("{n_illegal} row{?s} in {.arg data} {?has/have} a {.var {var_name}} set although {cli::qty(n_illegal)}{?it is not a/they are not} ", "Swiss national referendum{?s}. Affected {?is/are} the row{?s} with ind{?ex/ices} {.val {ix_illegal}}.")) } } }) invisible(data) }
assert_content
assert_content <- function(x) { if (!nchar(x)) { cli::cli_abort("Received empty response from RDB API. Please debug.", .internal = TRUE) } invisible(x) }
auth_session
TODO:
#' Authenticate a user session for the [RDB API](https://github.com/zdaarau/c2d-app/blob/master/docs/services.md#1-reflexive-routes) #' #' Creates a new user session token if necessary. The token is stored in the R option `rdb.user_session_tokens`, a [tibble][tibble::tbl_df] with the columns #' `email`, `token` and `date_time_last_active`. #' #' `email` and `password` default to the [package configuration options][pkg_config] `api_username` and `api_password` respectively. #' #' User session tokens expire automatically after 15 days of inactivity. #' #' @inheritParams url_api #' @param email The e-mail address of the user for which a session should be created. A character scalar. #' @param password The password of the user for which a session should be created. A character scalar. #' @param quiet `r pkgsnip::param_lbl("quiet")` #' #' @return The user session token as a character scalar, invisibly. #' @keywords internal auth_session <- function(email = pal::pkg_config_val(key = "api_username", pkg = this_pkg), password = pal::pkg_config_val(key = "api_password", pkg = this_pkg), use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg), quiet = FALSE) { checkmate::assert_string(email, min.chars = 3L) checkmate::assert_string(password, min.chars = 1L) checkmate::assert_flag(quiet) # get existing tokens or initialize empty tibble tokens <- getOption("rdb.user_session_tokens") %>% pal::when(all(c("email", "token", "date_time_last_active") %in% colnames(.)) ~ ., ~ tibble::tibble(email = character(), token = character(), is_testing_server = logical(), date_time_last_active = as.POSIXct(NULL))) # extract latest token token <- tokens %>% dplyr::filter(email == !!email & is_testing_server == !!use_testing_server) if (nrow(token)) { token %<>% dplyr::filter(date_time_last_active == max(date_time_last_active)) token %<>% .[1L, ] } # ensure token is not expired (checked if older than 14 days), else set to `NULL` if (nrow(token) && checkmate::test_string(token$token, min.chars = 1L) && ((token$date_time_last_active > clock::add_days(clock::date_now(zone = "UTC"), -14L)) || !is_session_expired(token = token$token, use_testing_server = use_testing_server))) { token <- token$token } else { token <- NULL } # create new session if necessary if (is.null(token)) { if (!quiet) { status_msg <- "Authenticating new user session" cli::cli_progress_step(msg = status_msg, msg_done = paste(status_msg, "done"), msg_failed = paste(status_msg, "failed")) } token <- httr::RETRY(verb = "POST", url = url_api("users/session", .use_testing_server = use_testing_server), config = httr::add_headers(Origin = url_admin_portal(.use_testing_server = use_testing_server)), times = 3L, encode = "json", body = list(email = email, password = password)) %>% # ensure we actually got a JSON response pal::assert_mime_type(mime_type = "application/json", msg_suffix = mime_error_suffix) %>% # parse response httr::content(as = "parsed") %$% token if (!quiet) { cli::cli_progress_done() } } # update `rdb.user_session_tokens` option options(rdb.user_session_tokens = tokens %>% dplyr::filter(token != !!token) %>% tibble::add_row(email = email, token = token, is_testing_server = use_testing_server, date_time_last_active = clock::date_now(zone = "UTC"))) # return token invisible(token) }
md_link_codebook
md_link_codebook <- function(var_names) { purrr::map_chr(var_names, \(x) paste0("[`", x, "`](", url_codebook(x), ")")) }
country_code_to_name
country_code_to_name <- function(country_code) { purrr::map2_chr(.x = country_code, .y = nchar(as.character(country_code)) > 2L, .f = ~ { if (isTRUE(.y)) { result <- data_iso_3166_3 %>% dplyr::filter(Alpha_4 == !!.x) %$% name_short } else { result <- data_iso_3166_1 %>% dplyr::filter(Alpha_2 == !!.x) %$% name_short } if (length(result) == 0L) { result <- NA_character_ } result }) }
field_to_var_name
field_to_var_name <- function(x) { x %>% purrr::map_chr(~ var_names[[.x]] %||% .x) }
derive_country_vars
TODO:

`Date_withdrawn` once pkg ISOcodes is recent enough to include this information. This would be correct (although it shouldn't really matter for the current functionality).

NOTES:

This fn is designed to determine the proper `country_code` only when the input `country_code` (extracted from `id_sudd`) is unambiguous for `date` (via the `!!date <= (clock::add_years(Date_withdrawn, 50L))` clause in the filter statement). Thus a warning would be thrown if sudd.ch continued using an obsolete country code 50 years after its official deletion. This is necessary because deleted country codes can be re-assigned after a transitional period of at least 50 years.
sudd.ch includes unofficial, unconventional as well as satirical referendums in the following (pseudo-)countries that naturally don't have ISO 3166-1 codes:
| `territory_name_de` | example `id_sudd` | remarks                                          |
|---------------------|-------------------|--------------------------------------------------|
| Asgardia            | `zz012017`        | aka Space Kingdom of Asgardia                    |
| Maghrebinien        | `mb011902`        | based on satirical fiction of Gregor von Rezzori |
derive_country_vars <- function(country_code, date) { country_code %<>% as.character() subnational_entity_code <- NA_character_ # handle subnational entities ## Ascension if (country_code == "AC") { country_code <- "SH" subnational_entity_code <- "SH-AC" } # assign canonical pseudo codes ## Kosovo country_code %<>% dplyr::case_match(.x = ., "KS" ~ "XK", .default = .) data_former <- data_iso_3166_3 %>% dplyr::filter(Alpha_2 == !!country_code & !!date <= (clock::add_years(Date_withdrawn, 50L))) %>% dplyr::filter(Date_withdrawn == pal::safe_max(Date_withdrawn)) is_former <- nrow(data_former) > 0L is_current <- !is_former && country_code %in% data_iso_3166_1$Alpha_2 if (!(is_former || is_current) && !(country_code %in% country_codes_sudd_invalid)) { cli::cli_alert_warning("Neither ISO 3166-1 alpha-2 nor ISO 3166-3 alpha-4 {.var country_code} found for {.val {country_code}}.") } country_code <- country_code %>% pal::when(is_former ~ data_former %>% dplyr::filter(Date_withdrawn == min(Date_withdrawn)) %>% assertr::verify(nrow(.) == 1L) %$% Alpha_4, is_current ~ country_code, ~ NA_character_) tibble::tibble(country_code = country_code, country_name = country_code_to_name(country_code), is_former_country = is_former, subnational_entity_code = subnational_entity_code) }
drop_disabled_vars
drop_disabled_vars <- function(data, to_drop) { to_drop_present <- intersect(to_drop, colnames(data)) n_to_drop_present <- length(to_drop_present) if (n_to_drop_present) { cli::cli_alert_warning(paste0("The {cli::qty(n_to_drop_present)} column{?s} {.var {to_drop_present}} in {.arg data} are ignored because setting/altering ", "the corresponding values is disabled.")) data %<>% dplyr::select(-any_of(to_drop)) } data }
drop_implicit_vars
drop_implicit_vars <- function(data, type = c("add", "edit")) { type <- rlang::arg_match(type) to_drop <- data_cols_absent %>% dplyr::filter(purrr::map_lgl(type, ~ !!type %in% .x)) %$% col data %>% dplyr::select(-any_of(to_drop)) }
drop_non_applicable_vars
drop_non_applicable_vars <- function(data) { if ("level" %in% colnames(data)) { if (data$level != "local") { data %<>% dplyr::select(-any_of("municipality")) } if (data$level == "national") { data %<>% dplyr::select(-any_of("subnational_entity_name")) } # TODO: remove this as soon as [issue #52](https://github.com/zdaarau/c2d-app/issues/52) is resolved if (data$level != "national" || data$country_code != "CH") { data %<>% dplyr::select(-any_of(c("votes_per_subterritory", "lower_house_yes", "lower_house_no", "lower_house_abstentions", "upper_house_yes", "upper_house_no", "upper_house_abstentions", "position_government"))) } } data %<>% dplyr::select(-any_of(c( "files", "is_former_country", # TODO: remove this as soon as [issue #81](https://github.com/zdaarau/c2d-app/issues/81) is fixed "sources" ))) data }
fct_flip
fct_flip <- function(x) { checkmate::assert_factor(x, n.levels = 2L) flip_map <- levels(x) %>% magrittr::set_names(value = rev(.)) %>% as.list() x %>% forcats::fct_recode(!!!flip_map) }
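A hypothetical usage sketch (relying on `forcats::fct_recode()` renaming levels in place):

``` r
# hypothetical usage: swap the two levels of a binary factor
x <- factor(c("yes", "no", "yes"), levels = c("no", "yes"))
fct_flip(x)
# -> values c("no", "yes", "no"), with the level order reversed to c("yes", "no")
```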
flatten_array_as_is
A helper function to apply to list elements before converting the list to JSON using `jsonlite::toJSON(auto_unbox = TRUE)`.
flatten_array_as_is <- function(x) { x %<>% unlist() if (!is.null(x)) { x %<>% I() } x }
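A minimal illustration of why the `I()` wrapper matters: with `auto_unbox = TRUE`, a length-1 vector would otherwise be serialized as a JSON scalar instead of a single-element array (behaviour as documented for jsonlite; the tag value is made up):

``` r
# without the helper, a single tag collapses to a JSON scalar ...
jsonlite::toJSON(list(tags = "economy"), auto_unbox = TRUE)
#> {"tags":"economy"}

# ... while the `I()` wrapper added by `flatten_array_as_is()` keeps it an array
jsonlite::toJSON(list(tags = flatten_array_as_is("economy")), auto_unbox = TRUE)
#> {"tags":["economy"]}
```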
httr_config
Before 2021-05-25, the RDB API server didn't provide the intermediate R3 certificate (issued by Let's Encrypt), and since curl doesn't yet support Authority Information Access to discover it automatically, we had to specify it manually. Thus, the certificate was shipped with this package (found under `certs/3479778542.crt`). Because this isn't needed anymore, the cert as well as the code chunk below (`purl = FALSE`) are excluded from package builds but kept in case they are needed again at some point in the future.
httr_config <- function() { httr::config(cainfo = fs::path_package(package = this_pkg, "certs", "3479778542.crt")) }
is_session_expired
is_session_expired <- function(token, use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg)) { response <- httr::RETRY(verb = "GET", url = url_api("users/profile", .use_testing_server = use_testing_server), config = httr::add_headers(Authorization = paste("Bearer", token), Origin = url_admin_portal(.use_testing_server = use_testing_server)), times = 3L) %>% # ensure we actually got a JSON response pal::assert_mime_type(mime_type = "application/json", msg_suffix = mime_error_suffix) %>% # parse response httr::content(as = "parsed") is.null(response[["profile"]]) }
lower_non_abbrs
#' Lowercase non-abbreviations #' #' @param x A character vector. #' #' @return A character vector of the same length as `x`. #' @keywords internal lower_non_abbrs <- function(x) { x %>% stringr::str_split(pattern = "\\b") %>% purrr::map_chr(~ .x %>% dplyr::if_else(stringr::str_detect(string = ., pattern = "^[^[:lower:]]+$"), ., stringr::str_to_lower(.)) %>% paste0(collapse = "")) }
order_rfrnd_cols
order_rfrnd_cols <- function(data) { data %>% dplyr::relocate(any_of(rfrnd_cols_order)) }
parse_datetime
NOTES:

So far, date-times returned by the RDB API were encountered in 3 different formats:

1. As MongoDB `Date` type in canonical mode:

   ``` json
   "created_on": {
     "$date": {
       "$numberLong": "-3280780800000"
     }
   }
   ```

2. As MongoDB `Date` type in relaxed mode:

   ``` json
   "created_on": {
     "$date": "2022-03-16T13:52:10.907Z"
   }
   ```

3. In a format similar to 1) whose name/definition I couldn't find:

   ``` json
   "created_on": {
     "$date": -3280780800000
   }
   ```

Format 3) was the first one encountered; formats 1) and 2) were encountered on the API's testing instance.
parse_datetime <- function(x) { x %<>% unlist(use.names = FALSE) if (is.character(x) && stringr::str_detect(string = x, pattern = "^-?\\d+$", negate = TRUE)) { result <- x %>% clock::naive_time_parse(format = "%Y-%m-%dT%H:%M:%SZ", precision = "millisecond") %>% clock::as_date_time(zone = "UTC") } else { result <- as.numeric(x) %>% magrittr::divide_by(1000L) %>% as.POSIXct(origin = "1970-01-01", tz = "UTC") } result }
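A usage sketch against the three shapes listed above (assuming the JSON payload has already been converted to an R list, e.g. via `jsonlite::fromJSON()`); all of them should end up as a UTC `POSIXct`:

``` r
parse_datetime(list(`$date` = list(`$numberLong` = "-3280780800000")))  # format 1)
parse_datetime(list(`$date` = "2022-03-16T13:52:10.907Z"))              # format 2)
parse_datetime(list(`$date` = -3280780800000))                          # format 3)
```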
plot_share_per_period
NOTES:

The Plotly R package doesn't properly support `layout.yaxis.title.standoff` to vary the margin between the axis title and the axis tick labels. Thus, we work around this by adding a trailing space to our `layout.yaxis.ticksuffix`, which together with the default `layout.yaxis.automargin` results in a margin of a few pixels.

plot_share_per_period <- function(data_freq, x, period) {
  rlang::check_installed("plotly", reason = pal::reason_pkg_required())
  grid_step <- switch(EXPR = period, week = 4L, year = 50L, decade = 50L, century = 100L, 1L)
  grid_x <- seq(from = ceiling(pal::safe_min(data_freq[[period]])[1L] / grid_step) * grid_step,
                to = floor(pal::safe_max(data_freq[[period]])[1L] / grid_step) * grid_step,
                by = grid_step)
  plotly::plot_ly(data = data_freq,
                  type = "scatter",
                  mode = "none",
                  stackgroup = "one",
                  groupnorm = "percent",
                  x = ~eval(as.symbol(period)),
                  y = ~n,
                  name = ~eval(as.symbol(x))) %>%
    plotly::layout(hovermode = "x",
                   legend = list(orientation = "h"),
                   xaxis = list(dtick = switch(EXPR = period, week = 1L, month = 1L, quarter = 1L, year = 10L, decade = 10L, century = 100L),
                                showgrid = FALSE,
                                ticklabelstep = switch(EXPR = period, week = 4L, month = 1L, quarter = 1L, year = 5L, decade = 5L, century = 1L),
                                range = switch(EXPR = period, week = c(1L, 53L), month = c(1L, 12L), quarter = c(1L, 4L), NULL),
                                ticks = "outside",
                                title = list(text = NULL)),
                   yaxis = list(fixedrange = TRUE,
                                hoverformat = ".1f",
                                showgrid = FALSE,
                                ticksuffix = "\u2009% ",
                                title = list(text = NULL)),
                   # draw custom grid
                   shapes = grid_x %>% purrr::map(~ list(type = "line", y0 = 0L, y1 = 1L, yref = "paper", x0 = .x, x1 = .x,
                                                         line = list(color = "#fff", width = 0.2))))
}
restore_topics
NOTE that this function doesn't restore redundant information (parent-tier topics). Once issue #41 is resolved, we can deprecate this anyway.
restore_topics <- function(topics_tier_1, topics_tier_2, topics_tier_3) { list(topics_tier_1, topics_tier_2, topics_tier_3) %>% purrr::pmap(~ { ..1 %>% unlist() %>% as.character() %>% checkmate::assert_character(any.missing = FALSE, max.len = 3L, .var.name = "topics_tier_1") ..2 %>% unlist() %>% as.character() %>% checkmate::assert_character(any.missing = FALSE, max.len = 3L, .var.name = "topics_tier_2") ..3 %>% unlist() %>% as.character() %>% checkmate::assert_character(any.missing = FALSE, max.len = 3L, .var.name = "topics_tier_3") topics_hierarchy <- hierarchize_topics(tibble::tibble(topics_tier_1 = list(..1), topics_tier_2 = list(..2), topics_tier_3 = list(..3))) topics <- topics_hierarchy$topic_tier_3 %>% setdiff(NA_character_) if (length(topics) < 3L) { topics <- topics_hierarchy %>% dplyr::filter(is.na(topic_tier_3)) %$% topic_tier_2 %>% setdiff(NA_character_) %>% c(topics) } if (length(topics) < 3L) { topics <- topics_hierarchy %>% dplyr::filter(is.na(topic_tier_3) & is.na(topic_tier_2)) %$% topic_tier_1 %>% setdiff(NA_character_) %>% c(topics) } topics }) }
topic_frequency
Counts the number of topic occurrences. Useful for creating sunburst charts via Plotly.
topic_frequency <- function(topics, tier) { topics %>% purrr::list_c(ptype = character()) %>% factor(levels = topics(tiers = tier)) %>% tibble::tibble(topic = .) %>% dplyr::group_by(topic) %>% dplyr::summarise(n = dplyr::n()) }
tidy_rfrnds
TODO:

`tidy_rfrnds()`, `assemble_query_filter()` and all other affected places.

NOTES:

Depending on the referendum(s), the number of fields returned by the RDB API varies. To systematically test whether `tidy_rfrnds()` works for all cases, run:

``` r
ids <- rdb::rfrnds(use_cache = FALSE) %$% id

ids %>%
  cli::cli_progress_along(name = "Test") %>%
  purrr::walk(~ {
    cli::cli_text("i {.val {.x}} with id {.field {ids[.x]}}")
    rdb::rfrnd(id = ids[.x])
  })
```
#' Tidy "raw" RDB API referendum data #' #' Converts the "raw" MongoDB data from the RDB API to the tidied [rfrnds()] schema. #' #' You can reverse this function again using [untidy_rfrnds()]. #' #' @param data The MongoDB data as a list (converted from the JSON returned by the RDB API using [jsonlite::fromJSON()]). #' @param tidy Whether or not to tidy the referendum data, i.e. apply various data cleansing tasks and add additional variables. If `FALSE`, the raw MongoDB #' referendum data will only be modified just enough to be able to return it as a [tibble][tibble::tbl_df]. Note that untidy data doesn't conform to the #' [codebook][data_codebook] (i.a. different variable names). #' #' @return `r pkgsnip::return_lbl("tibble")` #' @keywords internal tidy_rfrnds <- function(data, tidy = TRUE) { checkmate::assert_flag(tidy) this_env <- rlang::current_env() data %<>% # unnest columns and ensure list type for multi-value columns # NOTE that despite of the [speed-up in v1.1.4](https://github.com/tidyverse/tidyr/releases/tag/v1.1.4), `tidyr::unnest()` is still much slower than our # custom function purrr::map(.f = function(l, category_names = names(l$categories), context_names = names(l$context), title_langs = names(l$title)) { for (name in category_names) { l[[paste0("categories.", name)]] <- l$categories[[name]] } for (name in context_names) { l[[paste0("context.", name)]] <- l$context[[name]] } for (lang in title_langs) { l[[paste0("title.", lang)]] <- l$title[[lang]] } l$categories <- NULL l$context <- NULL l$title <- NULL for (name in c("tags", "categories.action", "categories.special_topics", "categories.excluded_topics")) { l[[name]] %<>% purrr::list_c(ptype = character()) %>% list() } for (name in c("archive", "files", "context.votes_per_canton")) { l[[name]] %<>% list() } l }) %>% # drop empty fields purrr::modify_depth(.depth = 1L, .f = purrr::compact) %>% # convert to tibble purrr::map(tibble::as_tibble_row) %>% purrr::list_rbind() # tidy data if (nrow(data) > 0L && tidy) { data %<>% # rename variables (mind that the MongoDB-based API doesn't demand a fixed schema) pal::rename_from(dict = var_names) %>% # create/recode variables dplyr::mutate( # ensure all supposed to floating-point numbers are actually of type double (JSON API is not reliable in this respect) dplyr::across(any_of(c("subterritories_no", "subterritories_yes", # TODO: remove/adapt next two lines once [issue #78](https://github.com/zdaarau/c2d-app/issues/78) is resolved "date_time_created"["date_time_created" %in% colnames(.) && any(purrr::map_lgl(.$date_time_created, is.numeric))], "date_time_last_edited"["date_time_last_edited" %in% colnames(.) 
&& any(purrr::map_lgl(.$date_time_last_edited, is.numeric))])), ~ purrr::map_dbl(.x, ~ if (is.null(.x)) NA_real_ else as.double(.x))), # use explicit NA values dplyr::across(where(is.integer), ~ dplyr::if_else(.x %in% c(-1L, -2L), NA_integer_, .x)), dplyr::across(where(is.character), ~ dplyr::if_else(.x %in% c("", "-1", "-2"), NA_character_, .x)), dplyr::across(any_of(c("subterritories_yes", "subterritories_no")), ~ dplyr::if_else(.x %in% c(-1.0, -2.0), NA_real_, .x)), dplyr::across(any_of("result"), ~ dplyr::if_else(.x %in% c("Unknown", "Not provided"), NA_character_, .x)), # convert all values to lowercase ## vectors dplyr::across(any_of(c("result", "type", "inst_legal_basis_type", "inst_object_type", "inst_object_legal_level", "inst_object_revision_extent", "inst_trigger_type", "inst_trigger_actor_level", "inst_trigger_time_limit", "inst_quorum_approval", "inst_precondition_decision")), stringr::str_to_lower), ## lists dplyr::across(any_of(c("inst_object_revision_modes", "inst_topics_only", "inst_topics_excluded")), ~ purrr::map(.x = .x, .f = stringr::str_to_lower)), # convert only non-abbreviated values to lowercase dplyr::across(any_of(c("inst_object_author", "inst_trigger_actor", "inst_precondition_actor")), ~ purrr::map_chr(.x = .x, .f = lower_non_abbrs)), # specific recodings ## binary (dummies) dplyr::across(any_of("position_government"), ~ dplyr::case_when(.x == "Acceptance" ~ "yes", .x == "Rejection" ~ "no", .default = NA_character_)), dplyr::across(any_of("inst_has_urgent_legal_basis"), ~ dplyr::case_when(.x == "Urgent" ~ TRUE, .x == "Normal" ~ FALSE, .default = NA)), dplyr::across(any_of("inst_is_binding"), ~ dplyr::case_when(.x == "Binding" ~ TRUE, .x == "Non-binding" ~ FALSE, .default = NA)), dplyr::across(any_of("inst_is_counter_proposal"), ~ dplyr::case_when(.x == "Yes" ~ TRUE, .x == "No" ~ FALSE, .default = NA)), dplyr::across(any_of("inst_is_assembly"), ~ dplyr::case_when(.x == "Assembly" ~ TRUE, .x == "Ballot" ~ FALSE, .default = NA)), dplyr::across(any_of("inst_has_precondition"), ~ dplyr::case_when(.x == "Exists" ~ TRUE, .x == "Does not exist" ~ FALSE, .default = NA)), ## nominal ### flatten `id` id = purrr::list_c(id, ptype = character()), ### split `tags` into separate per-tier vars topics_tier_1 = tags %>% purrr::map(infer_topics, tier = 1L), topics_tier_2 = tags %>% purrr::map(infer_topics, tier = 2L), topics_tier_3 = tags %>% purrr::map(~ .x[.x %in% topics_tier_3_]), ### various cleanups dplyr::across(any_of("type"), ~ dplyr::case_match(.x = .x, "citizen assembly" ~ "citizens' assembly", "not provided" ~ NA_character_, .default = .x)), dplyr::across(any_of(c("inst_trigger_actor", "inst_object_author")), ~ dplyr::case_match(.x = .x, "institution" ~ "other institution", .default = .x)), dplyr::across(any_of("inst_object_type"), ~ dplyr::case_match(.x = .x, "legal text (ausformulierter vorschlag)" ~ "legal text (formulated proposal)", "legal text (allg. 
anregung)" ~ "legal text (general proposal)", .default = .x)), dplyr::across(any_of("inst_topics_only"), ~ purrr::map(.x = .x, .f = \(x) dplyr::case_match(.x = x, "infrastructural act" ~ "infrastructural acts", "competence shift" ~ "competence shifts", "financial act" ~ "financial acts", "financial act (expenses)" ~ "financial acts (expenses)", "financial act (taxes)" ~ "financial acts (taxes)", "financial act (obligations)" ~ "financial acts (obligations)", "total revision of the constitution" ~ "total revisions of the constitution", .default = x))), dplyr::across(any_of("inst_topics_excluded"), ~ purrr::map(.x = .x, .f = \(x) dplyr::case_match(.x = x, "budget" ~ "budgets", "parliamentary competence" ~ "everything within parliamentary competence", .default = x))), dplyr::across(any_of("inst_quorum_turnout"), ~ stringr::str_replace_all(string = .x, pattern = c("^(\\s+)?>(\\s+)?" = ">\u202f", "(\\s+)?%(\\s+)?$" = "\u202f%"))), ## ordinal ## interval # TODO: Remove else-clauses once [this](https://github.com/zdaarau/c2d-app/commit/6b72d1928e0182f01b188f3973ba15482fc8c04a) is deployed to # production date = if (is.list(date)) { clock::as_date(parse_datetime(date)) } else { clock::date_parse(date) }, dplyr::across(any_of(c("date_time_created", "date_time_last_edited")), parse_datetime), ## undefined files = files %>% purrr::map(~ .x %>% purrr::map(~ .x %>% # unnest and restore `date` purrr::modify_in(.where = "date", .f = parse_datetime) %>% # change subvariable names pal::rename_from(dict = sub_var_names$files)))) # complement `id_official` and `id_sudd` (a two-letter country code plus a 6-digit number) by old `number` # TODO: once [issue #?](https://github.com/zdaarau/c2d-app/issues/?) is resolved: # - correct this upstream using `edit_rfrnds()` # - remove corresponding code below # - file issue to completely get rid of field `number` if ("number" %in% colnames(data)) { data %<>% dplyr::mutate(number = dplyr::if_else(number %in% c("0", ""), NA_character_, number), dplyr::across(any_of("id_official"), ~ dplyr::if_else(is.na(.x) & stringr::str_detect(number, "^\\d"), number, .x)), dplyr::across(any_of("id_sudd"), ~ dplyr::if_else(is.na(.x) & stringr::str_detect(number, "^\\D"), # everything beyond the 8th char seems to be manually added -> strip! 
stringr::str_sub(string = number, end = 8L), .x))) } # ensure `id_official` and `id_sudd` are present if (!("id_official" %in% colnames(data))) data$id_official <- NA_character_ if (!("id_sudd" %in% colnames(data))) data$id_sudd <- NA_character_ # TODO: remove this once [issue #]() has been resolved # create `inst_is_variable/divisible` if necessary if ("categories.referendum_text_options" %in% colnames(data)) { if (!("inst_is_variable" %in% colnames(data))) { data %<>% dplyr::mutate(inst_is_variable = dplyr::case_when( categories.referendum_text_options %in% c("Variants possible", "Variants / splitting up possible") ~ TRUE, is.na(categories.referendum_text_options) ~ NA, .default = FALSE )) } if (!("inst_is_divisible" %in% colnames(data))) { data %<>% dplyr::mutate(inst_is_divisible = dplyr::case_when( categories.referendum_text_options %in% c("Splitting up possible", "Variants / splitting up possible") ~ TRUE, is.na(categories.referendum_text_options) ~ NA, .default = FALSE )) } } # ensure all country codes are known and assign canonical country name data %<>% add_country_name() data %<>% # remove obsolete vars dplyr::select(-any_of(c("categories.referendum_text_options", "country_code_historical", "is_past_jurisdiction", "number", "tags"))) %>% # convert to (ordered) factor where appropriate ## based on codebook dplyr::mutate(dplyr::across(everything(), ~ { metadata <- data_codebook %>% dplyr::filter(variable_name == dplyr::cur_column()) if (nrow(metadata) != 1L) { cli::cli_abort("Missing codebook metadata! Please debug", .internal = TRUE) } if (is.factor(unlist(metadata$ptype))) { lvls <- levels(unlist(metadata$ptype)) is_ordered <- metadata$value_scale %in% c("ordinal_ascending", "ordinal_descending") if (is.list(.x)) { .x %>% purrr::map(.f = factor, levels = lvls, ordered = is_ordered) } else { factor(x = .x, levels = lvls, ordered = is_ordered) } } else { .x } })) %>% ## fctrs without explicit variable_values set in codebook dplyr::mutate( ### fctrs where we defined a finite set of values country_code = factor(x = country_code, levels = val_set$country_code, ordered = FALSE), ### fctrs where we did not define a finite set of values (yet) dplyr::across(any_of(c("subnational_entity_name", "municipality")), as.factor) ) %>% # add vars which aren't always included and coerce to proper types vctrs::tib_cast(to = data_codebook %>% dplyr::filter(!is_opt) %$% magrittr::set_names(x = ptype, value = variable_name) %>% tibble::as_tibble(), call = this_env) %>% # harmonize col order order_rfrnd_cols() } # convert nested list cols to tibbles data %>% dplyr::mutate(dplyr::across(any_of(c("files", "votes_per_subterritory")), ~ purrr::map(.x, \(x) if (length(x) > 0L) x %>% purrr::map(tibble::as_tibble) %>% purrr::list_rbind() else NULL)), dplyr::across(any_of("archive"), ~ purrr::map(.x, \(x) if (length(x) > 0L) tibble::as_tibble(x) else NULL))) %>% # add variable labels (must be done at last since mutations above drop attrs) labelled::set_variable_labels(.labels = var_lbls, .strict = FALSE) }
untidy_date
untidy_date <- function(x) { as.numeric(x) %>% magrittr::multiply_by(1000.0) %>% as.list() %>% magrittr::set_names(rep("$date", times = length(.))) }
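A minimal round-trip illustration (hypothetical value):

``` r
x <- as.POSIXct("2022-03-16 13:52:10", tz = "UTC")
untidy_date(x)
# -> list(`$date` = 1647438730000), i.e. milliseconds since the Unix epoch,
#    ready to be serialized back into the MongoDB `$date` representation
```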
untidy_rfrnds
#' Untidy into "raw" RDB API referendum data #' #' Converts from the tidied [rfrnds()] to the "raw" MongoDB schema used by the RDB API. Basically reverts [tidy_rfrnds()]. #' #' @param data The data to untidy as returned by [rfrnds()]. #' @param as_tibble Whether or not to return the result as a [tibble][tibble::tbl_df]. If `FALSE`, a list is returned. #' #' @return #' If `as_tibble = FALSE`, a list with one element per referendum, suitable to be converted [jsonlite::toJSON()] and then fed to the RDB API. #' #' Otherwise a [tibble][tibble::tbl_df] of the same format as returned by [`rfrnds(tidy = FALSE)`][rfrnds]. #' @keywords internal untidy_rfrnds <- function(data, as_tibble = FALSE) { checkmate::assert_flag(as_tibble) var_names_inverse <- names(var_names) %>% magrittr::set_names(purrr::list_c(var_names, ptype = character())) sub_var_names_files_inverse <- names(sub_var_names$files) %>% magrittr::set_names(purrr::list_c(sub_var_names$files, ptype = character())) # restore `number` if (all(c("id_official", "id_sudd") %in% colnames(data))) { data %<>% dplyr::mutate(id_sudd = dplyr::if_else(is.na(id_sudd), id_official, id_sudd)) } data %<>% # remove variable labels labelled::remove_var_label() %>% dplyr::mutate( # restore strings dplyr::across(c(any_of("date"), where(is.factor)), as.character), dplyr::across(where(is.list), ~ { if (is.factor(.x[[1L]])) { .x %>% purrr::map(as.character) } else { .x } }), # restore dates dplyr::across(any_of(c("date_time_created", "date_time_last_edited")), untidy_date), # restore individual variables ## `files` dplyr::across(any_of("files"), ~ purrr::map(.x = .x, .f = \(x) { if ("date_time_attached" %in% colnames(x)) { x$date_time_attached %<>% untidy_date() } x %<>% pal::rename_from(dict = sub_var_names_files_inverse) })), ## `inst_topics_excluded` dplyr::across(any_of("inst_topics_excluded"), ~ purrr::map(.x = .x, .f = \(x) dplyr::case_match(.x = x, "budgets" ~ "budget", .default = x))), ## `inst_topics_only` dplyr::across(any_of("inst_topics_only"), ~ purrr::map(.x = .x, .f = \(x) dplyr::case_match(.x = x, "infrastructural acts" ~ "infrastructural act", "competence shifts" ~ "competence shift", "financial acts" ~ "financial act", "financial acts (expenses)" ~ "financial act (expenses)", "financial acts (taxes)" ~ "financial act (taxes)", "financial acts (obligations)" ~ "financial act (obligations)", "total revisions of the constitution" ~ "total revision of the constitution", .default = x))), ## `inst_object_type` dplyr::across(any_of("inst_object_type"), ~ dplyr::case_match(.x = .x, "legal text (formulated proposal)" ~ "legal text (ausformulierter vorschlag)", "legal text (general proposal)" ~ "legal text (allg. 
anregung)", .default = .x)), ## `inst_trigger_actor`, `inst_object_author` dplyr::across(any_of(c("inst_trigger_actor", "inst_object_author")), ~ dplyr::case_match(.x = .x, "other institution" ~ "institution", .default = .x)), ## `inst_precondition_actor` dplyr::across(any_of("inst_precondition_actor"), ~ dplyr::case_match(.x = .x, "parliament and president" ~ "parliament and President", "parliament and government" ~ "parliament and Government", .default = .x)), ## `type` dplyr::across(any_of("type"), ~ dplyr::case_match(.x = .x, "citizens' assembly" ~ "citizen assembly", NA_character_ ~ "not provided", .default = .x)), ## `id` dplyr::across(any_of("id"), ~ as.list(.x) %>% magrittr::set_names(rep("$oid", times = length(.)))), ## binary (dummies) dplyr::across(any_of("position_government"), ~ dplyr::case_match(.x = .x, "yes" ~ "Acceptance", "no" ~ "Rejection", .default = .x)), dplyr::across(any_of("inst_has_urgent_legal_basis"), ~ dplyr::if_else(.x, "Urgent", "Normal")), dplyr::across(any_of("inst_is_binding"), ~ dplyr::if_else(.x, "Binding", "Non-binding")), dplyr::across(any_of("inst_is_counter_proposal"), ~ dplyr::if_else(.x, "Yes", "No")), dplyr::across(any_of("inst_is_assembly"), ~ dplyr::if_else(.x, "Assembly", "Ballot")), dplyr::across(any_of("inst_has_precondition"), ~ dplyr::if_else(.x, "Exists", "Does not exist")), # uppercase first letter of various vars dplyr::across(any_of(c("result", "type", "inst_legal_basis_type", "inst_object_type", "inst_object_legal_level", "inst_object_revision_extent", "inst_trigger_type", "inst_trigger_actor_level", "inst_trigger_time_limit", "inst_quorum_approval", "inst_precondition_decision", # vars containing uppercase abbreviations "inst_object_author", "inst_trigger_actor", "inst_precondition_actor")), ~ pal::sentenceify(x = .x, punctuation_mark = "")), dplyr::across(any_of(c("inst_object_revision_modes", "inst_topics_only", "inst_topics_excluded")), ~ purrr::map(.x = .x, .f = pal::sentenceify, punctuation_mark = "")), # restore NA values dplyr::across(where(is.character) & !any_of("result"), ~ tidyr::replace_na(data = .x, replace = "")), ## implicit NAs (i.e. 'not provided' (-2)) dplyr::across(where(is.integer) & !any_of(field_to_var_name(union(rfrnd_fields$required_for_additions, rfrnd_fields$required_for_edits))), ~ tidyr::replace_na(data = .x, replace = -2L)), ## explicit NAs (i.e. 
'unknown' (-1)) dplyr::across(any_of("result"), ~ tidyr::replace_na(data = .x, replace = "Unknown")), dplyr::across(where(is.integer) & any_of(field_to_var_name(union(rfrnd_fields$required_for_additions, rfrnd_fields$required_for_edits))), ~ tidyr::replace_na(data = .x, replace = -1L)), dplyr::across(any_of(c("subterritories_yes", "subterritories_no")), ~ tidyr::replace_na(data = .x, replace = -1.0)) ) %>% # restore variable names pal::rename_from(dict = var_names_inverse) # restore `referendum_text_options` if (all(c("inst_is_divisible", "inst_is_variable") %in% colnames(data))) { data %<>% dplyr::mutate(referendum_text_options = dplyr::case_when(!inst_is_divisible & !inst_is_variable ~ "Whole text only", inst_is_divisible & inst_is_variable ~ "Variants / splitting up possible", inst_is_divisible ~ "Splitting up possible", inst_is_variable ~ "Variants possible", .default = NA_character_)) } # restore `tags` topics_var_names <- paste0("topics_tier_", 1:3) topics_vars_present <- topics_var_names %in% colnames(data) if (any(topics_vars_present)) { if (!all(topics_vars_present)) { topics_vars_missing <- topics_var_names %>% setdiff(topics_vars_present) cli::cli_abort(paste0("{cli::qty(topics_vars_missing)}The following {.var {'topics_tier_#'}} variable{?s} {?is/are} missing from {.arg data}: ", "{.var {topics_vars_missing}}")) } data %<>% dplyr::mutate(tags = restore_topics(topics_tier_1, topics_tier_2, topics_tier_3)) %>% dplyr::select(-any_of(topics_var_names)) } # remove unknown columns data %<>% dplyr::select(any_of(rfrnd_fields$all_flat)) if (!as_tibble) { # remove nested field prefixes data %<>% dplyr::rename_with(.cols = matches("^(categories|context|title)\\."), .fn = ~ stringr::str_remove(string = .x, pattern = "^\\w+?\\.")) # restore nested structure categories_fields_present <- names(var_names) %>% stringr::str_subset(pattern = "^categories\\.") %>% stringr::str_remove(pattern = "^categories\\.") %>% intersect(colnames(data)) context_fields_present <- names(var_names) %>% stringr::str_subset(pattern = "^context\\.") %>% stringr::str_remove(pattern = "^context\\.") %>% intersect(colnames(data)) title_fields_present <- names(var_names) %>% stringr::str_subset(pattern = "^title\\.") %>% stringr::str_remove(pattern = "^title\\.") %>% intersect(colnames(data)) if (length(categories_fields_present)) { data %<>% tidyr::nest(categories = all_of(categories_fields_present)) } if (length(context_fields_present)) { data %<>% tidyr::nest(context = all_of(context_fields_present)) } if (length(title_fields_present)) { data %<>% tidyr::nest(title = all_of(title_fields_present)) } data %<>% # convert to list dplyr::group_split(dplyr::row_number(), .keep = FALSE) %>% purrr::map(as.list) %>% # tweak list structure purrr::modify_depth(.depth = 1L, .f = ~ .x %>% # flatten unnecessarily nested elements purrr::modify_at(.at = "tags", .f = flatten_array_as_is) %>% # convert nested tibbles to lists purrr::modify_at(.at = "files", .f = ~ .x[[1L]] %>% pal::when(is.null(.) 
~ list(), ~ dplyr::group_split(.tbl = ., dplyr::row_number(), .keep = FALSE) %>% purrr::map(as.list))) %>% purrr::modify_at(.at = c("archive", "categories", "context", "title"), .f = ~ .x %>% purrr::map(as.list) %>% unlist(recursive = FALSE)) %>% # reduce nesting of nested tibble pal::when(is.null(purrr::pluck(., "context", "votes_per_canton")) ~ ., ~ purrr::modify_in(.x = ., .where = c("context", "votes_per_canton"), .f = dplyr::first)) %>% # reduce nesting of array fields pal::when(is.null(purrr::pluck(., "categories", "action")) ~ ., ~ purrr::modify_in(.x = ., .where = c("categories", "action"), .f = flatten_array_as_is)) %>% pal::when(is.null(purrr::pluck(., "categories", "excluded_topics")) ~ ., ~ purrr::modify_in(.x = ., .where = c("categories", "excluded_topics"), .f = flatten_array_as_is)) %>% pal::when(is.null(purrr::pluck(., "categories", "special_topics")) ~ ., ~ purrr::modify_in(.x = ., .where = c("categories", "special_topics"), .f = flatten_array_as_is))) } data }
url_api
#' Assemble RDB Services API URL #' #' @param ... Optional path components added to the base URL. #' @param .use_testing_server `r pkg_config$description[pkg_config$key == "use_testing_server"]` #' #' @return A character scalar. #' @family url_assembly #' @keywords internal #' #' @examples #' rdb:::url_api("health") url_api <- function(..., .use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg)) { checkmate::assert_flag(.use_testing_server) ifelse(.use_testing_server, "stagservices.c2d.ch", "services.c2d.ch") %>% fs::path(...) %>% paste0("https://", .) }
url_admin_portal
#' Assemble RDB admin portal URL #' #' @inheritParams url_api #' #' @inherit url_api return #' @family url_assembly #' @keywords internal #' #' @examples #' rdb:::url_admin_portal("referendum/5bbbfd7b92a21351232e46b5") url_admin_portal <- function(..., .use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg)) { checkmate::assert_flag(.use_testing_server) ifelse(.use_testing_server, "c2d-admin.netlify.app", "admin.c2d.ch") %>% fs::path(...) %>% paste0("https://", .) }
url_codebook
#' Assemble codebook URL #' #' @param var Optional variable name to add as the [fragment identifier](https://en.wikipedia.org/wiki/URI_fragment) of the returned URL, which leads to a #' direct link to the relevant codebook section. Must be either one of the column names of [`data_codebook`], or a valid fragment identifier of a codebook #' section above the individual variables (`r pal::enum_str(codebook_fragments, last = " or ")`). #' #' @return A character scalar. #' @family url_assembly #' @keywords internal #' #' @examples #' rdb:::url_codebook("level") #' rdb:::url_codebook("topics") url_codebook <- function(var = NULL) { checkmate::assert_string(var, null.ok = TRUE) if (!is.null(var)) { var %<>% rlang::arg_match0(values = c(data_codebook$variable_name, # additional HTML anchors codebook_fragments)) %>% stringr::str_replace_all(pattern = stringr::fixed("_"), replacement = "-") } paste0("https://rdb.rpkg.dev/articles/codebook.html", "#"[!is.null(var)], var) }
url_website
#' Assemble website URL #' #' @inheritParams url_api #' #' @inherit url_api return #' @family url_assembly #' @keywords internal #' #' @examples #' rdb:::url_website("referendum/CH/5bbc04f692a21351232e5a01") url_website <- function(..., .use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg)) { checkmate::assert_flag(.use_testing_server) ifelse(.use_testing_server, "c2d-site.netlify.app", "c2d.ch") %>% fs::path(...) %>% paste0("https://", .) }
$date
NOTES:
- `min` and `max` are expected to be datetime objects (POSIXct).
- `query_filter_date()` handles custom processing, while `query_filter_datetime()` returns the native MongoDB datetime format.

``` r
query_filter_date <- function(min, max) {
  list(`$gte` = min,
       `$lte` = max) %>%
    purrr::compact()
}

query_filter_datetime <- function(min, max) {
  list(`$gte` = purrr::compact(list(`$date` = min)),
       `$lte` = purrr::compact(list(`$date` = max))) %>%
    purrr::compact()
}
```
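For illustration, a minimal usage sketch (not part of the package) of the fragments the two helpers return for a date range; `min`/`max` are assumed to be POSIXct as noted above:

``` r
# hypothetical bounds for the year 2020 (UTC)
min_dt <- as.POSIXct("2020-01-01", tz = "UTC")
max_dt <- as.POSIXct("2020-12-31", tz = "UTC")

# returns list(`$gte` = <POSIXct>, `$lte` = <POSIXct>); NULL bounds are dropped by purrr::compact()
query_filter_date(min = min_dt, max = max_dt)

# wraps each bound in the MongoDB extended-JSON form list(`$date` = <POSIXct>)
query_filter_datetime(min = min_dt, max = max_dt)
```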
$in
MongoDB documentation: https://docs.mongodb.com/manual/reference/operator/query/in/#mongodb-query-op.-in
``` r
query_filter_in <- function(x) {
  x %>% pal::when(length(.) == 0L ~ NULL,
                  length(.) == 1L ~ .,
                  ~ list(`$in` = .))
}
```
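A quick sketch of the three branches (illustrative only; the country codes are made up):

``` r
query_filter_in(character())   # -> NULL                         (no constraint)
query_filter_in("CH")          # -> "CH"                         (plain equality match)
query_filter_in(c("CH", "AT")) # -> list(`$in` = c("CH", "AT"))  ($in operator)
```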
parse_sudd_date
Parses pseudo-ISO-8601 strings like `"1999-03-17"` (valid) and `"2000-09-00"` or `"1992-05"` (missing day).
``` r
parse_sudd_date <- function(x) {
  x_parts <- stringr::str_split_1(string = x,
                                  pattern = "-")
  to_int <- function(x) {
    x %<>% as.integer()
    x[x == 0L] <- NA_integer_
    x
  }
  tibble::tibble(year = to_int(x_parts[1L]),
                 month = to_int(x_parts[2L]),
                 day = to_int(x_parts[3L]))
}
```
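Illustrative calls (not shipped as examples) showing how zero or absent date parts end up as `NA`:

``` r
parse_sudd_date("1999-03-17") # tibble: year = 1999, month = 3, day = 17
parse_sudd_date("2000-09-00") # tibble: year = 2000, month = 9, day = NA
parse_sudd_date("1992-05")    # tibble: year = 1992, month = 5, day = NA
```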
parse_sudd_date_de
Parses German date strings like `"17. März 1999"` and `"??. September 2000"`.
``` r
parse_sudd_date_de <- function(x) {
  components <- stringr::str_split_fixed(string = x,
                                         pattern = "\\s+",
                                         n = 3L)
  list(year = components[, 3L] %>% stringr::str_extract(pattern = "\\d+") %>% as.integer(),
       month = components[, 2L] %>% dplyr::case_match(!!!months_de_fms),
       day = components[, 1L] %>% stringr::str_extract(pattern = "\\d+") %>% as.integer())
}
```
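Illustrative calls, assuming `months_de_fms` (defined elsewhere in this file) maps German month names to month numbers:

``` r
parse_sudd_date_de("17. M\u00e4rz 1999")      # list: year = 1999, month = 3, day = 17
parse_sudd_date_de("??. September 2000") # list: year = 2000, month = 9, day = NA
```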
parse_sudd_id
Derives `country_code` and `country_name` from an `id_sudd`.

An `id_sudd` is composed of

- the `country_code`,
- a sequence number (e.g. `01` for the first referendum in the given year), and
- the `year` the referendum took place.

``` r
parse_sudd_id <- function(id_sudd) {
  sudd_year <-
    id_sudd %>%
    stringr::str_extract(pattern = "\\d{4}$") %>%
    as.integer()
  sudd_country_code <-
    id_sudd %>%
    stringr::str_sub(end = 2L) %>%
    stringr::str_to_upper()
  derive_country_vars(country_code = sudd_country_code,
                      date = clock::date_build(year = sudd_year,
                                               month = 1L,
                                               day = 1L))
}
```
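For example, a (hypothetical) identifier `"ch012020"` would be decomposed as follows:

``` r
parse_sudd_id("ch012020")
# -> country variables derived via derive_country_vars() for country code "CH",
#    evaluated as of 2020-01-01 (the year extracted from the identifier)
```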
sudd_rfrnd_field_names
Determines the `field_names` of a sudd.ch referendum, whether it `has_field_names_duplicated`, and if so, the actual `field_names_duplicated` (incl. number of duplications).
NOTES:
- This function is mainly useful for debugging and thus is not included in the final package (`purl = FALSE`).
- To get an overview of all duplicated sudd.ch referendum field names, use:
``` r
data_sudd_field_names <-
  rdb::list_sudd_rfrnds() %$%
  id_sudd %>%
  purrr::map(rdb:::sudd_rfrnd_field_names) %>%
  purrr::list_rbind() %>%
  tibble::add_column(!!!(.$id_sudd %>%
                           purrr::map(rdb:::parse_sudd_id) %>%
                           purrr::list_rbind()),
                     year = .$id_sudd %>% stringr::str_extract(pattern = "\\d{4}$"))

data_sudd_field_names %>%
  dplyr::filter(has_field_names_duplicated) %>%
  dplyr::arrange(country_code, year, id_sudd) %>%
  print(n = Inf)
```
sudd_rfrnd_field_names <- function(id_sudd) { checkmate::assert_string(id_sudd) html <- httr::RETRY(verb = "GET", url = url_sudd("event.php"), query = list(id = id_sudd), times = 3L) %>% xml2::read_html() %>% rvest::html_element(css = "main table") %>% rvest::html_children() field_names <- html %>% rvest::html_elements(css = "td.feld") %>% rvest::html_text() field_names_duplicated <- field_names[field_names %in% field_names[duplicated(field_names)]] %>% table(exclude = NULL) %>% tibble::enframe(name = "field_name", value = "n") tibble::tibble(id_sudd = id_sudd, field_names = list(field_names), has_field_names_duplicated = nrow(field_names_duplicated) > 0L, field_names_duplicated = list(field_names_duplicated)) }
sudd_rfrnd
Retrieves a single referendum's data from sudd.ch.
sudd_rfrnd <- function(id_sudd) { checkmate::assert_string(id_sudd) html <- httr::RETRY(verb = "GET", url = url_sudd("event.php"), query = list(id = id_sudd), times = 3L) %>% xml2::read_html() %>% rvest::html_element(css = "main table") %>% rvest::html_children() field_names <- html %>% rvest::html_elements(css = "td.feld") %>% rvest::html_text() # handle fields with duplicated/ambiguous names if (anyDuplicated(field_names)) { ## simple duplicates (probably data errors) if (id_sudd == "gr011862") { ix_to_drop <- c(which(field_names == "\u2517\u2501 Republik")[2L], which(field_names == "\u2517\u2501 Russischer Prinz")[2L]) html %<>% .[-ix_to_drop] field_names %<>% .[-ix_to_drop] ## competing / mutually exclusive proposals, e.g. proposals with direct counter proposal and (optionally) tie-breaker question (CH and LI) } else { option_names <- html %>% rvest::html_elements(css = "td.feld strong") %>% rvest::html_text() if (length(option_names) < 2L) { cli::cli_abort("Unknown table layout detected for referendum with {.arg id_sudd = {id_sudd}}. Please debug.", .internal = TRUE) } ix_option_names <- which(field_names %in% option_names) option_names_counter <- c("Gegenentwurf", "Gegenvorschlag") option_names_tie_breaker <- "Stichfrage" option_names_special <- c(option_names_counter, option_names_tie_breaker) has_counter_proposal <- any(option_names_counter %in% option_names) n_proposals_original <- option_names %>% setdiff(option_names_special) %>% length() ix_option_field_names <- ix_option_names[-length(ix_option_names)] %>% purrr::imap(~ (.x + 1L):(ix_option_names[.y + 1L] - 1L)) %>% c(list((dplyr::last(ix_option_names) + 1L):(min(length(field_names), which(field_names %in% c("Medien", "Bemerkungen", "Gleichzeitig mit", "Quellen", "Vollst\u00e4ndigkeit", "Letzte \u00c4nderung"))) - 1L))) # rename field names option_suffixes <- option_names %>% purrr::imap_chr(~ .x %>% pal::when(. %in% option_names_counter ~ "counter_proposal", . %in% option_names_tie_breaker ~ "tie_breaker", has_counter_proposal && n_proposals_original == 1L ~ "proposal", ~ glue::glue("option_{.y}"))) renamings <- purrr::map2(.x = setdiff(option_names, option_names_tie_breaker), .y = setdiff(option_suffixes, "tie_breaker"), .f = ~ rlang::list2(!!paste("\u2517\u2501", .x) := glue::glue("votes_tie_breaker_{.y}"), !!paste("\u2517\u2501 St\u00e4nde", .x) := glue::glue("subterritories_{.y}_tie_breaker"))) %>% purrr::list_flatten() %>% as_fm_list() for (i in seq_along(option_names)) { field_names[ix_option_field_names[[i]]] %<>% dplyr::case_match(.x = ., !!!c(renamings, list("Abgegebene Stimmen" ~ glue::glue("votes_{option_suffixes[i]}_total"), "Stimmen ausser Betracht" ~ glue::glue("votes_{option_suffixes[i]}_invalid"), "Ohne Antwort" ~ glue::glue("votes_{option_suffixes[i]}_empty"), "G\u00fcltige (= massgebende) Stimmen" ~ glue::glue("votes_{option_suffixes[i]}_valid"), "\u2517\u2501 Ja-Stimmen" ~ glue::glue("votes_{option_suffixes[i]}_yes"), "\u2517\u2501 Nein-Stimmen" ~ glue::glue("votes_{option_suffixes[i]}_no"), "Ja-Stimmen" ~ glue::glue("votes_{option_suffixes[i]}_yes"), "Nein-Stimmen" ~ glue::glue("votes_{option_suffixes[i]}_no"), "St\u00e4nde (Kantone)" ~ glue::glue("subterritories_{option_suffixes[i]}"), "\u2517\u2501 Annehmende St\u00e4nde" ~ glue::glue("subterritories_{option_suffixes[i]}_yes"), "\u2517\u2501 Verwerfende St\u00e4nde" ~ glue::glue("subterritories_{option_suffixes[i]}_no"))), .default = .) 
} # drop obsolete fields html %<>% .[-ix_option_names] field_names %<>% .[-ix_option_names] } } # handle other special cases if (id_sudd %in% c("li011954", "li031985")) { field_names %<>% dplyr::case_match(.x = ., "\u2517\u2501 Initiative" ~ "votes_proposal", "\u2517\u2501 Gegenvorschlag" ~ "votes_counter_proposal", "\u2517\u2501 Nein-Stimmen" ~ "votes_option_none", .default = .) } ## move content of exotic fields to `remarks` remarks_field <- html[field_names == "Bemerkungen"] %>% pal::when(length(.) > 0L ~ rvest::html_elements(x = ., css = "td")[[2L]], ~ .) remarks_list_col <- list(list(text = rvest::html_text2(remarks_field), urls = remarks_field %>% rvest::html_elements(css = "a") %>% purrr::map_chr(~ .x %>% rvest::html_attr(name = "href") %>% url_sudd()), html = remarks_field %>% xml2::xml_contents() %>% as.character() %>% paste0(collapse = ""))) ix_fields_to_remarks <- field_names %>% stringr::str_detect(pattern = paste0("^", pal::fuse_regex("\u2517\u2501\u2501\u2501 .+Stimmen( .+)?", "Unklare Stimmen", "Unstimmigkeiten", "G\u00fcltig stimmende Personen"), "$")) %>% which() if (length(ix_fields_to_remarks) > 0L) { addendum <- field_names[ix_fields_to_remarks] if (length(addendum) > 0L) { addendum %<>% stringr::str_extract("\\w.*") %>% paste0(": ", html[ix_fields_to_remarks] %>% rvest::html_elements(css = "td") %>% magrittr::extract2(2L) %>% rvest::html_elements(css = "data") %>% rvest::html_attr("value"), collapse = "\n\n") } remarks_list_col[[1L]]$text %<>% paste0(addendum, "\n\n"[length(addendum) > 0L], .) remarks_list_col[[1L]]$html %<>% paste0("<p>\n", addendum, "\n</p>", .) html %<>% .[-ix_fields_to_remarks] field_names %<>% .[-ix_fields_to_remarks] } # remove unnecessary fields ix_to_drop <- which(field_names %in% c("Nicht eingelegte Stimmzettel", "Nicht eingelegte Stimmenzettel", "Nicht teilgenommen")) if (length(ix_to_drop)) { html %<>% .[-ix_to_drop] field_names %<>% .[-ix_to_drop] } field_names %<>% dplyr::case_match(.x = ., "Gebiet" ~ "territory_name_de", "\u2517\u2501 Stellung" ~ "territory_type_de", "Datum" ~ "date", "Titel" ~ "title_de", "Vorlage" ~ "title_de", "\u2517\u2501 Fragemuster" ~ "question_type_de", "\u2517\u2501 Abstimmungstyp" ~ "types", "Ergebnis" ~ "result_de", "Vollst\u00e4ndigkeit" ~ "result_status_de", "\u2517\u2501 Mehrheiten" ~ "adoption_requirements_de", "Stimmberechtigte" ~ "electorate_total", "\u2517\u2501 Davon im Ausland" ~ "electorate_abroad", "Stimmausweise" ~ "polling_cards", "Ausgegebene Stimmzettel" ~ "polling_cards", "Stimmbeteiligung" ~ "votes_total", "Stimmen ausser Betracht" ~ "votes_invalid", "Stimmzettel ausser Betracht" ~ "votes_invalid", "Leere Stimmen" ~ "votes_empty", "\u2517\u2501 Leere Stimmen" ~ "votes_empty", "\u2517\u2501 Leere Stimmzettel" ~ "votes_empty", "Ung\u00fcltige Stimmen" ~ "votes_void", "\u2517\u2501 Ung\u00fcltige Stimmen" ~ "votes_void", "\u2517\u2501 Ung\u00fcltige Stimmzettel" ~ "votes_void", "Ung\u00fcltig eingelegte Stimmzettel" ~ "votes_void", "Ganz ung\u00fcltige Stimmzettel" ~ "votes_void", "G\u00fcltige (= massgebende) Stimmen" ~ "votes_valid", "\u2517\u2501 Ja-Stimmen" ~ "votes_yes", "\u2517\u2501 Nein-Stimmen" ~ "votes_no", "\u2517\u2501 Nein zu beiden Vorschl\u00e4gen" ~ "votes_option_none", "\u2517\u2501 Stimmen ausser Betracht" ~ "votes_invalid", "Staaten" ~ "subterritories", "\u2517\u2501 Annehmende Staaten" ~ "subterritories_yes", "\u2517\u2501 Verwerfende Staaten" ~ "subterritories_no", "Gebiete" ~ "subterritories", "\u2517\u2501 Annehmende Gebiete" ~ "subterritories_yes", "\u2517\u2501 Verwerfende 
Gebiete" ~ "subterritories_no", "Provinzen" ~ "subterritories", "\u2517\u2501 Annehmende Provinzen" ~ "subterritories_yes", "\u2517\u2501 Verwerfende Provinzen" ~ "subterritories_no", "Inseln" ~ "subterritories", "\u2517\u2501 Annehmende Inseln" ~ "subterritories_yes", "\u2517\u2501 Verwerfende Inseln" ~ "subterritories_no", "St\u00e4nde (Kantone)" ~ "subterritories", "\u2517\u2501 Annehmende St\u00e4nde" ~ "subterritories_yes", "\u2517\u2501 Verwerfende St\u00e4nde" ~ "subterritories_no", "Regionen / St\u00e4dte" ~ "subterritories", "\u2517\u2501 Annehmende Regionen / St\u00e4dte" ~ "subterritories_yes", "\u2517\u2501 Verwerfende Regionen / St\u00e4dte" ~ "subterritories_no", "Wahlkreise" ~ "subterritories", "\u2517\u2501 Annehmende Wahlkreise" ~ "subterritories_yes", "\u2517\u2501 Verwerfende Wahlkreise" ~ "subterritories_no", "Senatswahlkreise" ~ "subterritories", "\u2517\u2501 Annehmende Senatswahlkreise" ~ "subterritories_yes", "\u2517\u2501 Verwerfende Senatswahlkreise" ~ "subterritories_no", "Medien" ~ "files", "Bemerkungen" ~ "remarks", "Gleichzeitig mit" ~ "ids_sudd_simultaneous", "Quellen" ~ "sources", "Letzte \u00c4nderung" ~ "date_last_edited", .default = .) %>% # assert field names are unique checkmate::assert_character(any.missing = FALSE, unique = TRUE, .var.name = "field_names") %>% # referendum-option-specific recodings (sequentially numbered `votes_option_#` columns) # TODO: adapt this once we can properly capture more than yes/no answer options, cf. https://gitlab.com/zdaarau/rpkgs/rdb/-/issues/5 purrr::map_at(.at = which(startsWith(., "\u2517\u2501 ")), .f = function(old_name, old_names) paste0("votes_option_", which(old_names == old_name)), old_names = stringr::str_subset(string = ., pattern = "^\u2517\u2501 ")) %>% purrr::list_c(ptype = character()) # assert no original uppercase field names are left over ix_field_names_unknown <- field_names %>% stringr::str_detect(pattern = "[:upper:]") %>% which() if (length(ix_field_names_unknown)) { cli::cli_abort(paste0("Unknown {cli::qty(length(ix_field_names_unknown))} field{?s} {.field {field_names[ix_field_names_unknown]}} present in data for ", "referendum with {.arg {paste0('id_sudd = ', id_sudd)}}."), .internal = TRUE) } purrr::map2_dfc(.x = html, .y = field_names, .f = function(html, col_name) { cells <- html %>% rvest::html_elements(css = "td") col_text <- rvest::html_text2(cells[[2L]]) # extract hyperlinks if necessary if (col_name %in% c("remarks", "ids_sudd_simultaneous", "sources")) { urls <- cells[[2L]] %>% rvest::html_elements(css = "a") %>% purrr::map_chr(~ .x %>% rvest::html_attr(name = "href") %>% url_sudd()) } tibble::tibble(!!col_name := col_name %>% pal::when( # character scalars . %in% c("territory_name_de", "territory_type_de", "title_de", "question_type_de", "result_de", "result_status_de") ~ col_text, # integer scalars stringr::str_detect( string = ., pattern = paste0( "^", pal::fuse_regex( "electorate_total", "electorate_abroad", "polling_cards", "votes_total", "votes_invalid", "votes_empty", "votes_void", "votes_valid", "votes_yes", "votes_no", "votes_proposal", "votes_counter_proposal", paste0("votes_", pal::fuse_regex("option_(\\d+|none)", "(counter_)?proposal", "tie_breaker(_(option_\\d+|(counter_)?proposal))?"), paste0("(_", pal::fuse_regex("total", "empty", "void", "invalid", "valid", "yes", "no"), ")?"))), "$")) ~ cells[[2L]] %>% rvest::html_elements(css = "data") %>% rvest::html_attr("value") %>% # fall back to parsing text if no semantic data could be extracted pal::when(length(.) 
== 0L ~ col_text %>% stringr::str_remove_all(pattern = "[^\\d]"), ~ .) %>% as.integer(), startsWith(., "subterritories") ~ cells[[2L]] %>% rvest::html_elements(css = "data") %>% rvest::html_attr("value") %>% # fall back to parsing text if no semantic data could be extracted pal::when(length(.) == 0L ~ col_text %>% stringr::str_remove_all(pattern = "[^\\d]"), ~ .) %>% as.numeric(), # date scalars . == "date" ~ cells[[2L]] %>% rvest::html_element(css = "time") %>% rvest::html_attr(name = "datetime"), . == "date_last_edited" ~ cells[[2L]] %>% rvest::html_element(css = "time") %>% rvest::html_attr(name = "datetime") %>% clock::date_parse(), # lists (multi-value cols) . == "remarks" ~ remarks_list_col, . == "sources" ~ list(list(text = col_text, urls = urls, html = cells[[2L]] %>% xml2::xml_contents() %>% as.character() %>% paste0(collapse = ""))), . == "types" ~ col_text %>% stringr::str_split(pattern = "\\s*\u2192\\s*"), . == "adoption_requirements_de" ~ col_text %>% stringr::str_split(pattern = ",\\s*"), . == "files" ~ cells[[2L]] %>% rvest::html_elements(css = "a") %>% purrr::map(~ .x %>% rvest::html_attr(name = "href") %>% url_sudd() %>% tibble::tibble(description = rvest::html_text(.x), url = .)) %>% purrr::list_rbind() %>% list(), . == "ids_sudd_simultaneous" ~ urls %>% stringr::str_extract(pattern = "(?<=[\\?&]id=)[\\w\\d]+") %>% list(), ~ "PARSING ERROR; PLEASE DEBUG" )) }) }
url_sudd
Completes relative URLs to absolute sudd.ch URLs; URLs that already specify a scheme are returned unchanged.

``` r
url_sudd <- function(x = "") {
  purrr::map_chr(x, \(x) {
    if (!is.na(x) && stringr::str_detect(x, "https?:")) {
      x
    } else {
      httr::modify_url(url = x %|% "",
                       scheme = "https",
                       hostname = "sudd.ch")
    }
  })
}
```
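Minimal sketch of the two cases (output shown approximately):

``` r
url_sudd("index.php")             # -> roughly "https://sudd.ch/index.php"
url_sudd("https://example.com/x") # already absolute -> returned unchanged
```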
this_pkg
this_pkg <- utils::packageName()
cli_theme
cli_theme <- cli::builtin_theme() %>% purrr::list_modify(span.err = list(color = "red", `font-weight` = "bold"), span.warn = list(color = "orange", `font-weight` = "bold"), span.content = list(color = "mediumorchid"))
date_backup_rdb
date_backup_rdb <- pal::path_mod_time("data-raw/backups/rdb.rds") |> clock::as_date()
codebook_fragments
codebook_fragments <- c("institution-level-variables", "referendum-level-variables", "topics")
data_cols_absent
data_cols_absent <- tibble::tibble(col = character(), type = list(), msg = character()) %>% tibble::add_row(col = "id", type = list("add"), msg = "an {.var id} column. It is automatically set by the RDB API back-end. Did you mean to {.fun edit_rfrnds} instead?") %>% tibble::add_row(col = "country_name", type = list(c("add", "edit")), msg = "a {.var country_name} column. It is automatically set by the RDB API back-end based on {.var country_code}.") %>% tibble::add_row(col = "date_time_created", type = list(c("add", "edit")), msg = "a {.var date_time_created} column. This date is automatically set by the RDB API back-end and not supposed to be changed.") %>% tibble::add_row(col = "date_time_last_edited", type = list(c("add", "edit")), msg = paste0("a {.var date_time_last_edited} column. This date is automatically set by the RDB API back-end and not supposed to be changed ", "manually.")) %>% dplyr::mutate(msg = paste0("{.arg data} mustn't contain ", msg))
ballot_date_colnames
Mainly used in `as_ballot_dates()`.
ballot_date_colnames <- c("country_code", "country_code_long", "country_code_continual", "country_name", "country_name_long", "subnational_entity_code", "subnational_entity_name", "municipality", "level", "date", "week", "month", "quarter", "year", "decade", "century", "era", "wave_of_democracy", "is_former_country", "un_country_code", "un_region_tier_1_code", "un_region_tier_1_name", "un_region_tier_2_code", "un_region_tier_2_name", "un_region_tier_3_code", "un_region_tier_3_name", "un_subregion")
rfrnd_fields
RDB referendum database field names as returned by the `/referendums` API endpoint.
rfrnd_fields <- list()
$all*
rfrnd_fields$all <- c("_id", "archive", "canton", "categories", "citizens_abroad", "committee_name", "context", "country_code", "country_code_historical", "country_name", "created_on", "date", "date_time_last_edited", "draft", "files", "id_official", "id_sudd", "institution", "is_past_jurisdiction", "level", "question", "question_en", "municipality", "number", "remarks", "result", "sources", "tags", "title", "total_electorate", "votes_empty", "votes_invalid", "votes_no", "votes_yes") rfrnd_fields$all_flat <- rfrnd_fields$all %>% setdiff(c("categories", "context", "title")) %>% union(c("categories.action", "categories.author_of_the_vote_object", "categories.counter_proposal", "categories.decision_quorum", "categories.degree_of_revision", "categories.excluded_topics", "categories.hierarchy_of_the_legal_norm", "categories.institutional_precondition", "categories.institutional_precondition_decision", "categories.institutional_precondition_decision_actor", "categories.legal_act_type", "categories.official_status", "categories.referendum_text_options", "categories.special_topics", "categories.turnout_quorum", "categories.vote_object", "categories.vote_result_status", "categories.vote_trigger", "categories.vote_trigger_actor", "categories.vote_trigger_number", "categories.vote_trigger_state_level", "categories.vote_trigger_time", "categories.vote_venue", "context.national_council_abstentions", "context.national_council_no", "context.national_council_yes", "context.recommendation", "context.states_council_abstentions", "context.states_council_no", "context.states_council_yes", "context.states_no", "context.states_yes", "context.votes_per_canton", "title.de", "title.en", "title.fr"))
$required_for_*
The following fields are mandatory when adding/editing referendums via the admin portal, but not strictly mandatory schema-wise:
| field                    | edits | additions | remarks                                                                                        |
|--------------------------|-------|-----------|------------------------------------------------------------------------------------------------|
| `_id`                    | yes   | no        | must be provided in the URL, not the payload                                                     |
| `country_code`           | no    | yes       | must be a valid choice; the back-end automatically derives `country_name` from `country_code`   |
| `canton`                 | no    | yes       | only mandatory if `"level":"subnational"`                                                        |
| `municipality`           | no    | yes       | only mandatory if `"level":"local"`                                                              |
| `level`                  | no    | yes       | must be a valid choice                                                                           |
| `date`                   | no    | yes       | must be a valid date (ISO 8601 date string)                                                      |
| `title.en`               | no    | yes       | contains `NA`s from before relaunch                                                              |
| `result`                 | no    | yes       | must be a valid choice (`"Yes"`, `"No"` or `"Unknown"`)                                          |
| `context.recommendation` | no    | yes       | only mandatory if `"country_code":"CH","level":"national"`; must be a valid choice               |
| `total_electorate`       | no    | yes       | must be a valid integer                                                                          |
| `citizens_abroad`        | no    | no        | must be a valid integer                                                                          |
| `votes_yes`              | no    | no        | must be a valid integer                                                                          |
| `votes_no`               | no    | no        | must be a valid integer                                                                          |
| `votes_empty`            | no    | no        | must be a valid integer                                                                          |
| `votes_invalid`          | no    | no        | must be a valid integer                                                                          |
| `draft`                  | yes   | no        | defaults to `true` if not provided for additions                                                 |
| `institution`            | no    | yes       | must be a valid choice                                                                           |
Based on the above, we define the fields that must always be present in the JSON payload when adding/editing referendums as follows:
``` r
rfrnd_fields$required_for_edits <- c("_id", "draft", "total_electorate", "citizens_abroad", "votes_yes", "votes_no", "votes_empty", "votes_invalid")

rfrnd_fields$required_for_additions <- c("country_code", "level", "date", "title.en", "result", "total_electorate", "citizens_abroad", "votes_yes",
                                         "votes_no", "votes_empty", "votes_invalid", "draft", "institution")
```
$never_empty
The following fields are expected to never be empty when returned by the API.
rfrnd_fields$never_empty <- c("_id", "country_code", "country_name", "created_on", "level", "total_electorate", "citizens_abroad", "votes_yes", "votes_no", "votes_empty", "votes_invalid", "draft")
*var_names*
NOTES:
- This maps the field names of the back-end's MongoDB collection to the final variable names as found in the RDB codebook. The renamings should eventually all be upstreamed, at which point this mapping becomes superfluous.
- To convert `var_names` (or any other similar list like `sub_var_names`) to a tibble, use:

``` r
var_names %>%
  unlist() %>%
  tibble::enframe(name = "var_name_old",
                  value = "var_name_new")
```
# old name new name var_names <- list(`_id` = "id", canton = "subnational_entity_name", title.de = "title_de", title.en = "title_en", title.fr = "title_fr", context.states_no = "subterritories_no", context.states_yes = "subterritories_yes", total_electorate = "electorate_total", citizens_abroad = "electorate_abroad", context.votes_per_canton = "votes_per_subterritory", context.national_council_yes = "lower_house_yes", context.national_council_no = "lower_house_no", context.national_council_abstentions = "lower_house_abstentions", context.states_council_yes = "upper_house_yes", context.states_council_no = "upper_house_no", context.states_council_abstentions = "upper_house_abstentions", context.recommendation = "position_government", draft = "is_draft", created_on = "date_time_created", institution = "type", categories.official_status = "inst_legal_basis_type", categories.legal_act_type = "inst_has_urgent_legal_basis", categories.vote_result_status = "inst_is_binding", categories.counter_proposal = "inst_is_counter_proposal", categories.vote_venue = "inst_is_assembly", categories.vote_trigger = "inst_trigger_type", categories.vote_trigger_actor = "inst_trigger_actor", categories.vote_trigger_state_level = "inst_trigger_actor_level", categories.vote_trigger_number = "inst_trigger_threshold", categories.vote_trigger_time = "inst_trigger_time_limit", categories.vote_object = "inst_object_type", categories.author_of_the_vote_object = "inst_object_author", categories.hierarchy_of_the_legal_norm = "inst_object_legal_level", categories.degree_of_revision = "inst_object_revision_extent", categories.action = "inst_object_revision_modes", categories.turnout_quorum = "inst_quorum_turnout", categories.decision_quorum = "inst_quorum_approval", categories.institutional_precondition = "inst_has_precondition", categories.institutional_precondition_decision_actor = "inst_precondition_actor", categories.institutional_precondition_decision = "inst_precondition_decision", categories.special_topics = "inst_topics_only", categories.excluded_topics = "inst_topics_excluded") # old name new name sub_var_names <- list(files = list(date = "date_time_attached", object_key = "s3_object_key", size = "file_size", deleted = "is_deleted")) # create additional formula-lists (mainly to be fed to `dplyr::case_match()`) var_names_fms <- as_fm_list(var_names) sub_var_names_fms <- purrr::imap(sub_var_names, ~ as_fm_list(.x))
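As a quick sketch of how the formula list is meant to be consumed (mirroring the `dplyr::case_match()` calls used in `add_rfrnds()`/`edit_rfrnds()` further below):

``` r
# translate raw MongoDB field names to codebook variable names;
# names without a mapping (here "votes_yes") are kept as-is via `.default`
c("_id", "canton", "total_electorate", "votes_yes") %>%
  dplyr::case_match(.x = .,
                    !!!var_names_fms,
                    .default = .)
# -> "id" "subnational_entity_name" "electorate_total" "votes_yes"
```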
mime_error_suffix <- "This indicates either some network issue or a change in the RDB API."
sudd_years <- url_sudd("index.php") %>% xml2::read_html() %>% rvest::html_element(css = "select[id='first']") %>% rvest::html_elements("option") %>% rvest::html_attr("value") %>% as.integer() sudd_max_year <- pal::safe_max(sudd_years) sudd_min_year <- pal::safe_min(sudd_years) rm(sudd_years)
rfrnds
#' Get referendum data (old version) #' #' Downloads the referendum data from the Referendum Database (RDB). See the [`codebook`][codebook] for a detailed description of all variables. #' #' @inheritParams assemble_query_filter #' @inheritParams tidy_rfrnds #' @inheritParams url_api #' @param incl_archive Whether or not to include an `archive` column containing data from an earlier, obsolete state of the Referendum Database (RDB). #' @param use_cache `r pkgsnip::param_lbl("use_cache")` #' @param max_cache_age `r pkgsnip::param_lbl("max_cache_age")` #' @param quiet `r pkgsnip::param_lbl("quiet")` #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family rfrnd #' @export #' #' @examples #' # get all referendums (excl. drafts) #' rdb::rfrnds() #' #' # get only referendums in Austria and Australia on subnational level #' rdb::rfrnds(country_code = c("AT", "AU"), #' level = "subnational", #' quiet = TRUE) #' #' # get referendums in 2020 #' rdb::rfrnds(date_min = "2020-01-01", #' date_max = "2020-12-31", #' quiet = TRUE) #' #' # get referendums added to the database during the last 30 days #' rdb::rfrnds(date_time_created_min = clock::date_today(zone = "UTC") |> clock::add_days(-30L), #' date_time_created_max = clock::date_today(zone = "UTC"), #' quiet = TRUE) #' #' # provide custom `query_filter` for more complex queries like regex matches #' # cf. https://docs.mongodb.com/manual/reference/operator/query/regex/ #' rdb::rfrnds(query_filter = '{"country_code":{"$regex":"A."}}', #' quiet = TRUE) rfrnds <- function(country_code = NULL, subnational_entity_name = NULL, municipality = NULL, level = NULL, type = NULL, date_min = NULL, date_max = NULL, is_draft = FALSE, date_time_created_min = NULL, date_time_created_max = NULL, date_time_last_edited_min = NULL, date_time_last_edited_max = NULL, query_filter = NULL, incl_archive = FALSE, tidy = TRUE, use_cache = TRUE, max_cache_age = "1 week", use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg), quiet = FALSE) { checkmate::assert_flag(incl_archive) checkmate::assert_flag(quiet) # TODO: remove this check as soon as [issue #78](https://github.com/zdaarau/c2d-app/issues/78) is resolved if (isTRUE(use_testing_server)) cli::cli_abort("{.code mode=stream} is not yet supported on the testing servers.") result <- pkgpins::with_cache(expr = { if (!quiet) { status_msg <- "Fetching JSON data from RDB API..." cli::cli_progress_step(msg = status_msg, msg_done = paste(status_msg, "done"), msg_failed = paste(status_msg, "failed")) } data <- httr::RETRY(verb = "GET", url = url_api("referendums", .use_testing_server = use_testing_server), query = list(mode = "stream", format = "json", filter = assemble_query_filter(country_code = country_code, subnational_entity_name = subnational_entity_name, municipality = municipality, level = level, type = type, date_min = date_min, date_max = date_max, is_draft = is_draft, date_time_created_min = date_time_created_min, date_time_created_max = date_time_created_max, date_time_last_edited_min = date_time_last_edited_min, date_time_last_edited_max = date_time_last_edited_max, query_filter = query_filter)), if (!quiet) httr::progress(type = "down"), times = 3L) %>% # ensure we actually got a JSON response pal::assert_mime_type(mime_type = "application/json", msg_suffix = mime_error_suffix) %>% # extract JSON httr::content(as = "text", encoding = "UTF-8") %>% # ensure body is not empty assert_content() if (!quiet) { status_msg <- "Converting JSON to list data..." 
cli::cli_progress_step(msg = status_msg, msg_done = paste(status_msg, "done"), msg_failed = paste(status_msg, "failed")) } # NOTE that we cannot rely on params `simplify*` and `flatten` because of varying field lengths in API result (depending on `query`) data %<>% jsonlite::fromJSON(simplifyVector = FALSE, simplifyDataFrame = FALSE, simplifyMatrix = FALSE, flatten = FALSE) %$% items if (!quiet) { status_msg <- "Tidying data..." cli::cli_progress_step(msg = status_msg, msg_done = paste(status_msg, "done"), msg_failed = paste(status_msg, "failed")) } data %>% tidy_rfrnds(tidy = tidy) }, pkg = this_pkg, from_fn = "rfrnds", country_code, subnational_entity_name, municipality, level, type, date_min, date_max, is_draft, date_time_created_min, date_time_created_max, date_time_last_edited_min, date_time_last_edited_max, query_filter, tidy, use_testing_server, use_cache = use_cache, max_cache_age = max_cache_age) # exclude `archive` if requested if (!incl_archive) result %<>% dplyr::select(-any_of("archive")) result }
rfrnds_bkp
#' Get referendum data from backup #' #' Downloads the referendum data from the Referendum Database (RDB) backup [in the `zdaarau/rpkgs/rdb` #' repository](https://gitlab.com/zdaarau/rpkgs/rdb/-/blob/master/data-raw/backups/rdb.rds?ref_type=heads). See the [`codebook`][codebook] for a detailed #' description of all variables. #' #' @inheritParams rfrnds #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family rfrnd #' @export #' #' @examples #' # get all referendums (excl. drafts) #' rdb::rfrnds_bkp() rfrnds_bkp <- function(is_draft = FALSE, incl_archive = FALSE, use_cache = TRUE, max_cache_age = "1 week", quiet = FALSE) { checkmate::assert_flag(is_draft, null.ok = TRUE) checkmate::assert_flag(incl_archive) checkmate::assert_flag(quiet) result <- pkgpins::with_cache( expr = { if (!quiet) { pal::cli_progress_step_quick(msg = "Fetching latest RDB backup") } path_temp <- fs::file_temp(pattern = "rdb-", ext = "rds") utils::download.file(url = "https://gitlab.com/zdaarau/rpkgs/rdb/-/raw/master/data-raw/backups/rdb.rds?ref_type=heads&inline=false", destfile = path_temp, quiet = TRUE, mode = "wb") readRDS(file = path_temp) }, pkg = this_pkg, from_fn = "rfrnds", use_cache = use_cache, max_cache_age = max_cache_age ) # exclude `archive` if requested if (!incl_archive) result %<>% dplyr::select(-any_of("archive")) # respect `is_draft` if (!is.null(is_draft)) { result %<>% dplyr::filter(is_draft == !!is_draft) } result }
rfrnd
NOTES:
A minimal sample cURL request to retrieve a single referendum entry looks as follows:
``` curl
curl 'https://services.c2d.ch/referendums/6303a4cba52c3995043a8c24' \
  --header 'Origin: https://c2d.ch'
```
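For reference, a rough httr equivalent of the above cURL call (a sketch only, not the package's internal implementation, which additionally retries and validates the response):

``` r
httr::GET(url = "https://services.c2d.ch/referendums/6303a4cba52c3995043a8c24",
          config = httr::add_headers(Origin = "https://c2d.ch")) %>%
  httr::content(as = "text", encoding = "UTF-8")
```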
#' Get a single referendum's data #' #' Downloads a single referendum's data from the Referendum Database (RDB). See the [`codebook`][codebook] for a detailed description of all variables. #' #' @inheritParams rfrnds #' @param id Referendum's unique [identifier](`r url_codebook("id")`). #' #' @inherit rfrnds return #' @family rfrnd #' @export #' #' @examples #' rdb::rfrnd(id = "5bbbe26a92a21351232dd73f") rfrnd <- function(id, incl_archive = FALSE, tidy = TRUE, use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg)) { checkmate::assert_string(id, min.chars = 1L) checkmate::assert_flag(incl_archive) checkmate::assert_flag(tidy) # retrieve data data <- httr::RETRY(verb = "GET", url = url_api("referendums", id, .use_testing_server = use_testing_server), config = httr::add_headers(Origin = url_admin_portal(.use_testing_server = use_testing_server)), times = 3L) %>% # ensure we actually got a JSON response pal::assert_mime_type(mime_type = "application/json", msg_suffix = mime_error_suffix) %>% # extract JSON httr::content(as = "text", encoding = "UTF-8") %>% # ensure body is not empty assert_content() %>% # convert JSON to list # NOTE that we cannot rely on params `simplify*` and `flatten` because of varying field lengths in API result jsonlite::fromJSON(simplifyVector = FALSE, simplifyDataFrame = FALSE, simplifyMatrix = FALSE, flatten = FALSE) %>% # ensure no error occured assert_api_success() %>% # tidy data tidy_rfrnds(tidy = tidy) # exclude `archive` if requested if (!incl_archive) data %<>% dplyr::select(-any_of("archive")) # return data data }
download_file_attachment
#' Download file attachment #' #' Downloads a file attachment from the Referendum Database (RDB). The necessary `s3_object_key`s identifying individual files are found in the `files` list #' column returned by [rfrnds()]. #' #' @inheritParams url_api #' @param s3_object_key Key uniquely identifying the file in the RDB [Amazon S3 bucket](https://en.wikipedia.org/wiki/Amazon_S3#Design). A character scalar. #' @param path Path where the downloaded file is written to. #' @param use_original_filename Whether to save the file attachment using its original filename as uploaded. Note that original filenames are **not unique**, #' i.e. there are multiple file attachments with the same original filename (but differing content, of course). If `FALSE`, `s3_object_key` is used as #' filename. Only relevant if `path` is a directory. #' #' @return A [response object][httr::response], invisibly. #' @family rfrnd #' @export #' #' @examples #' # get object keys #' obj_keys <- #' rdb::rfrnds()$files |> #' purrr::list_rbind() |> #' dplyr::filter(!is_deleted) |> #' _$s3_object_key[1:3] #' #' # download them to the current working dir #' purrr::walk(obj_keys, #' rdb::download_file_attachment) #' #' # and delete them again #' fs::file_delete(obj_keys) download_file_attachment <- function(s3_object_key, path = ".", use_original_filename = FALSE, use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg)) { checkmate::assert_string(s3_object_key) checkmate::assert_atomic(path) checkmate::assert_flag(use_original_filename) # TODO: remove this check as soon as [issue #78](https://github.com/zdaarau/c2d-app/issues/78) is resolved if (isTRUE(use_testing_server)) cli::cli_abort("Accessing file attachments is not yet supported on the testing servers.") is_dir <- fs::is_dir(path) if (is_dir) { checkmate::assert_directory_exists(path, access = "rw") } else { checkmate::assert_path_for_output(path, overwrite = TRUE) } temp_path <- fs::file_temp() response <- httr::RETRY(verb = "GET", url = url_api("s3_objects", s3_object_key, .use_testing_server = use_testing_server), httr::write_disk(path = temp_path), times = 3L) if (is_dir) { if (use_original_filename) { final_path <- response %>% httr::headers() %$% `content-disposition` %>% stringr::str_extract(pattern = "(?<=filename=\").+?(?=\")") %>% fs::path(path, .) } else { final_path <- fs::path(path, s3_object_key) } } else { final_path <- path } fs::file_move(path = temp_path, new_path = final_path) invisible(response) }
add_rfrnds
NOTES:
- For additions, the following variables are currently mandatory:
  - `country_code`
  - `subnational_entity_name` (only mandatory if `"level":"subnational"`)
  - `municipality` (only mandatory if `"level":"local"`)
  - `level`
  - `date`
  - `type`
  - `title_en`
  - `result`
  - `position_government` (only mandatory if `"country_code":"CH","level":"national"`)
  - `electorate_total`
  - `electorate_abroad`
  - `votes_yes`
  - `votes_no`
  - `votes_empty`
  - `votes_invalid`
- If the `draft` field is not provided, it defaults to `true`.
- Nested fields like `categories` can be partially provided.
- `country_name` is ignored and automatically derived from `country_code`.
- The `context.*` fields are ignored unless `"country_code":"CH","level":"national"`, cf. issue #52.
- The order of the JSON fields doesn't matter.
The API performs input validation on certain fields, and if it fails, it responds with a message like:
``` json
{"error": {"id": "validation_failed", "value": {"categories.author_of_the_vote_object": "invalid"}}}
```
A minimal sample cURL request to add a test entry looks as follows:
``` curl
curl 'https://services.c2d.ch/referendums' \
  --request POST \
  --header 'Content-Type: application/json' \
  --header 'Origin: https://c2d.ch' \
  --header 'Authorization: Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2NjI5MzExNDksInN1YiI6IjYyMjE1NDAxMTNiZWQ0MjBkNmZmYmMwYSJ9.06VA7kGcF2ajuTge_qAy7VD7UD0wX9QTwi7HIkntPQ0' \
  --data-raw '{"draft":true,"title":{"en":"test test test"},"date":"2222-02-22","level":"national","country_code":"AT","institution":"Mandatory referendum","votes_no":-1,"votes_yes":-1,"votes_empty":-1,"votes_invalid":-1,"total_electorate":-1,"citizens_abroad":-1,"result":"Unknown"}'
```
#' Add new referendums to the RDB #' #' Adds new referendum entries to the Referendum Database (RDB) via [its #' API](https://github.com/zdaarau/c2d-app/blob/master/docs/services.md#3-referendum-routes). #' #' @details #' Note that adding/editing the column `files` is not supported, i.e. it is simply dropped from `data`. #' #' @inheritParams url_api #' @param data The new referendum data. A [tibble][tibble::tbl_df] that in any case must contain the columns #' `r rfrnd_fields$required_for_additions %>% dplyr::case_match(.x = ., !!!var_names_fms, .default = .) %>% md_link_codebook() %>% pal::as_md_list()` #' #' plus the column [`subnational_entity_name`](`r url_codebook("subnational_entity_name")`) for referendums of #' [`level`](`r url_codebook("subnational_entity_name")`) below `"national"`, and the column [`municipality`](`r url_codebook("municipality")`) for referendums #' of `level = "local"`, #' #' plus any additional [valid][codebook] columns containing the values for the corresponding database fields. #' @param email The e-mail address of the RDB API user account to be used for authentication. A character scalar. #' @param password The password of the RDB API user account to be used for authentication. A character scalar. #' @param quiet Whether or not to print the newly created referendum IDs to console. #' #' @return A character vector of newly created referendum IDs. #' @family rfrnd #' @export add_rfrnds <- function(data, email = pal::pkg_config_val(key = "api_username", pkg = this_pkg), password = pal::pkg_config_val(key = "api_password", pkg = this_pkg), use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg), quiet = FALSE) { checkmate::assert_data_frame(data, min.rows = 1L) checkmate::assert_flag(quiet) ## ensure forbidden columns are absent assert_cols_absent(data = data, type = "add") ## ensure mandatory columns are present rfrnd_fields$required_for_additions %>% dplyr::case_match(.x = ., !!!var_names_fms, .default = .) 
%>% purrr::walk(~ if (!(.x %in% colnames(data))) cli::cli_abort(paste0("Mandatory column {.var ", .x, "} is missing from {.arg data}."))) # drop non-applicable columns (they're supposed to be absent in MongoDB) data %<>% drop_non_applicable_vars() ## ensure remaining columns are valid assert_cols_valid(data = data, type = "add") # convert data to MongoDB schema json_items <- data %>% # restore MongoDB fields untidy_rfrnds() %>% # convert to JSON purrr::map(jsonlite::toJSON, auto_unbox = TRUE, digits = NA) # add data to the MongoDB via HTTP POST on `/referendums` API endpoint responses <- json_items %>% purrr::map(\(x) { httr::RETRY(verb = "POST", url = url_api("referendums", .use_testing_server = use_testing_server), config = httr::add_headers(Origin = url_admin_portal(.use_testing_server = use_testing_server), Authorization = paste("Bearer", auth_session(email = email, password = password, use_testing_server = use_testing_server))), body = x, times = 3L, httr::content_type_json()) %>% # ensure we actually got a JSON response pal::assert_mime_type(mime_type = "application/json", msg_suffix = mime_error_suffix) %>% # extract JSON string httr::content(as = "text", encoding = "UTF-8") %>% # ensure body is not empty assert_content() %>% # convert to list jsonlite::fromJSON(simplifyDataFrame = FALSE, simplifyMatrix = FALSE) }) %>% # ensure no error occured assert_api_success() # throw warnings for unsuccessful API calls purrr::walk2(.x = responses, .y = seq_along(responses), .f = ~ if (!is.list(.x) || !isTRUE(nchar(.x$`_id`$`$oid`) > 0L)) { api_failure(.x, raw = json_items[[.y]], prefix = "Failed to add the {.y}. referendum. ") }) ids_new <- unlist(responses, use.names = FALSE) if (!quiet) { cli::cli_alert_info("New referendum entries created with {.var id}s:") cli::cli_li(ids_new) } invisible(ids_new) }
edit_rfrnds
NOTES:
- Currently, the following variables are mandatory even for edits:
  - `is_draft`
  - `electorate_total`
  - `electorate_abroad`
  - `votes_yes`
  - `votes_no`
  - `votes_empty`
  - `votes_invalid`
- Otherwise, only the variables that are supposed to change need to be supplied. Nested fields like `categories` can be partially overwritten, too.
- `country_name` is ignored and automatically derived from `country_code`.
- The `context.*` fields are ignored unless `"country_code":"CH","level":"national"`, cf. issue #52.
- The order of the JSON fields doesn't matter.
The API performs input validation on certain fields, and if it fails, it responds with a message like:
``` json
{"error": {"id": "validation_failed", "value": {"categories.author_of_the_vote_object": "invalid"}}}
```
A minimal sample cURL request to change a test entry's mandatory columns plus `title_fr` looks as follows:
``` sh
curl 'https://services.c2d.ch/referendums/610005e2c72633da60229938' \
  --request PUT \
  --header 'Content-Type: application/json' \
  --header 'Origin: https://c2d.ch' \
  --header 'Authorization: Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2NjI5MzExNDksInN1YiI6IjYyMjE1NDAxMTNiZWQ0MjBkNmZmYmMwYSJ9.06VA7kGcF2ajuTge_qAy7VD7UD0wX9QTwi7HIkntPQ0' \
  --data-raw '{"draft":true,"votes_yes":-1,"votes_no":-1,"votes_empty":-1,"votes_invalid":-1,"total_electorate":-1,"citizens_abroad":-1,"title":{"fr":"letest 99"}}'
```
#' Edit existing referendums in the RDB #' #' Edits existing referendum entries in the API](https://github.com/zdaarau/c2d-app/blob/master/docs/services.md#3-referendum-routes) via [its #' API](https://github.com/zdaarau/c2d-app/blob/master/docs/services.md#3-referendum-routes). #' #' @inherit add_rfrnds details #' #' @inheritParams add_rfrnds #' @param data Updated referendum data. A [tibble][tibble::tbl_df] that must contain an [`id`](`r url_codebook("id")`) column #' identifying the referendums to be edited plus any additional columns containing the new values to update the corresponding database fields with. Note that #' due to [current API requirements](https://github.com/zdaarau/c2d-app/issues/50#issuecomment-1222660683), the following columns must always be supplied: #' #' ```r #' rfrnd_fields$required_for_edits %>% #' dplyr::case_match(.x = ., #' !!!var_names_fms, #' .default = .) |> #' setdiff("id") |> #' md_link_codebook() |> #' pal::as_md_list() |> #' cat() #' ``` #' #' @return `data`, invisibly. #' @family rfrnd #' @export edit_rfrnds <- function(data, email = pal::pkg_config_val(key = "api_username", pkg = this_pkg), password = pal::pkg_config_val(key = "api_password", pkg = this_pkg), use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg)) { # ensure `data` is valid checkmate::assert_data_frame(data, min.rows = 1L) ## ensure forbidden columns are absent assert_cols_absent(data = data, type = "edit") ## ensure mandatory columns are present rfrnd_fields$required_for_edits %>% dplyr::case_match(.x = ., !!!var_names_fms, .default = .) %>% c("id") %>% purrr::walk(~ if (!(.x %in% colnames(data))) cli::cli_abort(paste0("Mandatory column {.var ", .x, "} is missing from {.arg data}."))) # drop non-applicable columns (they're absent in MongoDB) data %<>% drop_non_applicable_vars() ## ensure remaining columns are valid assert_cols_valid(data, type = "edit") # convert data to MongoDB schema ids <- data$id json_items <- data %>% # drop `id` dplyr::select(-id) %>% # restore MongoDB fields untidy_rfrnds() %>% # convert to JSON purrr::map(jsonlite::toJSON, auto_unbox = TRUE, digits = NA) # edit data in the MongoDB via HTTP PUT on `/referendums/{id}` API endpoint responses <- purrr::map2(.x = ids, .y = json_items, .f = ~ httr::RETRY(verb = "PUT", url = url_api("referendums", .x, .use_testing_server = use_testing_server), config = httr::add_headers(Origin = url_admin_portal(.use_testing_server = use_testing_server), Authorization = paste("Bearer", auth_session(email = email, password = password, use_testing_server = use_testing_server))), body = .y, times = 3L, httr::content_type_json()) %>% # ensure we actually got a JSON response pal::assert_mime_type(mime_type = "application/json", msg_suffix = mime_error_suffix) %>% # extract JSON string httr::content(as = "text", encoding = "UTF-8") %>% # ensure body is not empty assert_content()) # throw warnings for unsuccessful API calls purrr::walk(.x = seq_along(ids), .f = ~ { parsed <- jsonlite::fromJSON(responses[[.x]]) if (!isTRUE(parsed$ok)) { api_failure(parsed, raw = json_items[[.x]], prefix = paste0("Failed to edit referendum with {.var id} {.val ", ids[.x], "}. ")) } }) invisible(data) }
delete_rfrnds
#' Delete referendums in the RDB #' #' Deletes existing referendum entries in the Referendum Database (RDB) via [its #' API](https://github.com/zdaarau/c2d-app/blob/master/docs/services.md#3-referendum-routes). #' #' @inheritParams add_rfrnds #' @param ids IDs of the referendums to be deleted. A character vector. #' #' @return `ids`, invisibly. #' @family rfrnd #' @export delete_rfrnds <- function(ids, email = pal::pkg_config_val(key = "api_username", pkg = this_pkg), password = pal::pkg_config_val(key = "api_password", pkg = this_pkg), use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg)) { checkmate::assert_character(ids, min.chars = 1L, any.missing = FALSE, unique = TRUE) # TODO: remove this as soon as https://github.com/zdaarau/c2d-app/issues/45 is deployed to master if (!use_testing_server) { cli::cli_abort("Referendum deletions are not yet supported on the production servers.") } responses <- purrr::map(.x = ids, .f = ~ httr::RETRY(verb = "DELETE", url = url_api("referendums", .x, .use_testing_server = use_testing_server), config = httr::add_headers(Authorization = paste("Bearer", auth_session(email = email, password = password, use_testing_server = use_testing_server))), times = 3L) %>% # ensure we actually got a JSON response pal::assert_mime_type(mime_type = "application/json", msg_suffix = mime_error_suffix) %>% # extract JSON string httr::content(as = "text", encoding = "UTF-8") %>% # ensure body is not empty assert_content()) # throw warnings for unsuccessful API calls purrr::walk2(.x = ids, .y = responses, .f = ~ { parsed <- jsonlite::fromJSON(.y) if (!isTRUE(parsed$ok)) { api_failure(parsed, prefix = "Failed to delete referendum with {.var id} {.val {.x}}. ") } }) invisible(ids) }
validate_rfrnds
#' Validate referendum data #' #' Performs various data validation steps to ensure there are no errors in the supplied `data`. #' #' @param data Referendum data to validate, as returned by [rfrnds()]. #' @param check_applicability_constraint Whether or not to check that no applicability constraints as defined in the [codebook][data_codebook] are violated. #' @param check_id_sudd_prefix Whether or not to check that all [`id_sudd`](`r url_codebook("id_sudd")`) prefixes are valid. #' #' @return `data`, invisibly. #' @family rfrnd #' @export validate_rfrnds <- function(data, check_applicability_constraint = TRUE, check_id_sudd_prefix = TRUE) { checkmate::assert_data_frame(data, min.rows = 1L) checkmate::assert_subset(colnames(data), choices = rfrnd_cols_order) checkmate::assert_flag(check_applicability_constraint) checkmate::assert_flag(check_id_sudd_prefix) # check columns status_msg <- "Checking basic column validity..." cli_progress_id <- cli::cli_progress_step(msg = status_msg, msg_done = paste(status_msg, "done"), msg_failed = paste(status_msg, "failed")) assert_cols_valid(data = data, type = "validate", action = cli::cli_alert_warning, cli_progress_id = cli_progress_id) # check applicability constraints if (check_applicability_constraint) { status_msg <- "Asserting applicability constraints..." cli::cli_progress_step(msg = status_msg, msg_done = paste(status_msg, "done"), msg_failed = paste(status_msg, "failed")) var_names_violated <- data_codebook %>% dplyr::filter(variable_name %in% colnames(data) & !is.na(applicability_constraint)) %$% purrr::map2_lgl(.x = magrittr::set_names(x = variable_name, value = variable_name), .y = applicability_constraint, .f = ~ { data %>% dplyr::filter(!eval(parse(text = .y))) %$% eval(as.symbol(.x)) %>% { is.na(.) | purrr::map_lgl(., is.null) } %>% all() }) %>% magrittr::extract(!.) %>% names() n_var_names_violated <- length(var_names_violated) if (n_var_names_violated) { cli::cli_progress_done(result = "failed") cli::cli_alert_warning("Applicability constraints are violated for {n_var_names_violated} variable{?s}:") paste0("{.var ", var_names_violated, "}") %>% magrittr::set_names(rep("x", times = length(.))) %>% cli::cli_bullets() first_var_name_violated <- var_names_violated[1L] cli::cli({ cli::cli_text("\nTo get the applicability constraint of e.g. {.var {first_var_name_violated}}, run:") cli::cli_text("") cli::cli_code(c("rdb::data_codebook %>%", glue::glue(" dplyr::filter(variable_name == \"{first_var_name_violated}\") %$%"), " applicability_constraint")) cli::cli_text("") cli::cli_text("To inspect the entries in violation of the above applicability constraint, run:") cli::cli_text("") cli::cli_code(c("data %>%", glue::glue(" dplyr::filter(rdb::data_codebook %>%\n", " dplyr::filter(variable_name == \"{first_var_name_violated}\") %$%\n", " applicability_constraint %>%\n", " parse(text = .) %>%\n", " eval() %>%\n", " magrittr::not()) %>%\n", " dplyr::select(id, {first_var_name_violated})", .trim = FALSE))) }) } } # check `id_sudd` prefix if requested if (check_id_sudd_prefix) { status_msg <- "Validating `id_sudd` prefixes..." 
cli::cli_progress_step(msg = status_msg, msg_done = paste(status_msg, "done"), msg_failed = paste(status_msg, "failed")) if (!all(c("country_code", "id_sudd") %in% colnames(data))) { cli::cli_progress_done(result = "failed") cli::cli_abort("Columns {.var country_code} and {.var id_sudd} must be present in {.arg data}.") } # define allowed exceptions allowed_exceptions <- tibble::tribble( ~country_code, ~id_sudd_prefix, # Curacao "CW", "an", # Szeklerland, cf. https://sudd.ch/event.php?id=hu042008 "RO", "hu" ) # assemble target country codes country_codes <- data$country_code %>% as.character() %>% as.list() for (country_code in allowed_exceptions$country_code) { additional_country_codes <- allowed_exceptions %>% dplyr::filter(country_code == !!country_code) %$% id_sudd_prefix %>% stringr::str_to_upper() ix_country_codes <- country_codes %>% purrr::map_lgl(~ country_code %in% .x) %>% which() for (i in ix_country_codes) { country_codes[[i]] <- unique(c(country_codes[[i]], additional_country_codes)) } } # add dummy indicating if target country codes match # TODO: instead of modifying input data, print cli msg with all relevant info! data$matches_id_sudd_prefix <- data$id_sudd %>% stringr::str_extract(pattern = "^..") %>% stringr::str_to_upper() %>% purrr::map2_lgl(.y = country_codes, .f = ~ .x %in% .y) data$matches_id_sudd_prefix[is.na(data$id_sudd)] <- NA } invisible(data) }
count_rfrnds
#' Count number of referendums #' #' Counts the number of referendums per [`level`](`r url_codebook("level")`) in the Referendum Database (RDB). #' #' @inheritParams assemble_query_filter #' @inheritParams url_api #' #' @return A named list with `level` as names and referendum counts as values. #' @family rfrnd #' @export #' #' @examples #' # the whole database (excl. drafts) #' rdb::count_rfrnds() #' #' # only Swiss and Austrian referendums #' rdb::count_rfrnds(country_code = c("CH", "AT")) #' #' # only Swiss referendums created between 2020 and 2021 #' rdb::count_rfrnds(country_code = "CH", #' date_time_created_min = "2020-01-01", #' date_time_created_max = "2021-01-01") count_rfrnds <- function(is_draft = FALSE, country_code = NULL, subnational_entity_name = NULL, municipality = NULL, level = NULL, type = NULL, date_min = NULL, date_max = NULL, date_time_created_min = NULL, date_time_created_max = NULL, date_time_last_edited_min = NULL, date_time_last_edited_max = NULL, query_filter = NULL, use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg)) { httr::RETRY(verb = "GET", url = url_api("referendums/stats", .use_testing_server = use_testing_server), query = list(filter = assemble_query_filter(country_code = country_code, subnational_entity_name = subnational_entity_name, municipality = municipality, level = level, type = type, date_min = date_min, date_max = date_max, is_draft = is_draft, date_time_created_min = date_time_created_min, date_time_created_max = date_time_created_max, date_time_last_edited_min = date_time_last_edited_min, date_time_last_edited_max = date_time_last_edited_max, query_filter = query_filter)), config = httr::add_headers(Origin = url_admin_portal(.use_testing_server = use_testing_server)), times = 3L) %>% # ensure we actually got a JSON response pal::assert_mime_type(mime_type = "application/json", msg_suffix = mime_error_suffix) %>% # parse response httr::content(as = "parsed") %$% votes %>% magrittr::set_names(names(.) %>% dplyr::case_match(.x = ., "sub_national" ~ "subnational", .default = .)) }
rfrnd_exists
#' Test if referendum ID exists #' #' Tests whether the referendum with the supplied `id` exists or not. #' #' @inheritParams rfrnd #' @inheritParams url_api #' #' @return A logical scalar. #' @family rfrnd #' @export #' #' @examples #' rdb::rfrnd_exists("6303a4cba52c3995043a8c24") rfrnd_exists <- function(id, .use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg)) { checkmate::assert_string(id, min.chars = 1L) httr::GET(url = url_api("referendums", id, .use_testing_server = .use_testing_server), config = httr::add_headers(Origin = url_admin_portal(.use_testing_server = .use_testing_server))) %>% httr::http_error() %>% magrittr::not() }
assert_vars
#' Assert referendum variables are present #' #' Asserts the specified `vars` are present in the supplied referendum `data`. Depending on `vars`, additional integrity checks are performed. #' #' @param data RDB referendum data as returned by [rfrnds()]. #' @param vars Names of the variables to check. A character vector. #' #' @return `data`, invisibly. #' @family rfrnd #' @export #' #' @examples #' rdb::rfrnd(id = "5bbbe26a92a21351232dd73f") |> rdb::assert_vars(vars = "country_code") #' #' try( #' tibble::tibble(country_code = "AN") |> rdb::assert_vars(vars = "country_code") #' ) assert_vars <- function(data, vars) { vars %>% purrr::walk(~ { msg_suffix <- switch(EXPR = .x, country_code = " with ISO 3166-1 alpha-2 or ISO 3166-3 alpha-4 codes.", "") if (!(.x %in% colnames(data))) { cli::cli_abort(paste0("{.arg data} must contain a column {.var {.x}}", msg_suffix)) } # run additional content check assert_content <- switch(EXPR = .x, country_code = \(x) { checkmate::assert_vector(x = x, .var.name = "data$country_code") check <- checkmate::check_subset(x = as.character(x), choices = val_set$country_code) if (!isTRUE(check)) { expired_codes <- intersect(as.character(x), data_iso_3166_3$Alpha_2) cli::cli_abort(paste0( "Assertion on {.var data$country_code} failed: ", ifelse(length(expired_codes), paste0("The following country codes have been deleted from ISO 3166-1 and were moved to ISO ", "3166-3 (former countries) instead: {.val {expired_codes}}"), # escape curly braces from checkmate msg stringr::str_replace_all(string = check, pattern = "([\\{\\}])", replacement = "\\1\\1")))) } }, \(x) TRUE) assert_content(data[[.x]]) }) invisible(data) }
data_codebook
#' RDB Codebook #' #' A tibble containing the complete metadata of all [rfrnds()] variables. The codebook below is also available [online](`r url_codebook()`). #' #' # Codebook #' #' ```r #' ``` #' #' @format `r pkgsnip::return_lbl("tibble")` #' @aliases codebook #' @family metadata #' @export #' #' @examples #' rdb::data_codebook "data_codebook"
val_lbls
#' Get set of possible *value labels* of referendum data variable #' #' Returns a character vector of value labels of a specific [rfrnds()] column, in the same order as [var_vals()], or of length `0` if `var_name`'s values are #' not restricted to a predefined set or no value labels are defined in the [codebook][data_codebook]. #' #' @param var_name Variable name present in [`data_codebook`] for which the labels are to be returned. A character scalar. #' @param incl_affixes Whether or not to add the corresponding `value_label_prefix` and `value_label_suffix` to the returned labels. #' #' @return A character vector. Of length `0` if `var_name`'s values are not restricted to a predefined set or no value labels are defined in the #' [codebook][data_codebook]. #' @family metadata #' @export #' #' @examples #' rdb::val_lbls("result", #' incl_affixes = FALSE) #' rdb::val_lbls("result") #' #' # Convert the labels to sentence case with trailing dot #' rdb::val_lbls("result") |> pal::sentenceify() val_lbls <- function(var_name, incl_affixes = TRUE) { var_name <- rlang::arg_match0(arg = var_name, values = rfrnd_cols_order) metadata <- data_codebook |> dplyr::filter(variable_name == !!var_name | variable_name_unnested == !!var_name) result <- metadata$value_labels |> purrr::list_c(ptype = character()) if (incl_affixes) { if (!is.na(metadata$value_label_prefix)) result <- paste(metadata$value_label_prefix, result) if (!is.na(metadata$value_label_suffix)) result <- paste(metadata$value_label_suffix, result) } result }
val_scale
#' Get *value scale* of referendum data variables #' #' Returns the value scale of the specified [rfrnds()] columns. #' #' @param var_names Variable name(s) present in [`data_codebook`] for which the value scale is to be returned. A character vector. #' #' @return A character scalar. #' @family metadata #' @export #' #' @examples #' rdb::val_scale("level") #' paste0("topics_tier_", 1:3) |> rdb::val_scale() val_scale <- function(var_names) { var_name <- rlang::arg_match(arg = var_names, values = rfrnd_cols_order, multiple = TRUE) c(data_codebook$value_scale, data_codebook$value_scale)[match(x = var_names, table = c(data_codebook$variable_name, data_codebook$variable_name_unnested))] }
var_vals
#' Get set of possible *values* of referendum data variable #' #' Returns a vector of the possible predefined values a specific column in [rfrnds()] can hold. If the variable values aren't restricted to a predefined #' set, `NULL` is returned. #' #' @param var_name Variable name present in [`data_codebook`]. A character scalar. #' #' @return #' If `var_name`'s values are restricted to a predefined set and #' - `var_name` is *not* of type list, a vector of the same type as `var_name`. #' - `var_name` is of type list, a vector of the same type as the elements of `var_name`. #' #' Else `NULL`. #' @family metadata #' @export #' #' @examples #' rdb::var_vals("result") #' rdb::var_vals("id") var_vals <- function(var_name) { var_name <- rlang::arg_match0(arg = var_name, values = rfrnd_cols_order) data_codebook |> dplyr::filter(variable_name == !!var_name | variable_name_unnested == !!var_name) %$% variable_values |> unlist() }
var_name_unnested
#' Get unnested variable names #' #' Returns the unnested analogue(s) of the specified variable name(s), which result from [unnesting][unnest_var]. For variable names that do *not* refer to #' nested list columns, `var_names` is simply returned as-is. #' #' @inheritParams prettify_var_names #' #' @return A character vector of the same length as `var_names`. #' @family metadata #' @family unnest #' @export #' #' @examples #' rdb::var_name_unnested("inst_object_revision_modes") #' rdb::var_name_unnested(paste0("topics_tier_", 1:3)) var_name_unnested <- function(var_names) { var_names <- rlang::arg_match(arg = var_names, values = data_codebook$variable_name, multiple = TRUE) data_codebook |> dplyr::filter(variable_name %in% !!var_names) %$% variable_name_unnested }
prettify_var_names
#' Prettify referendum data variable names #' #' Converts referendum data variable names to their ready-for-publication version. Variable names that are unknown, i.e. not present in [`data_codebook`]), are #' left untouched. #' #' @param var_names Variable name(s). Those not present in [`data_codebook`] remain untouched. A character vector. #' #' @return A character vector of the same length as `var_names`. #' @family metadata #' @family prettify #' @export #' #' @examples #' rdb::prettify_var_names("topics_tier_1") #' #' # also supports unnested var names #' rdb::prettify_var_names("topic_tier_1") #' #' # unknown var names are left untouched #' rdb::prettify_var_names(var_names = c("topic_tier_1", "topic_tier_99")) prettify_var_names <- function(var_names) { checkmate::assert_character(var_names) c(data_codebook$variable_name_print, data_codebook$variable_name_unnested_print)[match(x = var_names, table = c(data_codebook$variable_name, data_codebook$variable_name_unnested))] %|% var_names }
The classification of political topics assigned to referendums was developed together with Swissvotes, the Institute of Federalism of the University of Fribourg and the Politics Section of the Federal Statistical Office. Any modifications should be coordinated with these parties.
Functions to work with referendum topics.
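For quick orientation, the following short sketch (adapted from the examples in the chunks below and relying only on the package's bundled topic data) lists the available first-tier topics and looks up the tier-1 parent of a lower-tier topic:

```r
# List all available first-tier topics.
rdb::topics(tiers = 1L)

# Determine the tier-1 (grand)parent topic of a lower-tier topic.
rdb::infer_topics(topics = "animal protection",
                  tier = 1L)
```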
data_topics
#' Topic hierarchy #' #' A tibble reflecting the complete [referendum topics hierarchy](`r url_codebook("topics")`). #' #' @format `r pkgsnip::return_lbl("tibble")` #' @family topics #' @export #' #' @examples #' rdb::data_topics "data_topics"
topics
#' List available topics #' #' Lists the set of available [referendum topics](`r url_codebook("topics")`) on the specified `tiers`. #' #' @param tiers Tiers to include topics from. An integerish vector. #' #' @return A character vector. #' @family topics #' @export #' #' @examples #' rdb::topics(tiers = 1:2) topics <- function(tiers = 1:3) { checkmate::assert_integerish(tiers, lower = 1L, upper = 3L, any.missing = FALSE, unique = TRUE) topic_set <- character() if (1L %in% tiers) { topic_set %<>% c(data_topics$topic_tier_1) } if (2L %in% tiers) { topic_set %<>% c(data_topics$topic_tier_2) } if (3L %in% tiers) { topic_set %<>% c(data_topics$topic_tier_3) } topic_set %>% setdiff(NA_character_) %>% unique() }
hierarchize_topics
#' Hierarchize topics #' #' Reconstructs the hierarchical relations between the three topic variables `topics_tier_1`, `topics_tier_2` and `topics_tier_3`. Can also be used to simply #' determine the parent topic(s) of any topic. #' #' @param x The topics to hierarchize. Either a character vector of topics or a single-row data frame containing at least the columns `topics_tier_1`, #' `topics_tier_2` and `topics_tier_3`. #' #' @return A [tibble][tibble::tbl_df] with the columns `topic_tier_1`, `topic_tier_2` and `topic_tier_3`. #' @family topics #' @export #' #' @examples #' rdb::hierarchize_topics("territorial questions") #' #' # hierarchize the topics of all Austrian referendums #' rdb::rfrnds(quiet = TRUE) |> #' dplyr::filter(country_code == "AT") |> #' dplyr::group_split(id) |> #' purrr::map(rdb::hierarchize_topics) hierarchize_topics <- function(x) { test_char <- checkmate::test_character(x, any.missing = FALSE) if (!test_char) { topic_var_names <- paste0("topics_tier_", 1:3) test_df <- checkmate::test_data_frame(x, min.rows = 1L, max.rows = 1L) has_topic_vars <- all(topic_var_names %in% colnames(x)) if (!test_df || !has_topic_vars) { cli::cli_abort(paste0("{.arg x} must be either a character vector of topics or a single-row data frame containing at least the columns ", "{.field topics_tier_1}, {.field topics_tier_2} and {.field topics_tier_3}.")) } x <- unlist(x[, topic_var_names], use.names = FALSE) } checkmate::assert_subset(x, choices = c(topics_tier_1_, topics_tier_2_, topics_tier_3_), empty.ok = TRUE) topics_tier_1 <- x[x %in% topics_tier_1_] topics_tier_2 <- x[x %in% topics_tier_2_] topics_tier_3 <- x[x %in% topics_tier_3_] inferred_topics_tier_1 <- infer_topics(topics = c(topics_tier_2, topics_tier_3), tier = 1L) inferred_topics_tier_2 <- infer_topics(topics = topics_tier_3, tier = 2L) non_parent_topics_tier_1 <- setdiff(topics_tier_1, inferred_topics_tier_1) non_parent_topics_tier_2 <- setdiff(topics_tier_2, inferred_topics_tier_2) # 0. initialize empty tibble result <- tibble::tibble(topic_tier_1 = character(), topic_tier_2 = character(), topic_tier_3 = character()) # 1. add third-tier topics result <- topics_tier_3 %>% purrr::map(~ tibble::tibble(topic_tier_1 = infer_topics(topics = .x, tier = 1L), topic_tier_2 = infer_topics(topics = .x, tier = 2L), topic_tier_3 = .x)) %>% purrr::list_rbind() %>% dplyr::bind_rows(result) # 2. add remaining second-tier topics result <- non_parent_topics_tier_2 %>% purrr::map(~ tibble::tibble(topic_tier_1 = infer_topics(topics = .x, tier = 1L), topic_tier_2 = .x, topic_tier_3 = NA_character_)) %>% purrr::list_rbind() %>% dplyr::bind_rows(result) # 3. add remaining first-tier topics result %>% dplyr::bind_rows(tibble::tibble(topic_tier_1 = non_parent_topics_tier_1, topic_tier_2 = NA_character_, topic_tier_3 = NA_character_)) %>% # sort result dplyr::arrange(topic_tier_1, topic_tier_2, topic_tier_3) }
hierarchize_topics_fast
#' Hierarchize topics (fast) #' #' Reconstructs the hierarchical relations between the three topic variables `topics_tier_1`, `topics_tier_2` and `topics_tier_3`. Other than #' [hierarchize_topics()], this function assumes that the three topic variables are always *complete*, i.e. that no (grand)parent topics of lower-tier topics #' are missing. This assumption is met by [rfrnds()] and [rfrnd()]. #' #' @param topics_tier_1 First-tier topics. A character vector. #' @param topics_tier_2 Second-tier topics. A character vector. #' @param topics_tier_3 Third-tier topics. A character vector. #' #' @inherit hierarchize_topics return #' @family topics #' @export #' #' @examples #' library(magrittr) #' #' rdb::rfrnd(id = "5bbbe26a92a21351232dd73f") %$% #' rdb::hierarchize_topics_fast(unlist(topics_tier_1), #' unlist(topics_tier_2), #' unlist(topics_tier_3)) #' #' # hierarchize the topics of all Austrian referendums #' rdb::rfrnds(quiet = TRUE) |> #' dplyr::filter(country_code == "AT") |> #' dplyr::group_split(id) |> #' purrr::map(~ rdb::hierarchize_topics_fast(unlist(.x$topics_tier_1), #' unlist(.x$topics_tier_2), #' unlist(.x$topics_tier_3))) hierarchize_topics_fast <- function(topics_tier_1 = character(), topics_tier_2 = character(), topics_tier_3 = character()) { checkmate::assert_subset(topics_tier_1, choices = topics_tier_1_) checkmate::assert_subset(topics_tier_2, choices = topics_tier_2_) checkmate::assert_subset(topics_tier_3, choices = topics_tier_3_) # add tier-3 hierarchy result <- data_topics[data_topics$topic_tier_3 %in% topics_tier_3, ] # add non-parent tier-2 hierarchy topics_tier_2 %<>% setdiff(result$topic_tier_2) result %<>% dplyr::bind_rows(unique(data_topics[data_topics$topic_tier_2 %in% topics_tier_2, 1:2])) # add non-parent tier-1 topics topics_tier_1 %<>% setdiff(result$topic_tier_1) result %>% dplyr::bind_rows(tibble::tibble(topic_tier_1 = topics_tier_1)) }
infer_topics
NOTES:
Performance-wise, this fn is pretty much optimized.
A more straightforward but considerably slower main fn body would be:

```r
data_topics %>%
  dplyr::filter(dplyr::if_any(.cols = c(topic_tier_2, topic_tier_3),
                              .fns = ~ .x %in% topics)) %>%
  dplyr::select(paste0("topic_tier_", tier)) %>%
  purrr::list_c(ptype = character()) %>%
  unique()
```
#' Infer higher-tier topics #' #' Determines the top-tier (`tier = 1L`) or second-tier (`tier = 2L`) topics corresponding to `topics` in the #' [hierarchy][data_topics], i.e. either `topics` themselves or their (grand)parent topics. #' #' @param topics Topics from which the corresponding (grand)parent topics are to be determined. A factor or character vector. #' @param tier Tier of the inferred topics. Either `1L` or `2L`. #' #' @return A character vector. #' @family topics #' @export #' #' @examples #' rdb::infer_topics(topics = c("EU", "animal protection"), #' tier = 1L) #' rdb::infer_topics(topics = c("EU", "animal protection"), #' tier = 2L) #' #' # topics of different tiers can mixed in `topics` #' rdb::infer_topics(topics = c("EU", "environment"), #' tier = 2L) #' #' # but `topics` of a higher tier than `tier` will be ignored #' rdb::infer_topics(topics = "foreign policy", #' tier = 2L) infer_topics <- function(topics, tier = 1L) { if (is.factor(topics)) topics <- as.character(topics) checkmate::assert_subset(topics, choices = c(topics_tier_1_, topics_tier_2_, topics_tier_3_)) checkmate::assert_int(tier, lower = 1L, upper = 2L) # inferred from lower-tier topics result <- data_topics[data_topics$topic_tier_2 %in% topics | data_topics$topic_tier_3 %in% topics, ] result %<>% .[[paste0("topic_tier_", tier)]] # plus top-tier topics if (tier == 1L) result %<>% c(topics[topics %in% topics_tier_1_]) unique(result) }
Functions to augment the RDB referendum data with additional information (columns).
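These augmentation functions can be chained. A minimal sketch (assuming the RDB API is reachable and the standard columns `country_code`, `date`, `electorate_total` and `votes_*` are present in the downloaded data):

```r
# Enrich the raw referendum data step by step with UN world regions, the decade
# of the ballot date and the (rough) voter turnout.
rdb::rfrnds(quiet = TRUE) |>
  rdb::add_world_regions() |>
  rdb::add_period(period = "decade") |>
  rdb::add_turnout() |>
  dplyr::select(id, country_code, un_region_tier_1_name, decade, turnout)
```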
add_former_country_flag
#' Add `is_former_country` flag to referendum data #' #' Augments `data` with an additional column `is_former_country` indicating whether or not the column `country_code` holds an [ISO 3166-3 #' alpha-4 code](https://en.wikipedia.org/wiki/ISO_3166-3) referring to a historical country which ceased to exist. `is_former_country` being `FALSE` means #' `country_code` holds an [ISO 3166-1 alpha-2 code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) instead. #' #' @inheritParams add_world_regions #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family augment #' @export #' #' @examples #' rdb::rfrnds() |> #' rdb:::add_former_country_flag() |> #' dplyr::select(id, #' starts_with("country_"), #' is_former_country) add_former_country_flag <- function(data) { # ensure minimal validity checkmate::assert_data_frame(data) assert_vars(data = data, vars = "country_code") data %>% dplyr::mutate(is_former_country = nchar(as.character(country_code)) > 2L) %>% # add var lbl labelled::set_variable_labels(.labels = var_lbls["is_former_country"]) }
add_country_code_continual
TODO: `country_code` and `country_code_continual`
.#' Add continual country code to referendum data #' #' Augments `data` with an additional column `country_code_continual` holding the current or future [ISO 3166-1 #' alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) code of the country where the referendum took place. If the country still exists, #' `country_code_continual` is identical to `country_code`, otherwise it is the `country_code` of the successor country. If the country was succeeded by #' multiple countries, the code of the largest one in terms of population is taken. #' #' @inheritParams add_world_regions #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family augment #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb::add_country_code_continual() |> #' dplyr::select(id, #' starts_with("country_")) add_country_code_continual <- function(data) { # ensure minimal validity checkmate::assert_data_frame(data) assert_vars(data = data, vars = "country_code") data %>% dplyr::mutate(country_code_continual = factor(x = purrr::map2_chr(.x = as.character(country_code), .y = add_former_country_flag(data)$is_former_country, .f = ~ { if (.y) { data_iso_3166_3$Alpha_2_new_main[data_iso_3166_3$Alpha_4 == .x] } else { .x } }), levels = val_set$country_code_continual, ordered = FALSE)) %>% # add var lbl labelled::set_variable_labels(.labels = var_lbls["country_code_continual"]) }
add_country_code_long
#' Add long country code to referendum data #' #' Augments `data` with an additional column holding the current or former three-letter [ISO 3166-1 alpha-3](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3) #' code of the country in which the referendum took place (see [ISO 3166-3](https://en.wikipedia.org/wiki/ISO_3166-3_alpha-3) for former country codes). #' #' @inheritParams add_world_regions #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family augment #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb:::add_country_code_long() |> #' dplyr::select(id, #' starts_with("country_")) add_country_code_long <- function(data) { # ensure minimal validity checkmate::assert_data_frame(data) assert_vars(data = data, vars = "country_code") data %>% # remove possibly existing long country code dplyr::select(-any_of("country_code_long")) %>% # add long country code dplyr::mutate(country_code_long = factor(x = purrr::map2_chr(.x = as.character(country_code), .y = add_former_country_flag(data)$is_former_country, .f = ~ if (.y) { data_iso_3166_3$Alpha_3[data_iso_3166_3$Alpha_4 == .x] } else { data_iso_3166_1$Alpha_3[data_iso_3166_1$Alpha_2 == .x] }), levels = val_set$country_code_long, ordered = FALSE)) %>% # ensure no NAs assertr::assert(predicate = assertr::not_na, country_code_long) %>% order_rfrnd_cols() %>% # add var lbl labelled::set_variable_labels(.labels = var_lbls["country_code_long"]) }
add_country_name
#' Add short country name to referendum data #' #' Augments `data` with an additional column holding the common English name of the country in which the referendum took place. #' #' @inheritParams add_world_regions #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family augment #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb:::add_country_name() |> #' dplyr::select(id, #' starts_with("country_")) add_country_name <- function(data) { # ensure minimal validity checkmate::assert_data_frame(data) assert_vars(data = data, vars = "country_code") data %>% # remove possibly existing country name dplyr::select(-any_of("country_name")) %>% # add country name dplyr::mutate(country_name = factor(x = purrr::map2_chr(.x = as.character(country_code), .y = add_former_country_flag(data)$is_former_country, .f = ~ if (.y) { data_iso_3166_3$name_short[data_iso_3166_3$Alpha_4 == .x] } else { data_iso_3166_1$name_short[data_iso_3166_1$Alpha_2 == .x] }), levels = val_set$country_name, ordered = FALSE)) %>% # ensure no NAs assertr::assert(predicate = assertr::not_na, country_name) %>% order_rfrnd_cols() %>% # add var lbl labelled::set_variable_labels(.labels = var_lbls["country_name"]) }
add_country_name_long
#' Add long country name to referendum data #' #' Augments `data` with an additional column holding the official full English name(s) of the country in which the referendum took place. #' #' @inheritParams add_world_regions #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family augment #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb:::add_country_name_long() |> #' dplyr::select(id, #' starts_with("country_name")) add_country_name_long <- function(data) { # ensure minimal validity checkmate::assert_data_frame(data) assert_vars(data = data, vars = "country_code") data %>% # remove possibly existing long country name dplyr::select(-any_of("country_name_long")) %>% # add long country name dplyr::mutate(country_name_long = factor(x = purrr::map2_chr(.x = as.character(country_code), .y = add_former_country_flag(data)$is_former_country, .f = ~ if (.y) { data_iso_3166_3$name_long[data_iso_3166_3$Alpha_4 == .x] } else { data_iso_3166_1$name_long[data_iso_3166_1$Alpha_2 == .x] }), levels = val_set$country_name_long, ordered = FALSE)) %>% # ensure no NAs assertr::assert(predicate = assertr::not_na, country_name_long) %>% order_rfrnd_cols() %>% # add var lbl labelled::set_variable_labels(.labels = var_lbls["country_name_long"]) }
add_period
#' Add period to referendum data #' #' Augments `data` with an additional column holding the specified period in which the referendum took place. The new column is named after `period` and its #' values are always of type integer. #' #' ```r #' ``` #' #' @param data RDB referendum data as returned by [rfrnds()]. A data frame that at minimum contains the column `date`. #' @param period Type of period to add. One of #' `r pal::fn_param_defaults(fn = add_period, param = "period") |> pal::wrap_chr("\x60") |> cli::ansi_collapse(sep2 = " or ", last = " or ")`. #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family augment #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb::add_period() |> #' dplyr::select(id, date, week) #' #' rdb::rfrnds(quiet = TRUE) |> #' rdb::add_period("year") |> #' dplyr::select(id, date, year) add_period <- function(data, period = c("week", "month", "quarter", "year", "decade", "century")) { checkmate::assert_data_frame(data) period <- rlang::arg_match(period) assert_vars(data = data, vars = "date") # define necessary date transformations get_period <- switch(EXPR = period, week = function(x) clock::as_iso_year_week_day(x) %>% clock::get_week(), month = function(x) clock::get_month(x), quarter = function(x) clock::as_year_quarter_day(x) %>% clock::get_quarter(), year = function(x) clock::get_year(x), decade = function(x) (clock::get_year(x) %/% 10L) * 10L, century = function(x) (clock::get_year(x) %/% 100L) * 100L) # define lbl parts period_lbl <- switch(EXPR = period, week = glue::glue("{period} (1\u201353)"), month = glue::glue("{period} (1\u201312)"), quarter = glue::glue("{period} (1\u20134)"), period) data %>% # add period dplyr::mutate(!!as.symbol(period) := get_period(date)) %>% # harmonize col order order_rfrnd_cols() %>% # add var lbl labelled::set_variable_labels(.labels = var_lbls[period]) }
add_turnout
TODO: change the default of `rough` to `FALSE`
.#' Add turnout to referendum data #' #' @description #' Augments `data` with an additional column `turnout` containing the voter turnout calculated as: #' #' \Sexpr[results=rd, stage=build]{ #' katex::math_to_rd(tex = "\\\\frac{votes\\\\_yes+votes\\\\_no+votes\\\\_empty+votes\\\\_invalid}{electorate\\\\_total}", #' ascii = "(votes_yes + votes_no + votes_empty + votes_invalid) / electorate_total", #' displayMode = TRUE) #' } #' #' @param data RDB referendum data as returned by [rfrnds()]. A data frame that at minimum contains the columns `electorate_total`, `votes_yes`, `votes_no`, #' `votes_empty` and `votes_invalid`. #' @param rough Whether to fall back on a "rough" calculation of the turnout in case any of the variables `votes_empty` or `votes_invalid` is unknown (`NA`), or #' to be strict and return `NA` in such a case. #' @param excl_dubious Whether or not to exclude obviously dubious turnout numbers (those > 1.0) by setting them to `NA`. Such numbers stem either from #' data errors or (officially) tampered numbers. #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family augment #' @export #' #' @examples #' # rough turnout numbers #' rdb::rfrnds(quiet = TRUE) |> #' rdb::add_turnout() |> #' dplyr::select(id, #' electorate_total, #' starts_with("votes_"), #' turnout) #' #' # strict turnout numbers #' rdb::rfrnds(quiet = TRUE) |> #' rdb::add_turnout(rough = FALSE) |> #' dplyr::select(id, #' electorate_total, #' starts_with("votes_"), #' turnout) add_turnout <- function(data, rough = TRUE, excl_dubious = TRUE) { checkmate::assert_data_frame(data) checkmate::assert_flag(rough) checkmate::assert_flag(excl_dubious) assert_vars(data = data, vars = c("electorate_total", "votes_yes", "votes_no", "votes_empty", "votes_invalid")) data %>% dplyr::rowwise() %>% dplyr::mutate(turnout = sum(votes_yes, votes_no, votes_empty, votes_invalid, na.rm = rough) / electorate_total) %>% dplyr::ungroup() %>% # set dubious turnout numbers to NA if requested dplyr::mutate(turnout = dplyr::if_else(excl_dubious & turnout > 1.0, NA_real_, turnout)) %>% # harmonize col order order_rfrnd_cols() %>% # add var lbl labelled::set_variable_labels(turnout = var_lbls[["turnout"]] %>% ifelse(test = rough, yes = stringr::str_replace(string = ., pattern = stringr::fixed("turnout"), replacement = "turnout (rough)"), no = .)) }
add_world_regions
#' Add UN world regions to referendum data #' #' @description #' Augments `data` with information about the [United Nations (UN) geoscheme](https://en.wikipedia.org/wiki/United_Nations_geoscheme) on three different #' grouping tiers based on the [UN M49 area code hierarchy](https://en.wikipedia.org/wiki/UN_M49#Code_lists). #' #' In total, eight different columns are added: #' - `un_country_code`: UN M49 country code #' - `un_region_tier_1_code`: UN tier-1 region's M49 area code #' - `un_region_tier_1_name`: UN tier-1 region's English name #' - `un_region_tier_2_code`: UN tier-2 region's M49 area code #' - `un_region_tier_2_name`: UN tier-2 region's English name #' - `un_region_tier_3_code`: UN tier-3 region's M49 area code #' - `un_region_tier_3_name`: UN tier-3 region's English name #' - `un_subregion`: Combinatiorial English UN subregion name which, except for Northern Europe, corresponds to the lowest `un_region_tier_*_name`. #' #' Tier-1 regions are the highest, i.e. most aggregated UN regions, commonly referred to as continents. Tier-2 regions are also known as "subregions" and tier-3 #' regions as "sub-subregions". #' #' Only part of all UN tier-2 regions are further divided into UN tier-3 regions, meaning that not all countries are part of a UN tier-3 region. If a country #' doesn't belong to any UN tier-3 region, the corresponding `un_region_tier_3_*` values will simply be `NA`. The `un_subregion` column specifically addresses #' this issue by providing a uniform combination of `un_region_tier_2_name` and `un_region_tier_3_name`. #' #' @param data RDB referendum data as returned by [rfrnds()]. A data frame that at minimum contains the column `country_code` (with [ISO 3166-1 #' alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) or [ISO 3166-3 alpha-4](https://en.wikipedia.org/wiki/ISO_3166-3) codes). #' @param add_un_country_code Whether or not to also add a column `un_country_code` holding the UN M49 code of the country in which the referendum took place. 
#' #' @return `r pkgsnip::return_lbl("tibble")` #' @family augment #' @export #' #' @examples #' rdb::rfrnd(id = "5bbbe26a92a21351232dd73f") |> #' rdb::add_world_regions() |> #' dplyr::select(id, #' starts_with("country_"), #' starts_with("un_")) add_world_regions <- function(data, add_un_country_code = TRUE) { # ensure minimal validity checkmate::assert_data_frame(data) assert_vars(data = data, vars = "country_code") checkmate::assert_flag(add_un_country_code) has_country_code_continual <- "country_code_continual" %in% colnames(data) # add UN regions to input data data %<>% # temporarily add required base var `country_code_continual` if necessary add_country_code_continual() %>% # remove possibly existing UN region vars dplyr::select(-any_of(setdiff(colnames(un_regions), "country_code"))) %>% # add UN regions dplyr::left_join(y = un_regions, by = c(country_code_continual = "country_code")) %>% # ensure every row got at least a UN tier-1 region assigned assertr::assert(predicate = assertr::not_na, un_region_tier_1_code) %>% # harmonize col order order_rfrnd_cols() %>% # add var lbl labelled::set_variable_labels(.labels = purrr::keep_at(x = var_lbls, at = c("un_country_code", "un_region_tier_1_code", "un_region_tier_1_name", "un_region_tier_2_code", "un_region_tier_2_name", "un_region_tier_3_code", "un_region_tier_3_name", "un_subregion"))) # drop vars if necessary/requested if (!has_country_code_continual) { data %<>% dplyr::select(-country_code_continual) } if (!add_un_country_code) { data %<>% dplyr::select(-un_country_code) } data }
add_urls
#' Add various URLs to referendum data #' #' Augments `data` with additional columns holding URLs of the specified `types`. The new columns will be named after `types`, prefixed with `url_`, so #' `types = "sudd"` will add the column `url_sudd` etc. #' #' @param data RDB referendum data as returned by [rfrnds()]. A data frame that at minimum contains the column `id_sudd` for `types = "sudd"` and the columns #' `country_code`, `level` and `id_official` for `types = "swissvotes"`. #' @param types Type(s) of URLs to add. One or more of #' `r pal::fn_param_defaults(fn = add_urls, param = "types") |> pal::wrap_chr("\x60") |> cli::ansi_collapse(sep2 = " or ", last = " or ")`. #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family augment #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' dplyr::filter(country_code == "CH" & level == "national") |> #' rdb::add_urls() |> #' dplyr::select(id, #' country_code, #' level, #' starts_with("id_"), #' starts_with("url_")) add_urls <- function(data, types = c("sudd", "swissvotes")) { checkmate::assert_data_frame(data) types <- rlang::arg_match(arg = types, multiple = TRUE) if ("sudd" %in% types) { assert_vars(data = data, vars = "id_sudd") data %<>% dplyr::mutate(url_sudd = dplyr::if_else(is.na(id_sudd), NA_character_, url_sudd(glue::glue("event.php?id={id_sudd}")))) } if ("swissvotes" %in% types) { assert_vars(data = data, vars = c("country_code", "level", "id_official")) data %<>% dplyr::mutate(url_swissvotes = dplyr::if_else(country_code == "CH" & level == "national" & !is.na(id_official), paste0("https://swissvotes.ch/vote/", id_official), # nolint: paste_linter NA_character_)) } data }
Functions to transform the RDB referendum data into other shapes, each with a specific purpose.
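For example, a transformation pipeline combining the functions below might look as follows (a sketch assuming the RDB API is reachable and the multi-value column `topics_tier_1` is present):

```r
# Unnest the multi-value tier-1 topics variable and count the number of
# referendums per (unnested) first-tier topic.
rdb::rfrnds(quiet = TRUE) |>
  rdb::unnest_var(var = topics_tier_1) |>
  rdb::n_rfrnds(by = topic_tier_1)
```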
as_ballot_dates
#' Transform to ballot-date-level observations #' #' Transforms referendum-level observations to ones on the level of ballot date and jurisdiction via [nesting][tidyr::nest] of referendum-level columns. The #' individual values of all the referendums on a specific ballot date in a specific jurisdiction are preserved in a list column named `rfrnd_data`. #' #' @param data RDB referendum data as returned by [rfrnds()]. A data frame that at minimum contains the column `date`. #' @param cols_to_retain Additional non-standard columns to be preserved as top-level columns instead of being nested in the list column `rfrnd_data`. They #' mustn't vary within ballot-date-level observations. `r pkgsnip::param_lbl("tidy_select_support")` #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family transform #' @export #' #' @examples #' # standard RDB columns are retained as far as possible #' rdb::rfrnds(quiet = TRUE) |> #' rdb::as_ballot_dates() #' #' # non-standard columns must be explicitly specified in order to be retained #' data_rdb <- #' rdb::rfrnds(quiet = TRUE) |> #' rdb::add_world_regions() |> #' dplyr::mutate(region_custom = #' factor(x = dplyr::if_else(country_code == "CH", #' "Switzerland & Liechtenstein", #' un_region_tier_1_name), #' levels = c("Switzerland & Liechtenstein", #' levels(un_region_tier_1_name))) |> #' forcats::fct_relevel("Switzerland & Liechtenstein", #' after = 3L) |> #' forcats::fct_recode("rest of Europe" = "Europe")) #' #' data_rdb |> rdb::as_ballot_dates() |> colnames() #' data_rdb |> rdb::as_ballot_dates(cols_to_retain = region_custom) |> colnames() #' #' # non-standard columns to retain must actually be retainable #' try( #' data_rdb |> rdb::as_ballot_dates(cols_to_retain = title_en) #' ) as_ballot_dates <- function(data, cols_to_retain = NULL) { checkmate::assert_data_frame(data) defused_cols_to_retain <- rlang::enquo(cols_to_retain) ix_cols_to_retain <- tidyselect::eval_select(expr = defused_cols_to_retain, data = data) names_cols_to_retain <- names(ix_cols_to_retain) # ensure date col is present if (!("date" %in% colnames(data))) { cli::cli_abort("Unable to transform to ballot-date-level data since no {.var {date}} column is present in {.arg data}.") } # nest data cols_to_nest <- data |> colnames() |> setdiff(c(ballot_date_colnames, names_cols_to_retain)) result <- data |> tidyr::nest(rfrnd_data = any_of(cols_to_nest)) # ensure `cols_to_retain` don't vary within ballot dates n_rows_nested <- data |> dplyr::summarise(n = dplyr::n(), .by = any_of(ballot_date_colnames)) %$% n if (!identical(purrr::map_int(result$rfrnd_data, nrow), n_rows_nested)) { cli::cli_abort(paste0("Retaining the additional non-standard {cli::qty(length(ix_cols_to_retain))} column{?s} {.var {names_cols_to_retain}} while ", "converting to ballot-date-level observations is impossible because {?(some of)} {?this/these} column{?s} var{?ies/y} within ballot ", "dates.")) } result }
unnest_var
#' Unnest multi-value variable #' #' Unnests a multi-value variable of type list to long format. Multi-value variables can contain more than one value per observation and thus break with the #' [tidy-data convention](https://tidyr.tidyverse.org/articles/tidy-data.html). This function allows to conveniently expand `data` to contain a single `var` #' value per observation only, thereby increasing the number of observations (i.e. rows). #' #' @param data RDB referendum data as returned by [rfrnds()]. A data frame that at minimum contains the column specified in `var`. #' @param var `data` column to unnest. One of the multi-value variables: #' `r data_codebook |> dplyr::filter(is_multi_valued) %$% variable_name |> pal::wrap_chr(wrap = "\x60") |> pal::as_md_list()` #' #' `r pkgsnip::param_lbl("tidy_select_support")` #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family transform #' @family unnest #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb::unnest_var(topics_tier_2) unnest_var <- function(data, var) { # tidy selection and arg check checkmate::assert_data_frame(data) defused_var <- rlang::enquo(var) i_var <- tidyselect::eval_select(expr = defused_var, data = data) name_var <- names(i_var) n_var <- length(i_var) if (n_var > 1L) { cli::cli_abort("Only {.emph one} {.arg var} can be unnested at a time, but {.val {n_var}} were provided.") } name_var <- rlang::arg_match0(arg = name_var, arg_nm = "var", values = data_codebook |> dplyr::filter(is_multi_valued) %$% variable_name) name_var_unnested <- var_name_unnested(name_var) data |> tidyr::unnest_longer(col = all_of(name_var), values_to = name_var_unnested, keep_empty = TRUE, ptype = character()) |> dplyr::mutate(!!as.symbol(name_var_unnested) := factor(x = !!as.symbol(name_var_unnested), levels = var_vals(name_var), ordered = val_scale(name_var) %in% c("ordinal_ascending", "ordinal_descending"))) }
n_rfrnds
#' Count number of referendums #' #' Counts the number of RDB referendums, optionally by additional columns specified via `by`. #' #' @param data RDB referendum data as returned by [rfrnds()]. A data frame that at minimum contains the columns specified in `by` (if any). #' @param by Optional `data` column(s) to group by before counting number of referendums. `r pkgsnip::param_lbl("tidy_select_support")` #' @param complete_fcts Whether or not to complete the result with implicitly missing combinations of those columns specified in `by` which are of type factor. #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family transform #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb::n_rfrnds(by = level) #' #' # count ballot dates instead of referendums #' rdb::rfrnds(quiet = TRUE) |> #' rdb::as_ballot_dates() |> #' rdb::n_rfrnds(by = level) n_rfrnds <- function(data, by = NULL, complete_fcts = TRUE) { # arg checks checkmate::assert_data_frame(data) checkmate::assert_flag(complete_fcts) # tidy selection defused_by <- rlang::enquo(by) ix_by <- tidyselect::eval_select(expr = defused_by, data = data) names_by <- names(ix_by) result <- data |> dplyr::group_by(!!!rlang::syms(names_by)) |> dplyr::summarise(n = dplyr::n(), .groups = "drop") if (complete_fcts) { result %<>% tidyr::complete(!!!rlang::syms(names_by), fill = list(n = 0L)) } result }
n_rfrnds_per_period
#' Count number of referendums per period #' #' Counts the number of RDB referendums per desired period, optionally by additional columns specified via `by`. #' #' ```r #' ``` #' #' @inheritParams n_rfrnds #' @param data RDB referendum data as returned by [rfrnds()]. A data frame that at minimum contains the column specified in `period` or the column `date` (to #' compute the [period column][add_period]), plus the one(s) specified via `by` (if any). #' @param period Type of period to count referendums by. One of #' `r pal::fn_param_defaults(fn = add_period, param = "period") |> pal::wrap_chr("\x60") |> cli::ansi_collapse(sep2 = " or ", last = " or ")`. #' @param fill_gaps Whether or not to add zero-value rows to the result for `period` gaps in `data`. #' @param period_floor Lower `period` limit up to which gaps are filled. If `NULL`, the lower limit is set to the minimum of `period` present in `data`. Only #' relevant if `fill_gaps = TRUE` and `period` is set to a unique timespan type (`"year"`, `"decade"` or `"century"`). #' @param period_ceiling Upper `period` limit up to which gaps are filled. If `NULL`, the upper limit is set to the maximum of `period` present in `data`. Only #' relevant if `fill_gaps = TRUE` and `period` is set to a unique timespan type (`"year"`, `"decade"` or `"century"`). #' @param descending Whether to sort the resulting table by `period` in descending or in ascending order. #' #' @inherit add_period details #' @return `r pkgsnip::return_lbl("tibble")` #' @family transform #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb::n_rfrnds_per_period() #' #' rdb::rfrnds(quiet = TRUE) |> #' rdb::n_rfrnds_per_period(by = level) #' #' # without filling gaps #' rdb::rfrnds(quiet = TRUE) |> #' rdb::n_rfrnds_per_period(by = level, #' fill_gaps = FALSE) #' #' # per decade and by multiple columns #' rdb::rfrnds(quiet = TRUE) |> #' rdb::n_rfrnds_per_period(by = c(level, type), #' period = "decade") #' #' # count ballot dates instead of referendums #' rdb::rfrnds(quiet = TRUE) |> #' rdb::as_ballot_dates() |> #' rdb::n_rfrnds_per_period() n_rfrnds_per_period <- function(data, by = NULL, period = c("week", "month", "quarter", "year", "decade", "century"), fill_gaps = TRUE, period_floor = NULL, period_ceiling = NULL, descending = FALSE) { # arg checks checkmate::assert_data_frame(data) period <- rlang::arg_match(period) checkmate::assert_flag(fill_gaps) checkmate::assert_int(period_floor, null.ok = TRUE) checkmate::assert_int(period_ceiling, null.ok = TRUE) checkmate::assert_flag(descending) # tidy selection defused_by <- rlang::enquo(by) ix_by <- tidyselect::eval_select(expr = defused_by, data = data) names_by <- names(ix_by) # add period col if necessary if (!(period %in% colnames(data))) { data %<>% add_period(period = period) } result <- data |> dplyr::group_by(!!!rlang::syms(names_by), !!as.symbol(period)) |> dplyr::summarise(n = dplyr::n(), .groups = "drop") # fill gaps # (only if input data (and thus result) is non-empty since otherwise we can't infer a sensible period range for year/decade/century) if (fill_gaps && nrow(result)) { # define sensible min/max period vals is_recurring_period <- period %in% c("week", "month", "quarter") period_step <- switch(EXPR = period, century = 100L, decade = 10L, 1L) period_min <- period |> pal::when(is.null(period_floor) && !is_recurring_period ~ pal::safe_min(data[[.]]), !is_recurring_period ~ period_floor, ~ 1L) period_max <- period |> pal::when(is.null(period_ceiling) && !is_recurring_period ~ pal::safe_max(data[[.]]), 
!is_recurring_period ~ period_ceiling, . == "week" ~ 53L, . == "month" ~ 12L, . == "quarter" ~ 4L) period_seq <- seq(from = (period_min %/% period_step) * period_step, to = period_max, by = period_step) result %<>% # reduce to results `>= period_floor` and `<= period_ceiling` dplyr::filter(!!as.symbol(period) %in% period_seq) %>% # convert period col to fct, so `tidyr::complete()` knows the missing vals dplyr::mutate(!!as.symbol(period) := factor(x = !!as.symbol(period), levels = period_seq, ordered = TRUE)) %>% tidyr::complete(!!!rlang::syms(names_by), !!as.symbol(period), fill = list(n = 0L)) %>% # convert period col back to int dplyr::mutate(!!as.symbol(period) := as.integer(as.character(!!as.symbol(period)))) } result |> dplyr::arrange(if (descending) dplyr::desc(!!as.symbol(period)) else !!as.symbol(period)) }
prettify_col_names
#' Prettify referendum data column names #' #' Renames referendum data column names to be ready for publication. Useful e.g. to create tables or visualizations. #' #' Note that #' - column names unknown to this function are not changed. #' - column *labels* are [removed][labelled::remove_var_label] so they aren't inadvertently used instead of the column *names* (i.a. relevant for [gt][gt::gt] #' [>= 0.9.0](https://gt.rstudio.com/news/index.html#minor-improvements-and-bug-fixes-0-9-0)). #' #' @param data RDB referendum data as returned by [rfrnds()]. #' #' @return `r pkgsnip::return_lbl("tibble")` #' @family transform #' @family prettify #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb::prettify_col_names() prettify_col_names <- function(data) { data |> dplyr::rename_with(.cols = everything(), .fn = prettify_var_names) |> # we remove the var lbls so gt doesn't automatically pick them up instead of the column names # cf. https://gt.rstudio.com/news/index.html#minor-improvements-and-bug-fixes-0-9-0 labelled::remove_var_label() }
Functions to visualize the RDB referendum data (using plotly or ggplot2).
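A typical call chain feeds the referendum data directly into one of the plot functions. A sketch, assuming the RDB API is reachable and the columns `date` and `type` are present:

```r
# Share of referendum types per decade as a Plotly stacked area chart.
rdb::rfrnds(quiet = TRUE) |>
  rdb::plot_rfrnd_share_per_period(by = type,
                                   period = "decade")
```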
plot_rfrnd_share_per_period
#' Referendum share per period stacked area chart #' #' Creates a [Plotly stacked area chart](https://plotly.com/r/filled-area-plots/#stacked-area-chart-with-cumulative-values) that visualizes the share of #' referendums per period, grouped by another column. #' #' ```r #' ``` #' #' @param data RDB referendum data as returned by [rfrnds()]. A data frame that at minimum contains the column specified in `period` or the column `date` (to #' compute the [period column][add_period]), plus the column specified in `by`. #' @param by `data` column to group by before counting number of referendums. `r pkgsnip::param_lbl("tidy_select_support")` #' @param period Type of period to count referendums by. One of #' `r pal::fn_param_defaults(fn = add_period, param = "period") |> pal::wrap_chr("\x60") |> cli::ansi_collapse(sep2 = " or ", last = " or ")`. #' #' @return `r pkgsnip::param_lbl("plotly_obj")` #' @family visualize #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE, #' max_cache_age = "1 year") |> #' rdb::plot_rfrnd_share_per_period(by = "level") plot_rfrnd_share_per_period <- function(data, by, period = c("week", "month", "quarter", "year", "decade", "century")) { period <- rlang::arg_match(period) # add period col if necessary if (!(period %in% colnames(data))) { data %<>% add_period(period = period) } # tidy selection defused_by <- rlang::enquo(by) i_by <- tidyselect::eval_select(expr = defused_by, data = data) n_by <- length(i_by) name_by <- names(i_by) # ensure `x` is < 2 if (n_by > 1L) { cli::cli_abort("Only {.emph one} column can be specified in {.arg by}, but {.val {n_by}} were provided.") } data %>% # calculate freqs dplyr::group_by(!!as.symbol(name_by), !!as.symbol(period)) %>% dplyr::summarise(n = dplyr::n(), .groups = "drop") %>% # plot plot_share_per_period(x = name_by, period = period) }
plot_topic_segmentation
NOTES:
Unless either
a) topic assignment is limited to one single topic lineage per referendum (and the existing data is cleaned accordingly), or
b) topics are stored in a non-deduplicated way per tier (e.g. in a tibble as returned by `rdb::hierarchize_topics()`; see the sketch below),
we cannot cheaply create the visualization since we have to restore the full topic hierarchy for every row. For b), it'd probably still be slow for `method = "per_rfrnd"`.
If we decide on a), we must adapt the spec in issue #41 accordingly, and in the admin portal we should nudge towards assigning the topic on tier 3 whenever possible.
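To illustrate option b): the per-tier, non-deduplicated topic shape referred to above is the tibble currently returned by `rdb::hierarchize_topics()`. A sketch (the example topics are arbitrary valid topics taken from the documentation examples above):

```r
# One row per topic lineage, with columns topic_tier_1, topic_tier_2 and
# topic_tier_3 (NA where a lineage ends on a higher tier).
rdb::hierarchize_topics(c("foreign policy", "EU"))
```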
TODO:
Tweak hover labels (show different numbers etc.).
The function is computationally heavy (i.e. slow) because of the above. The `rdb::hierarchize_topics_fast()` call per row still takes too much time. Maybe there are other optimizations possible?
This alternative to the relevant first step is equally slow:
```r
data_plot <-
  data %>%
  dplyr::rowwise() %>%
  dplyr::mutate(topics_hierarchy =
                  rdb::hierarchize_topics_fast(unlist(topics_tier_1),
                                               unlist(topics_tier_2),
                                               unlist(topics_tier_3)) %>%
                  dplyr::mutate(value = 1 / nrow(.)) %>%
                  list()) %$%
  topics_hierarchy %>%
  purrr::map(identity) %>%
  purrr::list_rbind()
```
#' Topic segmentation sunburst chart #' #' Creates a [Plotly sunburst chart](https://plotly.com/r/sunburst-charts/) that visualizes the hierarchical segmentation of referendum topic occurences. #' #' A *topic lineage* is the hierarchical compound of a `topic_tier_1` and optionally a grandchild `topic_tier_3` and/or a child `topic_tier_2`. #' #' Note that topics can be assigned on any tier to referendums (i.e. in one case, a `topic_tier_1` plus a child `topic_tier_2` is assigned, and in another case #' only a `topic_tier_1` without any further child topic). #' #' Furthermore, it should be noted that not every `topic_tier_2` has potential child `topic_tier_3`s. See the [full topic hierarchy](`r url_codebook("topics")`) #' for details. #' #' @param data RDB referendum data as returned by [rfrnds()]. A data frame that at minimum contains the columns `topics_tier_1`, `topics_tier_2` and #' `topics_tier_3`. #' @param method Applied method to count the number of topic occurences. One of #' - **`"per_rfrnd"`**: All *referendums* have the same weight. For a referendum with n different topics of the same tier, every topic is counted 1/n. #' - **`"per_topic_lineage"`**: All *topic lineages* have the same weight. For a referendum with n different topics of the same tier, every topic is fully #' counted, meaning that e.g. a referendum with three different tier-3 topics has a tripled impact on the result compared to a referendum that only has a #' single one. Noticeably faster than `"per_rfrnd"`. #' - **`"naive"`**: Naive procedure which doesn't properly reflect topic proportions on tier 2 and 3. Based on the (wrong) assumptions that a) all referendums #' have the same number of topic lineages assigned and b) topics are not deduplicated per tier. By far the fastest method, though. #' #' @return `r pkgsnip::param_lbl("plotly_obj")` #' @family visualize #' @export #' #' @examples #' # count each referendum equally #' rdb::rfrnds(quiet = TRUE) |> #' rdb::plot_topic_segmentation(method = "per_rfrnd") #' #' # count each topic lineage equally #' rdb::rfrnds(quiet = TRUE) |> #' rdb::plot_topic_segmentation(method = "per_topic_lineage") #' #' # naive count (way faster, but with misleading proportions on tier 2 and 3) #' rdb::rfrnds(quiet = TRUE) |> #' rdb::plot_topic_segmentation(method = "naive") plot_topic_segmentation <- function(data, method = c("per_rfrnd", "per_topic_lineage", "naive")) { method <- rlang::arg_match(method) rlang::check_installed("plotly", reason = pal::reason_pkg_required()) is_naive <- method == "naive" # assemble necessary data structure if (is_naive) { ## naively data_plot <- dplyr::bind_rows( data$topics_tier_1 %>% topic_frequency(tier = 1L) %>% dplyr::mutate(parent_topic = ""), data$topics_tier_2 %>% topic_frequency(tier = 2L) %>% dplyr::mutate(parent_topic = purrr::map_chr(.x = as.character(topic), .f = infer_topics, tier = 1L)), data$topics_tier_3 %>% topic_frequency(tier = 3L) %>% dplyr::mutate(parent_topic = purrr::map_chr(.x = as.character(topic), .f = infer_topics, tier = 2L)) ) %>% dplyr::rename(value = n) } else { is_per_rfrnd <- method == "per_rfrnd" data_plot <- data %>% dplyr::select(starts_with("topics_tier_")) ### per rfrnd, i.e. 
in fractional numbers if (is_per_rfrnd) { data_plot %<>% purrr::pmap(~ hierarchize_topics_fast(unlist(..1), unlist(..2), unlist(..3)) %>% dplyr::mutate(value = 1.0 / nrow(.))) %>% purrr::list_rbind() ### per topic lineage } else { data_plot %<>% purrr::pmap(~ hierarchize_topics_fast(unlist(..1), unlist(..2), unlist(..3))) %>% purrr::list_rbind() %>% dplyr::mutate(value = 1.0) } data_plot <- dplyr::bind_rows( data_plot %>% dplyr::group_by(topic_tier_1) %>% dplyr::summarise(value = sum(value)) %>% dplyr::mutate(topic = topic_tier_1, parent_topic = "", value, .keep = "none"), data_plot %>% dplyr::group_by(topic_tier_2) %>% dplyr::summarise(value = sum(value)) %>% dplyr::mutate(topic = topic_tier_2, parent_topic = topic %>% purrr::map_chr(~ { if (is.na(.x)) { NA_character_ } else { infer_topics(topics = .x, tier = 1L) }}), value, .keep = "none"), data_plot %>% dplyr::group_by(topic_tier_3) %>% dplyr::summarise(value = sum(value)) %>% dplyr::mutate(topic = topic_tier_3, parent_topic = topic %>% purrr::map_chr(~ { if (is.na(.x)) { NA_character_ } else { infer_topics(topics = .x, tier = 2L) }}), value, .keep = "none") ) %>% dplyr::filter(!is.na(topic)) ### add NA rows filling the gaps data_plot %<>% dplyr::filter(parent_topic != "") %>% dplyr::group_by(parent_topic) %>% dplyr::summarise(value_total = sum(value), .groups = "drop") %>% dplyr::mutate(topic = "<i>not defined</i>", value = purrr::map2_dbl(.x = value_total, .y = parent_topic, .f = ~ data_plot %>% dplyr::filter(topic == .y) %$% value %>% checkmate::assert_number() %>% magrittr::subtract(.x)), parent_topic, .keep = "none") %>% dplyr::bind_rows(data_plot, .) %>% dplyr::mutate(id = ifelse(topic == "<i>not defined</i>", paste0("NA_", parent_topic), topic)) } # create plot plotly::plot_ly(data = data_plot, type = "sunburst", labels = ~topic, parents = ~parent_topic, ids = if (is_naive) ~topic else ~id, values = ~value, branchvalues = ifelse(is_naive, "remainder", "total"), insidetextorientation = "radial") }
plot_topic_share_per_period
#' Topic share per period stacked area chart #' #' Creates a [Plotly stacked area chart](https://plotly.com/r/filled-area-plots/#stacked-area-chart-with-cumulative-values) that visualizes the share of #' referendum topic occurences per period. #' #' ```r #' ``` #' #' @param data RDB referendum data as returned by [rfrnds()]. A data frame that at minimum contains the column `topics_tier_#` of the specified `tier`. #' @param tier Tier of the topics variable to plot. `1L`, `2L` or `3L`. #' @param period Type of period to count topics by. One of #' `r pal::fn_param_defaults(fn = add_period, param = "period") |> pal::wrap_chr("\x60") |> cli::ansi_collapse(sep2 = " or ", last = " or ")`. #' @param weight_by_n_rfrnds Whether or not to weight topic occurences by number of referendums. If `TRUE`, for a referendum with n different topics of the same #' `tier`, every topic is counted 1/n. #' #' @return `r pkgsnip::param_lbl("plotly_obj")` #' @family visualize #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb::plot_topic_share_per_period(period = "decade") plot_topic_share_per_period <- function(data, tier = 1L, period = c("week", "month", "quarter", "year", "decade", "century"), weight_by_n_rfrnds = TRUE) { checkmate::assert_int(tier, lower = 1L, upper = 3L) period <- rlang::arg_match(period) checkmate::assert_flag(weight_by_n_rfrnds) # add period col if necessary if (!(period %in% colnames(data))) { data %<>% add_period(period = period) } # ensure topics var is present var_name_topics <- glue::glue("topics_tier_{tier}") var_name_topic <- var_name_unnested(var_name_topics) if (!(var_name_topics %in% colnames(data))) { cli::cli_abort("Required column {.var {var_name_topics}} is missing from {.arg data}.") } data |> # add proper count var dplyr::mutate(count = if (weight_by_n_rfrnds) 1.0 / lengths(!!as.symbol(var_name_topics)) else 1.0) |> # unnest topics var unnest_var(var = var_name_topics) |> # calculate freqs dplyr::group_by(!!as.symbol(period), !!as.symbol(var_name_topic)) |> dplyr::summarise(n = sum(count), .groups = "drop") |> # plot plot_share_per_period(x = var_name_topic, period = period) }
ggplot_streamgraph
TODO:
Figure out optimal kernel density estimation bandwidth default.
Experiment with `plotly::ggplotly()` to create an interactive streamgraph (see the sketch below).
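Regarding the second TODO item, an untested sketch of the idea is to simply wrap the returned ggplot2 object:

```r
# Wrap the static streamgraph in plotly::ggplotly() to get an interactive
# version; whether ggstream's geoms survive the conversion is untested.
rdb::rfrnds(quiet = TRUE) |>
  rdb::ggplot_streamgraph(by = topics_tier_1,
                          period = "year") |>
  plotly::ggplotly()
```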
#' Period streamgraph #' #' Creates a ggplot2 [streamgraph](https://en.wikipedia.org/wiki/Streamgraph) based on [ggstream::geom_stream()] using the specified `period` as time #' resolution. #' #' @inheritParams n_rfrnds_per_period #' @param by `data` column to group by before counting number of referendums. #' @param stacking Stacking type. One of #' - `"mirror"` to stack absolute values symmetrically around the zero line on the x-axis. #' - `"ridge"` to stack absolute values from the zero line on the x-axis upwards. #' - `"proportional"` to stack relative values that add up to 100 %. #' @param bandwidth Kernel density estimation bandwidth. A numeric scalar. #' @param y_lim Optional Y axis range limit. Only relevant if `stacking` is one of `"mirror"` or `"ridge"`. The limit applies to the upper side if #' `stacking = "ridge"` and to both sides if `stacking = "mirror"`. A numeric scalar equal or greater than zero. #' @param color_palette Color palette function that when called with a single integer argument returns that many color codes. #' @param prune_legend Whether or not to drop `by` factor levels which don't occur in `data` from the legend. Only has an effect if `by` is of type factor. #' #' @return `r pkgsnip::return_lbl("ggplot2_obj")` #' @family visualize #' @export #' #' @examples #' data_rdb <- rdb::rfrnds(quiet = TRUE) #' #' rdb::ggplot_streamgraph(data = data_rdb, #' by = topics_tier_1, #' period = "year") #' #' # you can specify a different color palette #' rdb::ggplot_streamgraph(data = data_rdb, #' by = topics_tier_1, #' period = "year", #' color_palette = viridisLite::viridis) #' #' # by default, only factor levels which occur in data are included in the legend #' data_rdb |> #' dplyr::filter(country_code == "AT") |> #' rdb::ggplot_streamgraph(by = topics_tier_1, #' period = "decade") #' #' # but you can include *all* factor levels in the legend if you want to #' data_rdb |> #' dplyr::filter(country_code == "AT") |> #' rdb::ggplot_streamgraph(by = topics_tier_1, #' period = "decade", #' prune_legend = FALSE) ggplot_streamgraph <- function(data, by, period = c("week", "month", "quarter", "year", "decade", "century"), stacking = c("mirror", "ridge", "proportional"), bandwidth = 0.75, y_lim = NULL, color_palette = viridisLite::turbo, prune_legend = TRUE) { stacking <- rlang::arg_match(stacking) checkmate::assert_number(bandwidth) checkmate::assert_number(y_lim, lower = 0.0, finite = TRUE, null.ok = TRUE) checkmate::assert_function(color_palette) checkmate::assert_flag(prune_legend) rlang::check_installed("ggplot2", reason = pal::reason_pkg_required()) rlang::check_installed("ggstream", reason = pal::reason_pkg_required()) rlang::check_installed("scales", reason = pal::reason_pkg_required()) rlang::check_installed("viridisLite", reason = pal::reason_pkg_required()) ix_by <- tidyselect::eval_select(expr = rlang::enquo(by), data = data) names_by <- names(ix_by) name_by <- names_by n_by <- length(ix_by) if (n_by > 1L) { cli::cli_abort("Only {.emph one} data column can be specified in {.arg by}, but {.val {n_by}} were provided.") } # unnest list col if necessary if (is.list(data[[ix_by]])) { data %<>% unnest_var(var = tidyselect::all_of(names_by)) name_by <- var_name_unnested(names_by) } result <- n_rfrnds_per_period(data = data, period = period, by = !!as.symbol(name_by)) # create stable color mapping based on lvls of `by` # TODO: this probably could be removed and we could instead pass on a *discrete fill scale* fn (e.g. `\(...) 
viridis::scale_fill_viridis(..., discrete = T)`) # to `ggplot2::scale_fill_discrete(type = )` once the below mentioned issue #23 is resolved if (is.factor(data[[name_by]])) { vals_by <- levels(data[[name_by]]) } else { vals_by <- sort(unique(data[[name_by]])) } colors_by <- color_palette(length(vals_by)) names(colors_by) <- vals_by # we need to remove zero-n rows (plus the corresponding colors) since ggstream doesn't handle them properly # TODO: remove this workaround once [issue #23](https://github.com/davidsjoberg/ggstream/issues/23) is fixed ## if we prune the legend, we need to prune the fill colors, too (otherwise, colors aren't matched properly) if (prune_legend) { colors_by %<>% magrittr::extract(names(.) %in% unique(result[[name_by]][result$n > 0L])) } result %<>% dplyr::group_by(!!as.symbol(name_by)) %>% dplyr::group_modify(\(d, k) if (sum(d$n) > 0L) d else d[0L, ]) %>% dplyr::ungroup() result <- ggplot2::ggplot(data = result, mapping = ggplot2::aes(x = !!as.symbol(period), y = n, fill = !!as.symbol(name_by))) + ggstream::geom_stream(type = stacking, n_grid = 10000L, show.legend = TRUE, bw = bandwidth) + ggplot2::scale_fill_discrete(type = colors_by, name = prettify_var_names(name_by), drop = prune_legend) + ggplot2::xlab(ggplot2::element_blank()) + ggplot2::ylab(ggplot2::element_blank()) if (stacking == "ridge" && !is.null(y_lim)) { result <- result + ggplot2::coord_cartesian(ylim = c(0.0, y_lim), default = TRUE) } else if (stacking == "mirror") { if (!is.null(y_lim)) { result <- result + ggplot2::coord_cartesian(ylim = c(-y_lim, y_lim), default = TRUE) } # make y scale absolute in both directions result <- result + ggplot2::scale_y_continuous(labels = \(x) abs(x)) } else if (stacking == "proportional") { result <- result + ggplot2::scale_y_continuous(labels = scales::label_percent(suffix = "\u2009%")) } result }
Functions to create analyses in tabular form (using gt tables).
tbl_n_rfrnds
#' Tabulate number of referendums #' #' Creates a ready-to-print [gt][gt::gt] table with the number of referendums, optionally counted `by` up to three additional variables. #' #' The first variable specified in `by` will be reflected in additional rows in the resulting table, i.e. expand it vertically. The second and third variables #' will be reflected in additional columns, i.e. expand it horizontally. #' #' @inheritParams n_rfrnds #' @param by Up to three additional `data` columns to group by before counting number of referendums. `r pkgsnip::param_lbl("tidy_select_support")` #' @param n_rows Maximum number of rows to be included in the resulting table. All the rows exceeding that limit are replaced by a single row of ellipses. An #' integer scalar or `Inf` for an unlimited number of rows. #' @param order How to order the rows of the resulting table. One of #' - `"ascending"` to sort in ascending order by the number of referendums, #' - `"descending"` to sort in descending order by the number of referendums, or #' - `NULL` to leave the sorting unchanged. #' @param incl_row_head Whether or not to include a row heading with the [prettified][prettify_var_names] name of the first `by` variable. #' @param incl_col_head Whether or not to include column headings (in the table's [stub][gt::tab_stubhead]) with the [prettified][prettify_var_names] names of #' the second and third `by` variables. #' @param add_total_row Whether or not to add a summary row at the very end of the table containing column totals. If `NULL`, a total row is added only if #' at least one column is provided in `by`. #' @param add_total_col Whether or not to add a summary column at the very end of the table containing row totals. If `NULL`, a total column is added only if #' multiple columns are provided in `by`. #' @param lbl_total_row Label of the summary row containing column totals. Only relevant if `add_total_row = TRUE`. A character scalar. [gt::md()] or #' [gt::html()] can be used to format the label text. #' @param lbl_total_col Label of the summary column containing row totals. Only relevant if `add_total_col = TRUE`. A character scalar. [gt::md()] or #' [gt::html()] can be used to format the label text. 
#' #' @return `r pkgsnip::return_lbl("gt_obj")` #' @family tabulate #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb::tbl_n_rfrnds() #' #' # grouped by a single column #' rdb::rfrnds(quiet = TRUE) |> #' rdb::tbl_n_rfrnds(by = level) #' #' # grouped by two columns #' rdb::rfrnds(quiet = TRUE) |> #' rdb::tbl_n_rfrnds(by = c(type, level)) #' #' # grouped by three columns #' rdb::rfrnds(quiet = TRUE) |> #' rdb::tbl_n_rfrnds(by = c(country_name, level, type), #' n_rows = 10L, #' order = "descending") #' #' # count ballot dates instead of referendums #' rdb::rfrnds(quiet = TRUE) |> #' rdb::as_ballot_dates() |> #' rdb::tbl_n_rfrnds(by = c(country_name, level), #' n_rows = 10L, #' order = "descending") tbl_n_rfrnds <- function(data, by = NULL, complete_fcts = TRUE, n_rows = Inf, order = NULL, incl_row_head = TRUE, incl_col_head = TRUE, add_total_row = NULL, add_total_col = NULL, lbl_total_row = gt::md("**Total**"), lbl_total_col = gt::md("**Total**")) { if (!isTRUE(is.infinite(n_rows))) { checkmate::assert_int(n_rows, lower = 1L) } if (!is.null(order)) { rlang::arg_match0(arg = order, values = c("ascending", "descending")) } checkmate::assert_flag(incl_row_head) checkmate::assert_flag(incl_col_head) checkmate::assert_flag(add_total_row, null.ok = TRUE) checkmate::assert_flag(add_total_col, null.ok = TRUE) checkmate::assert_string(lbl_total_row) checkmate::assert_string(lbl_total_col) rlang::check_installed("gt", version = "0.9.0", reason = pal::reason_pkg_required()) ix_by <- tidyselect::eval_select(expr = rlang::enquo(by), data = data) n_by <- length(ix_by) has_by <- n_by > 0L has_by_rest <- n_by > 1L if (n_by > 3L) { cli::cli_abort("At most {.emph three} data columns can be specified in {.arg by}, but {.val {n_by}} were provided.") } if (is.null(add_total_row)) { add_total_row <- has_by } if (is.null(add_total_col)) { add_total_col <- has_by_rest } by_colname_1st <- names(ix_by[1L]) %|% ":no_by" by_colnames_rest <- names(ix_by[-1L]) result <- data |> n_rfrnds(by = {{ by }}, complete_fcts = complete_fcts) |> dplyr::mutate(dplyr::across(where(is.factor), ~ forcats::fct_na_value_to_level(f = .x, level = "N/A"))) |> pal::when(has_by_rest ~ tidyr::pivot_wider(data = ., names_from = by_colnames_rest, names_sort = TRUE, values_from = n), ~ .) |> dplyr::mutate(`:total` = rowSums(x = dplyr::pick(-any_of(by_colname_1st)), na.rm = TRUE), dplyr::across(everything(), ~ tidyr::replace_na(data = .x, replace = 0L)), # TODO: remove type conversion below once [issue #1305](https://github.com/rstudio/gt/issues/1305) is fixed dplyr::across(any_of(by_colname_1st), as.character)) |> pal::when(isTRUE(order == "descending") ~ dplyr::arrange(.data = ., -`:total`), isTRUE(order == "ascending") ~ dplyr::arrange(.data = ., `:total`), ~ .) |> pal::when(!add_total_col ~ dplyr::select(.data = ., -`:total`), ~ .) 
total_n <- result |> dplyr::select(-any_of(by_colname_1st)) |> purrr::map_int(\(x) sum(x, na.rm = TRUE)) chop_rows <- n_rows < nrow(result) if (chop_rows) { result %<>% utils::head(n = n_rows) %>% # add placeholder/ellipsis row dplyr::mutate(dplyr::across(everything(), as.character)) %>% rbind("\u2026") } # NOTE: if we chop rows (and have multiple n cols), it's impossible to create our total row using `gt::grand_summary_rows()` since its `fns` arg only # receives column content, no metadata; thus we create the total row manually if (add_total_row && chop_rows) { result %<>% rbind(c(lbl_total_row, total_n)) } result %<>% gt::gt(rowname_col = ifelse(has_by, by_colname_1st, "rowname"), process_md = TRUE) if (incl_row_head && has_by) { result %<>% gt::tab_row_group(label = by_colname_1st |> prettify_var_names() |> pal::wrap_chr(wrap = "*") |> gt::md(), rows = tidyselect::everything(), id = by_colname_1st) } if (add_total_col) { result %<>% gt::cols_label(`:total` = lbl_total_col) } if (add_total_row) { if (chop_rows) { result %<>% gt::tab_style(style = gt::cell_borders(sides = "top", color = "#D3D3D3", style = "double", weight = gt::px(6L)), locations = list(gt::cells_body(rows = n_rows + 2L), gt::cells_stub(rows = n_rows + 2L))) } else { result %<>% gt::grand_summary_rows(fns = list(id = "total", label = "DUMMY") ~ sum(., na.rm = TRUE), fmt = ~ gt::fmt_integer(., sep_mark = "")) # TODO: remove this workaround and replace `"DUMMY"` with `lbl_total_row` above as soon as [#1295](https://github.com/rstudio/gt/issues/1295) # is fixed. result$`_summary`[[1L]]$fns$total$label <- lbl_total_row } } if (incl_col_head && has_by_rest) { result %<>% gt::tab_stubhead(label = by_colnames_rest |> prettify_var_names() |> pal::wrap_chr(wrap = "*") |> paste0(collapse = "<br><br>") |> gt::md()) %>% gt::tab_style(style = gt::cell_text(align = "right", v_align = "middle"), locations = gt::cells_stubhead()) } result |> gt::tab_spanner_delim(delim = "_", split = "last") |> # right-align cols; required since they're of type chr if we chopped rows gt::cols_align(align = "right", columns = -tidyselect::any_of(by_colname_1st)) |> # hide table header if there are less than two `by` cols pal::when(!has_by_rest ~ gt::tab_options(data = ., column_labels.hidden = TRUE), ~ .) }
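A further, purely illustrative sketch of the summary-row/-column options of `tbl_n_rfrnds()`, assuming the same `rdb::rfrnds()` data as in the examples above:

```r
# custom total-row label and no total column
rdb::rfrnds(quiet = TRUE) |>
  rdb::tbl_n_rfrnds(by = c(type, level),
                    add_total_col = FALSE,
                    lbl_total_row = gt::md("**All referendums**"))
```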
tbl_n_rfrnds_per_period
#' Tabulate number of referendums per period #' #' Creates a ready-to-print [gt][gt::gt] table with the number of referendums per period, optionally counted `by` up to two additional columns. #' #' @inheritParams n_rfrnds_per_period #' @inheritParams tbl_n_rfrnds #' @param by Up to two additional `data` columns to group by before counting number of referendums. `r pkgsnip::param_lbl("tidy_select_support")` #' @param squeeze_zero_rows Whether or not to compress consecutive zero-sum rows into single period span rows. #' @param add_total_col Whether or not to add a summary column at the very end of the table containing row totals. If `NULL`, a total column is added only if #' `by` is non-empty. #' #' @return `r pkgsnip::return_lbl("gt_obj")` #' @family tabulate #' @export #' #' @examples #' rdb::rfrnds(quiet = TRUE) |> #' rdb::tbl_n_rfrnds_per_period(period = "decade") #' #' # grouped by a single additional column #' rdb::rfrnds(quiet = TRUE) |> #' rdb::tbl_n_rfrnds_per_period(by = level, #' period = "decade") #' #' # grouped by two additional columns #' rdb::rfrnds(quiet = TRUE) |> #' rdb::tbl_n_rfrnds_per_period(by = c(level, type), #' period = "decade") #' #' # count ballot dates instead of referendums #' rdb::rfrnds(quiet = TRUE) |> #' rdb::as_ballot_dates() |> #' rdb::tbl_n_rfrnds_per_period(period = "decade") tbl_n_rfrnds_per_period <- function(data, by = NULL, period = c("week", "month", "quarter", "year", "decade", "century"), fill_gaps = TRUE, period_floor = NULL, period_ceiling = NULL, squeeze_zero_rows = TRUE, descending = TRUE, add_total_row = TRUE, add_total_col = NULL, lbl_total_row = gt::md("**Total**"), lbl_total_col = lbl_total_row) { period <- rlang::arg_match(period) checkmate::assert_flag(squeeze_zero_rows) checkmate::assert_flag(add_total_row) checkmate::assert_flag(add_total_col, null.ok = TRUE) checkmate::assert_string(lbl_total_row) checkmate::assert_string(lbl_total_col) rlang::check_installed("gt", version = "0.9.0", reason = pal::reason_pkg_required()) ix_by <- tidyselect::eval_select(expr = rlang::enquo(by), data = data) n_by <- length(ix_by) has_by <- n_by > 0L if (n_by > 2L) { cli::cli_abort("At most {.emph two} additional data columns can be specified in {.arg by}, but {.val {n_by}} were provided.") } if (is.null(add_total_col)) { add_total_col <- has_by } by_names_print <- ifelse(has_by, names(ix_by) %>% prettify_var_names() %>% pal::wrap_chr(wrap = "*") %>% paste0(collapse = "<br><br>"), "") data_to_plot <- data %>% n_rfrnds_per_period(period = period, by = {{ by }}, fill_gaps = fill_gaps, period_floor = period_floor, period_ceiling = period_ceiling, descending = descending) %>% dplyr::mutate(dplyr::across(where(is.factor), ~ forcats::fct_na_value_to_level(f = .x, level = "N/A"))) %>% pal::when(has_by ~ tidyr::pivot_wider(data = ., names_from = {{ by }}, names_sort = TRUE, values_from = n), ~ .) %>% pal::when(add_total_col ~ dplyr::mutate(.data = ., `:total` = rowSums(x = dplyr::pick(-!!as.symbol(period)), na.rm = TRUE)), ~ .)
%>% dplyr::mutate(dplyr::across(everything(), ~ tidyr::replace_na(data = .x, replace = 0L)), # TODO: remove type conversion below once [issue #1305](https://github.com/rstudio/gt/issues/1305) is fixed dplyr::across(all_of(period), as.character)) # squeeze consecutive all-0 rows into single row if requested ix <- integer() ix_rm <- integer() if (squeeze_zero_rows) { for (i in pal::safe_seq_len(nrow(data_to_plot))) { if (data_to_plot %>% dplyr::select(-any_of(c(period, ":total"))) %>% magrittr::extract(i, ) %>% sum() %>% magrittr::equals(0L)) { ix %<>% c(i) } else { if (length(ix) > 1L) { data_to_plot[ix[1L], period] <- paste0(data_to_plot[ix[length(ix)], period], "\u2013", data_to_plot[ix[1L], period]) ix_rm %<>% c(ix[-1L]) } ix <- integer() } } if (length(ix) > 1L) { data_to_plot[ix[1L], period] <- paste0(data_to_plot[ix[length(ix)], period], "\u2013", data_to_plot[ix[1L], period]) ix_rm %<>% c(ix[-1L]) } } # add "s" to decade/century period values if (period %in% c("decade", "century")) { data_to_plot[[period]] %<>% stringr::str_replace_all(pattern = "(\\d+)", replacement = "\\1s") } data_to_plot %>% dplyr::filter(!(dplyr::row_number() %in% ix_rm)) %>% gt::gt(rowname_col = period) %>% pal::when(add_total_col ~ gt::cols_label(.data = ., `:total` = lbl_total_col), ~ .) %>% pal::when(add_total_row ~ { result <- gt::grand_summary_rows(data = ., fns = list(label = "DUMMY", id = "total") ~ sum(., na.rm = TRUE), fmt = ~ gt::fmt_integer(., sep_mark = "")) # TODO: remove this workaround and replace `"DUMMY"` with `lbl_total_row` above as soon as [#1295](https://github.com/rstudio/gt/issues/1295) is fixed. result$`_summary`[[1L]]$fns$total$label <- lbl_total_row result }, ~ .) %>% gt::tab_spanner_delim(delim = "_", split = "last") %>% gt::tab_stubhead(label = gt::md(by_names_print)) %>% gt::tab_style(style = gt::cell_text(align = "right", v_align = "middle"), locations = gt::cells_stubhead()) }
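Another illustrative sketch (not from the package's own examples) that keeps zero-sum rows and sorts periods in ascending order, assuming the same data as above:

```r
rdb::rfrnds(quiet = TRUE) |>
  rdb::tbl_n_rfrnds_per_period(period = "century",
                               descending = FALSE,
                               squeeze_zero_rows = FALSE)
```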
The `list.php` endpoint supports the following filtering URL parameters, which can be combined arbitrarily (a raw query sketch follows below):

- `sense`: the sorting order of the results; one of `desc` (descending) or `asc` (ascending)
- `area`: searches the combined `territory_name_de` field; case-insensitive; partial (but not fuzzy) matching is supported
- `topic`: searches the `title_de` field; case-insensitive; partial (but not fuzzy) matching is supported
- `first`/`last`: the minimum and maximum `date` years to include; an integer or `NaN` for no limitation; strangely, `last` and `first` can be interchanged, i.e. `last` can be lower than `first` (the results stay the same)

In addition, there is a `mode` URL parameter for "special views" with the following possible values (the filtering parameters are usually ignored when `mode` is provided):

- `allareas`: list all territories (`territory_name_de`) by country (`country_name_de`)
- `alltopics`: list all `title_de` values together with the number of referendums
- `allrefs`: list all referendums; supports the `sense` parameter
- `moddate`: list all referendums by `date_last_edited`
- `random`: list 5 randomly selected referendums
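To make the parameters above concrete, here's a minimal sketch of a raw `list.php` query using httr, mirroring what the scraping functions below do internally. The concrete query values ("Autobahn", the year range) are purely illustrative assumptions.

```r
library(httr)

# example: all referendums whose title_de matches "Autobahn",
# restricted to the years 1950-2000, sorted in descending order
resp <- RETRY(verb = "GET",
              url = "https://sudd.ch/list.php",
              query = list(topic = "Autobahn",
                           first = 1950,
                           last = 2000,
                           sense = "desc"),
              times = 3L)

# the endpoint returns an HTML page that has to be scraped,
# cf. the list_sudd_*() functions below
html <- xml2::read_html(resp)
```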
Functions to scrape the data from the sudd.ch database.

list_sudd_territories
NOTES:
A returned `territory_name_de` can refer to a country name, a `subnational_entity_name_de`, or a `municipality_de`
. There seems to be no semantic data included in any form which we could derive this from.#' List referendum territories from [sudd.ch](https://sudd.ch/) #' #' Lists [all referendum territories from sudd.ch](https://sudd.ch/list.php?mode=allareas), which means each `country_name_de` together with all the associated #' `territory_name_de`, their search URL and their number of occurrences. #' #' Note that the values in the `territory_name_de` column returned by this function can differ from those in the `territory_name_de` column of #' [sudd_rfrnds()] and [list_sudd_rfrnds()]. The latter is often more extensive and usually includes the `country_name_de` (in parentheses) for #' subnational referendums. #' #' @inheritSection sudd_rfrnds About [sudd.ch](https://sudd.ch/) #' @return `r pkgsnip::return_lbl("tibble")` #' @family sudd #' @export #' #' @examples #' rdb::list_sudd_territories() list_sudd_territories <- function() { rows <- httr::RETRY(verb = "GET", url = url_sudd("list.php"), query = list(mode = "allareas"), times = 3L) %>% xml2::read_html() %>% rvest::html_elements(css = "main table tr") %>% purrr::map(rvest::html_elements, css = "td") col_1 <- rows %>% purrr::map(magrittr::extract2, 1L) col_2 <- rows %>% purrr::map(\(x) if (length(x) > 1L) x[[2L]] else xml2::as_xml_document(list())) col_3 <- rows %>% purrr::map(\(x) if (length(x) > 2L) x[[3L]] else xml2::as_xml_document(list())) tibble::tibble(country_name_de = purrr::map_chr(col_1, rvest::html_text), territory_name_de = purrr::map_chr(col_2, rvest::html_text), filter_url = purrr::map_chr(col_2, ~ { if (length(.x) > 0L) { .x %>% rvest::html_element(css = "a") %>% rvest::html_attr(name = "href") %>% url_sudd() } else { NA_character_ } }), n = purrr::map_chr(col_3, rvest::html_text)) %>% dplyr::mutate(country_name_de = dplyr::if_else(stringr::str_detect(string = country_name_de, pattern = "^\\s*$"), NA_character_, country_name_de)) %>% tidyr::fill(country_name_de, .direction = "down") %>% dplyr::filter(!dplyr::if_all(c(territory_name_de, n), is.na)) }
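As a small follow-up sketch (not part of the package's own examples), counting how many distinct territories are listed per country; it assumes the column names documented above (`country_name_de`, `territory_name_de`):

```r
rdb::list_sudd_territories() |>
  dplyr::count(country_name_de, name = "n_territories", sort = TRUE)
```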
list_sudd_titles
#' List referendum titles from [sudd.ch](https://sudd.ch/) #' #' Lists [all referendum titles from sudd.ch](https://sudd.ch/list.php?mode=alltopics), together with their search URLs and number of occurrences. #' #' @inheritSection sudd_rfrnds About [sudd.ch](https://sudd.ch/) #' @return `r pkgsnip::return_lbl("tibble")` #' @family sudd #' @export #' #' @examples #' rdb::list_sudd_titles() |> dplyr::filter(n > 1) list_sudd_titles <- function() { rows <- httr::RETRY(verb = "GET", url = url_sudd("list.php"), query = list(mode = "alltopics"), times = 3L) %>% xml2::read_html() %>% rvest::html_elements(css = "main table tr") %>% purrr::map(rvest::html_elements, css = "td") col_1 <- rows %>% purrr::map(magrittr::extract2, 1L) col_2 <- rows %>% purrr::map(magrittr::extract2, 2L) tibble::tibble(title_de = col_1 %>% purrr::map_chr(rvest::html_text), filter_url = col_1 %>% purrr::map_chr(~ .x %>% rvest::html_element(css = "a") %>% rvest::html_attr(name = "href") %>% url_sudd()), n = col_2 %>% purrr::map_chr(rvest::html_text)) }
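The `n` column is scraped via `purrr::map_chr()` and therefore comes back as character. A small illustrative sketch that converts it before sorting titles by frequency:

```r
rdb::list_sudd_titles() |>
  dplyr::mutate(n = as.integer(n)) |>
  dplyr::arrange(dplyr::desc(n))
```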
list_sudd_rfrnds
#' List referendum data from [sudd.ch](https://sudd.ch/) #' #' @description #' Lists the referendum data from [sudd.ch](https://sudd.ch/) in various ways its [`list.php`](https://sudd.ch/list.php) endpoint allows. The output of this #' function can be directly fed to [sudd_rfrnds()]. #' #' @inheritSection sudd_rfrnds About [sudd.ch](https://sudd.ch/) #' @param mode The listing mode. One of #' - `"by_date"`: Lists [all referendums in the sudd.ch database **by `date`**](https://sudd.ch/list.php?mode=allrefs), together with their `id_sudd`, #' `country_code`, `territory_name_de` and `title_de`. Specifying the sorting order of the results via the `order` parameter is supported. #' - `"by_mod_date"`: Lists [all referendums in the sudd.ch database **by `date_last_edited`**](https://sudd.ch/list.php?mode=moddate), together with their #' `id_sudd`, `country_code`, `territory_name_de`, `date` and `title_de`. #' - `"filter"`: Allows to provide additional arguments (via the `filter` parameter) that limit the results accordingly. Specifying the sorting order of the #' results via the `order` parameter is supported. #' - `"random"`: Lists the `id_sudd` of five randomly selected referendums, together with their `country_code`. #' @param order The sorting order of the results. Only relevant if `mode` is either `"by_date"` or `"filter"`. #' @param filter A list with valid filtering arguments. Only relevant if `mode = "filter"`. The supported filtering arguments include #' - `territory_name_de`: A string that must be (partially) matched by the referendums' `territory_name_de`. Matching is case-insensitive and no [fuzzy #' matching](https://en.wikipedia.org/wiki/Approximate_string_matching) is supported. #' - `title_de`: A string that must be (partially) matched by the referendums' `title_de`. Matching is case-insensitive and no [fuzzy #' matching](https://en.wikipedia.org/wiki/Approximate_string_matching) is supported. #' - `year_min`: The lower year limit of the referendums' `date`. A positive integer. #' - `year_max`: The upper year limit of the referendums' `date`. A positive integer. #' @param use_cache `r pkgsnip::param_lbl("use_cache")` #' @param max_cache_age `r pkgsnip::param_lbl("max_cache_age")` #' @param quiet `r pkgsnip::param_lbl("quiet")` #' #' @return A [tibble][tibble::tbl_df] containing at least an `id_sudd` column. #' @family sudd #' @export #' #' @examples #' # list all referendums by modification date (takes a while) #' \dontrun{ #' rdb::list_sudd_rfrnds(mode = "by_mod_date")} #' #' # list all referendums whose title matches "AHV" #' rdb::list_sudd_rfrnds(mode = "filter", #' filter = list(title_de = "AHV"), #' quiet = TRUE) #' #' # get sudd.ch referendum data from all referendums from 2020 onwards #' rdb::list_sudd_rfrnds(mode = "filter", #' filter = list(year_min = 2020), #' quiet = TRUE) |> #' rdb::sudd_rfrnds(quiet = TRUE) #' #' # get sudd.ch referendum data from five randomly picked referendums #' rdb::list_sudd_rfrnds(mode = "random", #' quiet = TRUE) |> #' rdb::sudd_rfrnds(quiet = TRUE) list_sudd_rfrnds <- function(mode = c("by_date", "by_mod_date", "filter", "random"), order = c("ascending", "descending"), filter = list(territory_name_de = NULL, title_de = NULL, year_min = NULL, year_max = NULL), use_cache = TRUE, max_cache_age = "1 week", quiet = FALSE) { # check args mode <- rlang::arg_match(mode) order <- rlang::arg_match(order) %>% dplyr::case_match(.x = ., "ascending" ~ "asc", "descending" ~ "desc", .default = .) 
checkmate::assert_flag(quiet) # do not cache `mode = "random"` if (mode == "random") use_cache <- FALSE pkgpins::with_cache(expr = { checkmate::assert_list(filter, names = "unique") checkmate::assert_subset(names(filter), choices = c("territory_name_de", "title_de", "year_min", "year_max")) checkmate::assert_string(filter$territory_name_de, null.ok = TRUE) checkmate::assert_string(filter$title_de, null.ok = TRUE) has_filter <- filter %>% purrr::map_lgl(is.null) %>% all() %>% magrittr::not() if (has_filter && mode != "filter") { cli::cli_alert_warning("{.arg filter} is ignored because {.arg mode} is set to {.val {mode}}.") } filter$year_min <- checkmate::assert_int(filter$year_min, lower = sudd_min_year, upper = sudd_max_year, null.ok = TRUE, coerce = TRUE) filter$year_max <- checkmate::assert_int(filter$year_max, lower = filter$year_min %||% sudd_min_year, upper = sudd_max_year, null.ok = TRUE, coerce = TRUE) is_year_missing <- purrr::map_lgl(c(filter$year_min, filter$year_max), is.null) %>% { any(.) && !all(.) } if (is_year_missing) { filter$year_min <- filter$year_min %||% sudd_min_year filter$year_max <- filter$year_max %||% sudd_max_year } if (mode == "filter" && all(purrr::map_lgl(filter, is.null))) { cli::cli_abort("At least one filtering argument must be provided in {.arg filter} when {.arg mode = \"filter\"}.") } names(filter) %<>% dplyr::case_match(.x = ., "territory_name_de" ~ "area", "title_de" ~ "topic", "year_min" ~ "first", "year_max" ~ "last", .default = .) # assemble query params query <- c(list(mode = mode %>% dplyr::case_match(.x = ., "by_date" ~ "allrefs", "by_mod_date" ~ "moddate", .default = .))[mode != "filter"], list(sense = order)[mode %in% c("by_date", "filter")], filter[mode == "filter"]) # retrieve and parse data if (!quiet) { status_msg <- "Fetching raw HTML data from {.url sudd.ch}..." cli::cli_progress_step(msg = status_msg, msg_done = paste(status_msg, "done"), msg_failed = paste(status_msg, "failed")) } html <- httr::RETRY(verb = "GET", url = url_sudd("list.php"), query = query, times = 3L) %>% xml2::read_html() if (!quiet) { status_msg <- "Parsing and tidying raw HTML data..." cli::cli_progress_step(msg = status_msg, msg_done = paste(status_msg, "done"), msg_failed = paste(status_msg, "failed")) } if (mode == "random") { result <- html %>% rvest::html_elements(css = "main ul li a") %>% rvest::html_attr(name = "href") %>% stringr::str_extract(pattern = "(?<=id=)[\\w\\d]+") %>% tibble::tibble(id_sudd = .) 
} else { rows <- html %>% rvest::html_element(css = "main table") %>% rvest::html_children() %>% purrr::map(rvest::html_elements, css = "td") col_1 <- rows %>% purrr::map(magrittr::extract2, 1L) col_2 <- rows %>% purrr::map(magrittr::extract2, 2L) col_3 <- rows %>% purrr::map(magrittr::extract2, 3L) col_4 <- rows %>% purrr::map(magrittr::extract2, 4L) result <- tibble::tibble(id_sudd = purrr::map_chr(col_4, \(x) { x %>% rvest::html_element(css = "a") %>% rvest::html_attr(name = "href") %>% stringr::str_extract(pattern = "(?<=[\\?&]id=)[\\w\\d]+") }), territory_name_de = purrr::map_chr(col_2, rvest::html_text), !!!(col_3 %>% purrr::map_chr(rvest::html_text) %>% parse_sudd_date_de()), title_de = purrr::map_chr(col_4, rvest::html_text)) %>% # add `date` dplyr::mutate(date = clock::date_build(year = year, month = month, day = day, invalid = "NA")) %>% dplyr::relocate(date, .before = year) if (mode == "by_mod_date") { result %<>% tibble::add_column(date_last_edited = col_1 %>% purrr::map_chr(~ .x %>% rvest::html_element(css = "time") %>% rvest::html_attr(name = "datetime")) %>% clock::date_parse()) %>% tidyr::fill(date_last_edited, .direction = "down") } } result %>% # derive vars from `id_sudd` dplyr::bind_cols(.$id_sudd |> purrr::map(parse_sudd_id) |> purrr::list_rbind()) %>% dplyr::select(id_sudd, starts_with("country_"), is_former_country, starts_with("subnational_entity_"), everything()) }, pkg = this_pkg, from_fn = "list_sudd_rfrnds", mode, order, filter, use_cache = use_cache, max_cache_age = max_cache_age) }
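Another sketch of the `"filter"` mode, combining a territory pattern with a year range; the concrete values ("Tirol", 1900 and 1950) are illustrative assumptions only:

```r
rdb::list_sudd_rfrnds(mode = "filter",
                      filter = list(territory_name_de = "Tirol",
                                    year_min = 1900,
                                    year_max = 1950),
                      quiet = TRUE)
```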
sudd_rfrnds
NOTES:
The `@importFrom rlang :=` directive is there to get rid of the R CMD check note "no visible global function definition for ‘:=’"; see this SO
question.#' Get referendum data from [sudd.ch](https://sudd.ch/) #' #' Downloads referendum data from [sudd.ch](https://sudd.ch/). #' #' # About [sudd.ch](https://sudd.ch/) #' #' **sudd** stands for _**Su**chmaschine für **d**irekte **D**emokratie_ (German) and is operated by [Beat Müller](mailto:beat@sudd.ch). Its database content is #' licensed under [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA #' 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/). #' #' @param ids_sudd The referendum identifiers assigned by [sudd.ch](https://sudd.ch/). Either as a character vector or a data frame containing a column #' `id_sudd`. `NA`s are ignored. #' @param use_cache `r pkgsnip::param_lbl("use_cache")` #' @param max_cache_age `r pkgsnip::param_lbl("max_cache_age")` #' @param quiet `r pkgsnip::param_lbl("quiet")` #' #' @return `r pkgsnip::return_lbl("tibble")` The column names are aligned with those of [rfrnds()] as closely as possible. #' @family sudd #' @importFrom rlang := #' @export #' #' @examples #' rdb::rfrnd(id = "5bbc045192a21351232e596f")$id_sudd |> rdb::sudd_rfrnds() #' #' rdb::rfrnds(quiet = TRUE) |> #' dplyr::filter(country_code == "AT") |> #' rdb::sudd_rfrnds() sudd_rfrnds <- function(ids_sudd, use_cache = TRUE, max_cache_age = "1 week", quiet = FALSE) { checkmate::assert_flag(quiet) if (purrr::pluck_depth(ids_sudd) > 1L) { if (!("id_sudd" %in% colnames(ids_sudd))) { cli::cli_abort(paste0("{.arg ids_sudd} must be either a character vector of valid sudd.ch referendum identifiers or a data frame with a column of such", " named {.var id_sudd}.")) } ids_sudd <- ids_sudd$id_sudd } ids_sudd <- checkmate::assert_character(ids_sudd, all.missing = FALSE) %>% magrittr::extract(!is.na(.)) pkgpins::with_cache(expr = { ids_sudd %>% purrr::map(.f = \(x) sudd_rfrnd(x), .progress = if (quiet) FALSE else "Scraping referendum data from sudd.ch") %>% purrr::list_rbind() %>% # properly parse `date` dplyr::bind_cols(.$date |> purrr::map(parse_sudd_date) |> purrr::list_rbind()) %>% dplyr::mutate(date = clock::date_build(year = year, month = month, day = day, invalid = "NA")) %>% # add `id_sudd` tibble::add_column(id_sudd = ids_sudd, .before = 1L) %>% # derive vars from `id_sudd` dplyr::bind_cols(ids_sudd |> purrr::map(parse_sudd_id) |> purrr::list_rbind()) %>% # reorder columns dplyr::relocate(id_sudd, country_code, country_name, is_former_country, subnational_entity_code, territory_name_de, any_of(c("territory_type_de", "date", "year", "month", "day", "title_de", "question_type_de", "types", "result_de", "result_status_de", "adoption_requirements_de", "electorate_total", "electorate_abroad", "polling_cards", "votes_total", "votes_empty", "votes_void", "votes_invalid", "votes_valid", "votes_yes", "votes_no")), matches("^votes_(option_\\d+|(counter_)?proposal)(_(total|empty|void|invalid|valid|yes|no))?$"), any_of("votes_option_none"), matches("^$"), any_of(c("subterritories", "subterritories_yes", "subterritories_no", "files", "remarks", "sources", "ids_sudd_simultaneous", "date_last_edited"))) }, pkg = this_pkg, from_fn = "sudd_rfrnds", ids_sudd, use_cache = use_cache, max_cache_age = max_cache_age) }
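A usage sketch for the caching parameters of `sudd_rfrnds()`; it assumes `max_cache_age` accepts duration strings of the same kind as its `"1 week"` default:

```r
# force a fresh scrape, bypassing the cache
rdb::rfrnds(quiet = TRUE) |>
  dplyr::filter(country_code == "AT") |>
  rdb::sudd_rfrnds(use_cache = FALSE)

# or accept cached results up to one day old
rdb::rfrnds(quiet = TRUE) |>
  dplyr::filter(country_code == "AT") |>
  rdb::sudd_rfrnds(max_cache_age = "1 day")
```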
is_online
#' Test RDB API availability #' #' Checks if the RDB API server is online and operational. #' #' @inheritParams url_api #' @param quiet Whether or not to suppress printing a warning in case the API is unavailable. #' #' @return A logical scalar. #' @family api_status #' @export is_online <- function(use_testing_server = pal::pkg_config_val(key = "use_testing_server", pkg = this_pkg), quiet = FALSE) { checkmate::assert_flag(quiet) result <- FALSE response <- tryCatch(expr = httr::RETRY(verb = "GET", url = url_api("health", .use_testing_server = use_testing_server), times = 3L), error = function(e) e$message) if (inherits(response, "response")) { response %<>% # ensure we actually got a plaintext response pal::assert_mime_type(mime_type = "text/plain", msg_suffix = mime_error_suffix) %>% # parse response httr::content(as = "text", encoding = "UTF-8") if (response == "OK") { result <- TRUE } else if (!quiet) { cli::cli_alert_warning("RDB API server responded with: {.val {response}}") } } else { cli::cli_alert_warning(response) } result }
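A minimal sketch of guarding a download with the availability check:

```r
if (rdb::is_online()) {
  data_rdb <- rdb::rfrnds(quiet = TRUE)
} else {
  message("The RDB API is currently unavailable, skipping download.")
}
```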
pkg_config
#' `r this_pkg` package configuration metadata #' #' A [tibble][tibble::tbl_df] with metadata of all possible `r this_pkg` package configuration options. See [pal::pkg_config_val()] for more information. #' #' @format `r pkgsnip::return_lbl("tibble_cols", cols = colnames(pkg_config))` #' @export #' #' @examples #' rdb::pkg_config "pkg_config"
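To inspect a single option, one could simply filter the exported metadata tibble. The column name `key` used below is an assumption, not taken from the package; check `colnames(rdb::pkg_config)` for the actual structure.

```r
# all configuration options
rdb::pkg_config

# single option (assumes a column named `key`; adapt as needed)
rdb::pkg_config |>
  dplyr::filter(key == "use_testing_server")
```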
data_rdb_aargau <-
  readxl::read_excel("/home/salim/Arbeit/ZDA/Git/zdaarau/rpkgs/rdb/19-AG-Abstimmungen-1888-1971.xlsx") %>%
  dplyr::mutate(date = clock::date_parse(x = Datum,
                                         format = "%d.%m.%Y"),
                title_en = deeplr::translate2(text = Vorlage,
                                              auth_key = Sys.getenv("DEEPL_TOKEN"),
                                              target_lang = "EN",
                                              source_lang = "DE",
                                              preserve_formatting = TRUE)) %>%
  dplyr::select(date,
                title_de = Vorlage,
                title_en,
                electorate_total = Stimmberechtigte,
                votes_total = "Eingegangene Stimmzettel",
                votes_yes = "Anzahl JA-Stimmen",
                votes_no = "Anzahl NEIN-Stimmen",
                votes_empty = Leer)