# Several functions which are not exposed to make life easier internally
table_to_long <- function(data){
#' Convert wide to long
#'
#' @description Merely converts a census dataframe from wide to long.
#'
#' @param data The data set to be transformed.
# Set NULL values to remove notes
variable <- NULL
value <- NULL
geometry <- NULL
if ("geometry" %in% colnames(data)) gather(data, variable, value, -1, -2, -3, -geometry)
else gather(data, variable, value, -1, -2, -3)
}
clean_census_columns <- function(.data){
#' Clean variables
#'
#' Cleans the variable names and splits them into columns in a long table.
#'
#' @param .data The data frame to be cleaned. Must be a long dataframe returned by read_nz_census_data()
#' @examples
#' nz_dwelling_regions_long <- transform_census(dwelling_regions,
#' gis = FALSE,
#' long = TRUE)
#' cleaned_data <- clean_census_columns(nz_dwelling_regions_long)
#' head(cleaned_data)
#' @importFrom magrittr "%>%"
#' @export
# Set NULL variables to avoid package test 'notes'
variable <- NULL
years <- NULL
topic <- NULL
year <- NULL
value <- NULL
# Extract geography
if ("sf" %in% class(.data) | "sfc" %in% class(.data)) {
is_geog <- TRUE
geog <- sf::st_geometry(.data)
sf::st_geometry(.data) <- NULL
} else is_geog <- FALSE
# Extract year and create cleaned variable variable.
.data <- mutate(.data,
# Extract the year
year = stringr::str_extract(variable, "...."),
# Remove it from the column incl the "Census" word from all columns.
variable = stringr::str_replace(variable, "20.._Census_", ""),
# Create the topic variable by finding the location of the
# first capital letter or digit preceded by an underscore _[A-Z] and taking everything in front of it
topic = stringr::str_sub(variable,
start = 0,
end = stringr::str_locate(variable, "_[A-Z]|_[0-9]|_\\$")[row_number(), 1]) %>%
stringr::str_replace_all("_", " "),
# Create the variable as the remaining part of the string after the "_[A-Z]"
cleaned_variable = stringr::str_sub(variable,
start = stringr::str_locate(variable, "_[A-Z]|_[0-9]|_\\$")[row_number(), 2]) %>%
stringr::str_replace_all("_", " "),
topic = ifelse(is.na(cleaned_variable), variable, topic)
)
# Create new data frame and get all of the column in the desired order.
.data <- select(.data, 1:3, year, topic, variable = cleaned_variable, value)
# Add the geometry column back in if it was present in the first place.
if (is_geog) {
sf::st_geometry(.data) <- geog
}
return(.data)
}
replace_confidential <- function(.data, replacement_value = NA_integer_){
#' Replace confidential values
#'
#' The NZ census commonly uses the notation of '..C' when values are below a
#' certain threshold that they may reveal private details of certain individuals.
#' However, it is often required in an analysis to replace these values and
#' convert the column to an integer (to include ..C they need to be character).
#'
#' This function takes the data set, and a replacement value and replaces all of the ..C values.
#' Mainly for use within the transform_census function.
#'
#' @param .data The data set to have its confidential values removed.
#' @param replacement_value The value to replace the confidential ones with. Defaults to NA_integer.
# Need to actually convert to function.
if (!is.numeric(replacement_value)) stop("Replacement value must be a number or NA_integer_")
do_not_mutate <- c("Area_Code_and_Description", "Code", "Description", "geometry")
replace_confidential_cols <- colnames(.data)[!(colnames(.data) %in% do_not_mutate)]
replacement_value <- as.character(replacement_value)
if(is.na(replacement_value)) {
replacement_function <- function(col) {suppressWarnings(as.integer(col))}
} else {
replacement_function <- function(col) {as.integer(stringr::str_replace(col, "..C", replacement_value))}
}
.data <- dplyr::mutate_at(.data, dplyr::vars(replace_confidential_cols), dplyr::funs(replacement_function))
.data <- sf::st_as_sf(.data)
return(.data)
}
extract_topics <- function(raw_topics, unique_topics = TRUE){
#' Extract topic
#'
#' Extracts the topics for a raw census column from a vector strings.
#'
#' @param raw_topics A vector of raw topics.
#' @param unique_topics Whether the results should be unique or not. Defaults to TRUE.
#'
#' @return A vector of topics
# Get unique values
raw_topics <- unique(raw_topics)
# Extract years and then strip away year
no_year <- stringr::str_replace(raw_topics, "^20.._Census_", "")
# Locate the first capital letter.
capital_letter_locations <- stringr::str_locate(no_year, "_[A-Z]|_[0-9]")[,1]
# Generate sequence to iterate over
sequence <- seq_along(no_year)
cleaned_topics <- sapply(sequence, function(x){
cleaned_topic <- stringr::str_sub(no_year[x], start = 0, end = capital_letter_locations[x] - 1) %>%
stringr::str_replace_all("_", " ")
if (is.na(cleaned_topic)) cleaned_topic <- no_year[x]
return(cleaned_topic)
})
if (unique_topics == TRUE) cleaned_topics <- unique(cleaned_topics)
return(cleaned_topics)
}
extract_variables <- function(raw_variables){
#' Extract variable
#'
#' Extracts the variables for a raw census column from a vector strings.
#'
#'
#' @param raw_variables A vector of census columns containing the raw variables.
#'
#' @return A vector of variables
# Get unique values
raw_variables <- unique(raw_variables)
# Strip away year
no_year <- stringr::str_replace(raw_variables, "^20.._Census_", "")
# Locate the first capital letter.
capital_letter_locations <- stringr::str_locate(no_year, "_[A-Z]|_[0-9]")[,2]
# Generate sequence to iterate over
sequence <- seq_along(no_year)
sapply(sequence, function(x){
cleaned_variable <- stringr::str_sub(no_year[x],
start = capital_letter_locations[x]) %>%
stringr::str_replace_all("_", " ")
})
}
## Select & filter functions.
select_by_topic <- function(.data, topics, exclude = FALSE){
#' Filter the census topics in a census dataframe
#'
#' This is a helper function for quickly and easily selecting or excluding topics on a wide nzcensr data set.
#'
#' Really just a wrapper around a str_detect("pattern_1|pattern_2") selection.
#'
#' @param .data The data frame from which the topics should be selected/excluded from.
#' @param topics A vector of census topics to be selected or excluded e.g.
#' (c("number_of_rooms_for_occupied_private_dwellings", "number_of_bedrooms_for_occupied_private_dwellings")).
#' @param exclude Whether the topics should be selected or excluded.
#'
#' @return A dataframe
#'
#' @examples
#' select_by_topic(dwelling_area_units, c("fuel_types", "private_dwelling"))
#'
#' @importFrom stringr str_replace_all str_detect
#'
#' @export
# Protect regular expression characters
topics_protected <- stringr::str_replace_all(topics, " ", "_") %>%
stringr::str_replace_all("(\\W)", "\\\\\\1")
topics_regex <- paste(topics_protected, collapse = "|")
# Detect the columns that match the vectors given.
all_columns <- colnames(.data)
topic_column_match <- stringr::str_detect(all_columns, topics_regex)
# Flip the logical if the excluding.
if (exclude == TRUE) topic_column_match <- !topic_column_match
# Check that the topics are actually in the dataframe
if (any(topic_column_match) == FALSE) stop("No columns match the topics you've given me in the data set.")
# Get the column names that match (including the description columns e.g. Description, Code, geometry etc.)
descriptive_columns <- all_columns[!str_detect(all_columns, "Census")]
matching_columns <- all_columns[topic_column_match]
wanted_columns <- c(descriptive_columns, matching_columns)
# Select and return data frame.
select(.data, one_of(wanted_columns))
}
filter_by_census_area <- function(.data, geographic_level, geographic_filter, pattern, append_geog_desc = TRUE){
#' Filters census data by area
#'
#' Filters a census table by another area e.g. meshblocks by territorial area or region.
#' It can only work if the geographic level is at the same or a greater scale than the input data e.g.
#' meshblocks <= area_units <= wards <= local boards <= territoral authorities <= regions
#'
#' @param .data The data set to be filtered. Must contain a column with census spatial
#' names in it e.g. (Area_Code_and_Description).
#' @param geographic_level The geographic level of the input spatial data e.g. area_units.
#' @param geographic_filter The geographic level of the spatial filter e.g. region.
#' @param pattern The pattern to match the area to filter by e.g. 'Wellington Region'.
#'
#' @return A sf dataframe.
#'
#' @examples
#'
#' @importFrom stringr str_detect str_interp
#' @importFrom dplyr filter select
#' @importFrom sf st_set_geometry
#' @importFrom tidyr unite
#' @export
# Check if a Area_Code_and_Description column is present
if (!('Area_Code_and_Description' %in% colnames(.data))) {
stop("Area_Code_and_Description is needed in the input data")
}
# Check is the geographic level is available.
g_levels <- c("meshblocks", "area_units", "wards", "local_boards", "tas", "regions")
if (!(geographic_level %in% g_levels)){
stop(stringr::str_interp("Incorrect geographic levels specified. Must be one of: ${paste(g_levels, collapse = ', ')}"))
}
# Check if the geographic level matches the available levels
if (any(stringr::str_detect(geographic_level, g_levels)) == FALSE){
stop(stringr::str_interp("Doesn't appear to be any geographic match for ${geographic_level} in the geographic levels: ${paste(g_levels, collapse = ', ')}"))
}
# Check if the geographic level of the input data is lower than the geographic level specified.
geographic_levels <- factor(g_levels, levels = g_levels)
.data_geographic_match <- geographic_levels[str_detect(geographic_level, g_levels)]
geographic_filter_factor <- geographic_levels[str_detect(geographic_filter, g_levels)]
if (.data_geographic_match %>% as.integer > geographic_filter_factor %>% as.integer){
stop("The input geographic area is at a larger scale than the filter by area. You can only filter by geographic levels which are equal or more than the input data. Remember: meshblocks < area_units < wards < local boards < territoral authorities < regions.")
}
# Add in filter of function here. NB: Meshblock cannot be filtered by.
name_concordance <- list(meshblocks = c("MB2013"), "area_units" = c("AU2013", "AU2013_NAM"),
"tas" = c("TA2013", "TA2013_NAM"), "wards" = c("WARD2013", "WARD2013_N"),
"local_boards" = c("CB2013", "CB2013_NAM"), "regions" = c("REGC2013", "REGC2013_N"))
# I need to create and compare the Area_Code_Description to the
.data_geography <- as.character(.data_geographic_match)
wanted_columns <- c(name_concordance[[.data_geography]], name_concordance[[geographic_filter]])
# Get the ids from the meshblock concordance table.
wanted_areas <- meshblocks %>%
sf::st_set_geometry(NULL) %>%
dplyr::select(one_of(wanted_columns)) %>%
tidyr::unite(lower_geo, one_of(name_concordance[[.data_geography]]), sep = " ", remove = FALSE) %>%
tidyr::unite(upper_geo, one_of(name_concordance[[geographic_filter]]), sep = " ", remove = FALSE) %>%
mutate(upper_geo = stringr::str_to_lower(upper_geo)) %>%
dplyr::filter(stringr::str_detect(upper_geo, stringr::str_to_lower(pattern))) %>%
dplyr::distinct()
# return(.data)
# return(list(.data, wanted_areas))
# If the lower level is meshblocks, then paste the "MB " onto the
# front so that it matches the Area_Code_and_Description meshblock id.
if (.data_geography == "meshblocks") wanted_areas <- paste0("MB ", wanted_areas)
if (append_geog_desc == TRUE){
# Filter data and join geog description
geog_desc <- select(wanted_areas,
one_of(name_concordance[[.data_geography]]),
one_of(name_concordance[[geographic_filter]]))
# Get the join parameters
code_join <- name_concordance[[.data_geography]][1]
description_join <- name_concordance[[.data_geography]][2]
dplyr::filter(.data, Area_Code_and_Description %in% wanted_areas$lower_geo) %>%
dplyr::left_join(geog_desc, by = c("Code" = code_join, "Description" = description_join))
} else {
# Filter data
dplyr::filter(.data, Area_Code_and_Description %in% wanted_areas$lower_geo)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.