# Read in census profiles data, only keep Toronto and Toronto CTs and derive dimension hierarchy
# Retrieved from: https://www12.statcan.gc.ca/census-recensement/2016/dp-pd/prof/details/download-telecharger/comp/page_dl-tc.cfm?Lang=E
# Geographic level for Toronto CTs: Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)
# Geographic level for City of Toronto: Census divisions (CDs)
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(janitor)
### Set up paths for data sets ------
cts_census_path <- here::here("data-raw", "aggregate_data", "census_profiles_2016", "raw", "98-401-X2016043_eng_CSV", "98-401-X2016043_English_CSV_data.csv")
toronto_census_path <- here::here("data-raw", "aggregate_data", "census_profiles_2016", "raw", "98-401-X2016060_eng_CSV", "98-401-X2016060_English_CSV_data.csv")
# Variables are the same between geographies / data sets, so we can just use/parse one
variables_path <- here::here("data-raw", "aggregate_data", "census_profiles_2016", "raw", "98-401-X2016043_eng_CSV", "98-401-X2016043_English_meta.txt")
# ### Get starting and ending row of Toronto -----
cts_starting_row <- read_csv(here::here("data-raw", "aggregate_data", "census_profiles_2016", "raw", "98-401-X2016043_eng_CSV", "Geo_starting_row_CSV.csv"))
toronto_starting_row <- read_csv(here::here("data-raw", "aggregate_data", "census_profiles_2016", "raw", "98-401-X2016060_eng_CSV", "Geo_starting_row_CSV.csv"))
# For CTs, the Geo Names seem to be numeric, unless they are a city - then everything between cities is for that city. So look for Toronto, then look for the next city, and everything between is Toronto.
cts_toronto_start_with_next_geo <- cts_starting_row %>%
mutate(geo_name_as_numeric = parse_number(`Geo Name`)) %>%
filter(is.na(geo_name_as_numeric)) %>%
arrange(`Line Number`) %>%
mutate(
next_geo = lead(`Geo Name`),
next_geo_line_number = lead(`Line Number`)
) %>%
filter(`Geo Name` == "Toronto")
# Next geography is Hamilton (makes sense), and starts on line 7235342.
cts_toronto_start_end <- cts_toronto_start_with_next_geo %>%
select(
start = `Line Number`,
end = next_geo_line_number
) %>%
mutate(end = end - 1) # Subtract 1 since Hamilton starts on 7235342.
# For Toronto, just look for Toronto, then the next row, and everything between is Toronto
toronto_start_end <- toronto_starting_row %>%
mutate(
next_geo_nam = lead(`Geo Name`),
next_geo_line_number = lead(`Line Number`)
) %>%
filter(`Geo Name` == "Toronto") %>%
select(
start = `Line Number`,
end = next_geo_line_number
) %>%
mutate(end = end - 1)
### Build variable hierarchy from metadata -----
original_variables <- read_lines(variables_path, skip = 211, n_max = 2458 - 211)
variables <- tibble(x = original_variables) %>%
# Fix some where there's " -[A-Z]" instead of " - " (missing space at end)
mutate(x = str_replace(x, " -[A-Z]", " - ")) %>%
# And some where there's " - " instead of " - " (one extra space at end)
mutate(x = str_replace(x, " - ", " - ")) %>%
mutate(
x = str_replace(x, " \\(\\d+\\)", ""), # Remove (#) at the end of lines
x = str_replace_all(x, " ", "@@") # Replace any spaces with @@, for easier viewing / separating
) %>%
separate(x, into = c("member_id", "dimension"), "\\.@@", extra = "merge") %>%
mutate(
member_id = str_replace_all(member_id, "@", ""),
dimension = str_replace_all(dimension, "@@@@", ";"),
dimension = str_replace_all(dimension, "@@", " ")
)
n_levels <- variables %>%
mutate(levels = str_count(dimension, ";")) %>%
pull(levels) %>%
max()
n_levels <- n_levels + 1
level_cols <- glue::glue("level_{1:n_levels}")
variables <- variables %>%
separate(dimension, into = level_cols, sep = ";", fill = "right", remove = FALSE) %>%
mutate_all(na_if, "") %>%
fill(level_1)
# Fill each level down, unless the level *before* it has changed
fill_level <- function(df, level) {
var <- glue::glue("level_{level}")
var <- rlang::sym(var)
prev_var <- glue::glue("level_{level - 1}")
prev_var <- rlang::sym(prev_var)
all_prev_vars <- glue::glue("level_{1:(level - 1)}")
all_prev_vars <- rlang::syms(all_prev_vars)
df %>%
group_by(!!!all_prev_vars) %>%
fill(!!var) %>%
mutate(!!var := case_when(
!!prev_var != lag(!!prev_var) ~ NA_character_,
TRUE ~ !!var
)) %>%
ungroup()
}
variables <- variables %>%
fill_level(2) %>%
fill_level(3) %>%
fill_level(4) %>%
fill_level(5) %>%
fill_level(6) %>%
fill_level(7) %>%
fill_level(8)
# Construct full dimension and the final level
variables <- variables %>%
mutate(
dimension_full = paste(level_1, level_2, level_3, level_4, level_5, level_6, level_7, level_8, sep = "::"),
level = as.numeric(!is.na(level_1)) +
as.numeric(!is.na(level_2)) +
as.numeric(!is.na(level_3)) +
as.numeric(!is.na(level_4)) +
as.numeric(!is.na(level_5)) +
as.numeric(!is.na(level_6)) +
as.numeric(!is.na(level_7)) +
as.numeric(!is.na(level_8))
) %>%
mutate_all(list(~ str_replace_all(., "::NA", ""))) %>%
mutate(dimension = str_replace_all(dimension, ";", ""))
hierarchy_for_level <- function(df, level) {
if (level == 1) {
distinct_levels <- df %>%
distinct(level_1) %>%
mutate(level_hierarchy = as.character(row_number()))
res <- df %>%
left_join(distinct_levels, by = "level_1") %>%
select(level_hierarchy)
} else {
vars_chr <- glue::glue("level_{1:(level - 1)}")
vars <- rlang::syms(vars_chr)
level_var_chr <- glue::glue("level_{level}")
level_var <- level_var_chr %>%
rlang::sym()
distinct_levels <- df %>%
group_by(!!!vars) %>%
distinct(!!level_var) %>%
filter(!is.na(!!level_var)) %>%
mutate(level_hierarchy = as.character(row_number())) %>%
ungroup()
res <- df %>%
left_join(distinct_levels, by = c(vars_chr, level_var_chr)) %>%
select(level_hierarchy) %>%
mutate(level_hierarchy = coalesce(level_hierarchy, ""))
}
names(res) <- glue::glue("level_{level}")
res
}
generate_hierarchy <- function(df) {
level_vars <- df %>%
select(starts_with("level_", ignore.case = FALSE)) %>%
names()
levels <- readr::parse_number(level_vars)
res <- purrr::map(levels, ~ hierarchy_for_level(df, .x))
res <- bind_cols(res)
res %>%
mutate(id = row_number()) %>%
pivot_longer(-id, names_to = "level", values_to = "value", names_prefix = "level_") %>%
group_by(id) %>%
arrange(level) %>%
filter(value != "") %>%
summarise(dimension_id = paste0(value, collapse = ".")) %>%
select(dimension_id)
}
variables_hierarchy <- variables %>%
generate_hierarchy()
variables_with_hierarchy <- variables_hierarchy %>%
bind_cols(variables)
hierarchy <- variables_with_hierarchy %>%
mutate(
parent_id = str_replace(dimension_id, "\\.[^\\.]*$", ""),
member_id = as.numeric(member_id)
) %>%
select(member_id, parent_id, dimension_id, dimension, dimension_full)
### Get Toronto census tracts -----
# Load only from the start to the end of Toronto
toronto_cts <- read_csv(cts_census_path, skip = cts_toronto_start_end[["start"]] - 1, guess_max = 100000, n_max = cts_toronto_start_end[["end"]] - cts_toronto_start_end[["start"]] + 1, trim_ws = FALSE, col_names = FALSE)
# Get column names since we skipped those on load
census_col_names <- read_csv(cts_census_path, n_max = 1, trim_ws = FALSE)
names(toronto_cts) <- names(census_col_names)
# ### Add hierarchy to census tracts ----
toronto_cts <- hierarchy %>%
inner_join(toronto_cts, by = c("member_id" = "Member ID: Profile of Census Tracts (2247)"))
# ### Tidy names
toronto_cts <- toronto_cts %>%
rename(
total = `Dim: Sex (3): Member ID: [1]: Total - Sex`,
# Some conversion of values (... to NA) needs to be done, but I'll do that in the cleaning step so that I can document the ...
male = `Dim: Sex (3): Member ID: [2]: Male`,
female = `Dim: Sex (3): Member ID: [3]: Female`
) %>%
clean_names()
# ### Save Toronto census tracts
saveRDS(toronto_cts, here::here("data-raw", "aggregate_data", "census_profiles_2016", "extract", "toronto_census_tracts.rds"))
### Get Toronto CD -----
# Load only from the start to the end of Toronto
toronto_cd <- read_csv(toronto_census_path, skip = toronto_start_end[["start"]] - 1, guess_max = 100000, n_max = toronto_start_end[["end"]] - toronto_start_end[["start"]] + 1, trim_ws = FALSE, col_names = FALSE)
# Get column names since we skipped those on load
census_col_names <- read_csv(toronto_census_path, n_max = 1, trim_ws = FALSE)
names(toronto_cd) <- names(census_col_names)
# ### Add hierarchy to census tracts ----
toronto_cd <- hierarchy %>%
inner_join(toronto_cd, by = c("member_id" = "Member ID: Profile of Census Divisions (2247)"))
# ### Tidy names
toronto_cd <- toronto_cd %>%
rename(
total = `Dim: Sex (3): Member ID: [1]: Total - Sex`,
# Some conversion of values (... to NA) needs to be done, but I'll do that in the cleaning step so that I can document the ...
male = `Dim: Sex (3): Member ID: [2]: Male`,
female = `Dim: Sex (3): Member ID: [3]: Female`
) %>%
clean_names()
# ### Save Toronto census tracts
saveRDS(toronto_cd, here::here("data-raw", "aggregate_data", "census_profiles_2016", "extract", "toronto_census_division.rds"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.