Nothing
correspondence_ca_census_urls <- list(
"2006"=list("DB"="https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/2006_92-156_DB_ID_txt.zip",
"DA"="https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/2006_92-156_DA_AD_txt.zip"),
"2011"=list("DB"="https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/2011_92-156_DB_ID_txt.zip",
"DA"="https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/2011_92-156_DA_AD_txt.zip"),
"2016"=list("DB"="https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/2016/2016_92-156_DB_ID_csv.zip",
"DA"="https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/2016/2016_92-156_DA_AD_csv.zip"),
"2021"=list("DB"="https://www12.statcan.gc.ca/census-recensement/2021/geo/aip-pia/correspondence-correspondance/files-fichiers/2021_92-156-X_DB_ID.zip",
"DA"="https://www12.statcan.gc.ca/census-recensement/2021/geo/aip-pia/correspondence-correspondance/files-fichiers/2021_92-156-X_DA_AD.zip")
)
ca_census_base <- c("Population","Dwellings","Households")
years_from_datasets <- function(ds) {
ds %>%
stringr::str_extract("\\d+") %>%
stringr::str_pad(width=3,side="left",pad="0") %>%
stringr::str_pad(width=4,side="left",pad="2") %>%
as.integer()
}
datasets_from_vectors <- function(vs){
ds<-vs %>%
stringr::str_split("_") %>%
purrr::map(function(v)v[[2]]) %>%
unlist()
ds[grepl("^\\d{4}$",ds)]<-geo_dataset_for_years(ds[grepl("^\\d{4}$",ds)])
ds
}
GEO_DATASET_LOOKUP <- c(
setNames(rep("CA1996",1),paste0("TX",seq(2000,2000))),
setNames(rep("CA01",5),paste0("TX",seq(2001,2005))),
setNames(rep("CA06",6),paste0("TX",seq(2006,2011))),
setNames(rep("CA11",4),paste0("TX",seq(2012,2015))),
setNames(rep("CA16",5),paste0("TX",seq(2016,2020))),
setNames(rep("CA16",21),paste0("CA",seq(2000,2020),"RMS"))
)
geo_dataset_for_years <- function(years){
dataset_list <- cancensus::list_census_datasets()
years %>%
lapply(function(year){
dataset_list %>%
filter(.data$description==paste0(year," Canada Census")|.data$description==paste0(year," Canada Census and NHS")) %>%
pull(.data$geo_dataset) %>%
unique()
}) %>%
unlist()
}
geo_dataset_from_dataset <- function(datasets){
if (TRUE) { # legacy until cancensus updates
datasets <- datasets %>% gsub("^CA11[NF]$","CA11",.) %>% gsub("\\d{4}x","",.)
dataset_list <- cancensus::list_census_datasets()
lapply(datasets, function(ds){
dataset_list %>%
filter(.data$dataset == ds) %>%
pull(.data$geo_dataset) %>%
unique()
}) %>%
unlist()
} else {
result <- tibble(dataset=datasets,geo_dataset=GEO_DATASET_LOOKUP[datasets]) %>%
mutate(geo_dataset=ifelse(is.na(.data$geo_dataset),.data$dataset %>%
years_from_datasets() %>%
as.character() %>%
substr(3,4) %>%
paste0("CA",.),
.data$geo_dataset))
result$geo_dataset
}
}
#' Generate metadata from Candian census vectors
#'
#' @description
#' \lifecycle{maturing}
#'
#' Build tibble with information on how to aggregate variables given vectors
#' Queries list_census_variables to obtain needed information and add in vectors needed for aggregation
#'
#' @param vectors list of variables to query
#' @return tidy dataframe with metadata information for requested variables and additional variables
#' needed for tongfen operations
#' @export
#'
#' @examples
#' # Build metadata for vectors
#' \dontrun{
#' meta <- meta_for_ca_census_vectors("v_CA16_4836","v_CA16_4838","v_CA16_4899")
#'}
meta_for_ca_census_vectors <- function(vectors){
nn <- names(vectors)
vectors <- as.character(vectors) ## strip names just in case
if (is.null(nn)) {
nn <- vectors
} else {
nn[nn==""]=vectors[nn==""]
}
if (length(vectors)==0) {
meta <- tibble::tibble(variable=NA,label=NA,dataset=datasets_from_vectors(vectors))
}
meta <- tibble::tibble(variable=vectors,label=nn,dataset=datasets_from_vectors(vectors)) %>%
mutate(type="Original", aggregation="0",units=NA)
datasets <- meta$dataset %>%
unique %>%
sort
for (dataset in datasets){
d <- cancensus::list_census_vectors(dataset) %>%
filter(.data$vector %in% (filter(meta,.data$dataset==dataset)$variable)) %>%
select(.data$vector,.data$aggregation,.data$units)
aggregation_lookup <- setNames(d$aggregation,d$vector)
units_lookup <- setNames(d$units %>% as.character,d$vector)
meta <- meta %>%
mutate(aggregation=ifelse(.data$variable %in% names(aggregation_lookup),aggregation_lookup[.data$variable],.data$aggregation),
units=ifelse(.data$variable %in% names(units_lookup),units_lookup[.data$variable],.data$units))
}
get_vector <- function(g){
g %>% strsplit(" ") %>% purrr::map(function(a){ifelse(length(a)==3,a[3],NA)}) %>% unlist
}
meta <- meta %>%
mutate(rule=case_when(grepl("Average of",.data$aggregation) ~ "Average",
grepl("Median of",.data$aggregation) ~ "Median",
.data$aggregation=="Not additive" ~ "Not additive",
.data$aggregation=="Additive" ~ "Additive",
grepl("Average to",.data$aggregation) ~ "AverageTo",
TRUE ~ sub(" .+$","",.data$aggregation)),
parent=get_vector(.data$aggregation))
extras <- meta %>%
select(variable=.data$parent,.data$dataset) %>%
mutate(type="Extra",aggregation="Additive",rule="Additive") %>%
filter(!is.na(.data$variable),!.data$variable %in% meta$variable) %>%
filter(!duplicated(.data$variable,.data$dataset)) %>%
mutate(label=.data$variable)
if (nrow(extras)>0) {
meta <- meta %>%
bind_rows(extras) %>%
filter(!duplicated(.data$variable,.data$dataset))
}
meta <- meta %>%
mutate(geo_dataset=geo_dataset_from_dataset(.data$dataset),
year=years_from_datasets(.data$dataset))
meta
}
#' Generate metadata from Candian census vectors
#'
#' @description
#' \lifecycle{maturing}
#'
#' Add Population, Dwellings, and Household counts to metadata
#' @param meta ribble with metadata as for example provided by `meta_for_ca_census_vectors`
#' @return tibble with metadata
add_census_ca_base_variables <- function(meta){
new_meta <- meta$geo_dataset %>%
unique() %>%
lapply(function(ds) {
ca_base <- setdiff(ca_census_base,meta %>%
filter(.data$geo_dataset==ds) %>%
pull(.data$variable))
meta_for_additive_variables(ds,ca_base) %>%
mutate(units="Number",
year=years_from_datasets(ds))
}) %>%
bind_rows()
meta %>%
bind_rows(new_meta)
}
#' Get StatCan DA or DB level correspondence file
#'
#' @description
#' \lifecycle{maturing}
#'
#' @param year census year, only 2006 through 2021 are supported
#' @param level geographic level, DA or DB
#' @param refresh reload the correspondence files, default is `FALSE`
#' @return tibble with correspondence table`
get_single_correspondence_ca_census_for <- function(year,level=c("DA","DB"),refresh=FALSE) {
level=level[1]
year=as.character(year)[1]
if (!(level %in% c("DA","DB"))) stop("Level needs to be DA or DB")
if (!(year %in% c("2006","2011","2016","2021"))) stop("Year needs to be 2006, 2011, 2016, or 2021")
new_field=paste0(level,"UID",year)
old_field=paste0(level,"UID",as.integer(year)-5)
path=file.path(tongfen_cache_dir(),paste0("statcan_correspondence_",year,"_",level,".csv"))
if (refresh || !file.exists(path)) {
url=correspondence_ca_census_urls[[year]][[level]]
tmp=tempfile()
utils::download.file(url,tmp)
exdir=file.path(tempdir(),paste0("correspondence_",year,"_",level))
if (dir.exists(exdir)) file.remove(exdir,recursive=TRUE)
dir.create(exdir,showWarnings = FALSE)
utils::unzip(tmp,exdir=exdir)
file=dir(exdir,"\\.txt|\\.csv")
if (length(file)==0) {
p<-dir(exdir)[1]
if (dir.exists(file.path(exdir,p))) {
exdir=file.path(exdir,p)
file=dir(exdir,"\\.txt|\\.csv")
}
}
if (level=="DB") headers=c(new_field,old_field,"flag") else headers=c(new_field,old_field,paste0("DBUID",year),"flag")
unwanted <- paste0(level,"UID",year)
d<-readr::read_csv(file.path(exdir,file),col_types = readr::cols(.default = "c"),col_names = headers) %>%
select(all_of(c(new_field,old_field,"flag"))) %>%
unique() %>%
filter(!grepl(unwanted,!!as.name(new_field)))
readr::write_csv(d,path)
unlink(tmp)
unlink(exdir,recursive = TRUE)
}
result <- readr::read_csv(path,col_types = readr::cols(.default = "c"))
# manual corrections
if (year=="2021" && level=="DB") {
# inconsequential boundary shift Cov/UBC
result <- result %>% filter(.data$DBUID2021!="59150934010")
} else if (year=="2021" && level=="DA") {
# inconsequential boundary shift Cov/UBC
result <- result %>% filter(!(.data$DAUID2021=="59150934" & .data$DAUID2016 == "59150936"))
}
result
}
#' Get StatCan correspondence data
#'
#' @description
#' \lifecycle{maturing}
#'
#' Get correspondence file for several Candian censuses on a common geography. Requires sf and cancensus package to be available
#'
#' @param regions census region list, should be inclusive list of GeoUIDs across censuses
#' @param geo_datasets vector of census geography dataset identifiers
#' @param level aggregation level to return data on (default is "CT")
#' @param method tongfen method, options are "statcan" (the default), "estimate", "identifier".
#' * "statcan" method builds up the common geography using Statistics Canada correspondence files, at this point
#' this method only works for "DB", "DA" and "CT" levels.
#' * "estimate" uses `estimate_tongfen_correspondence` to build up the common geography from scratch based on geographies.
#' * "identifier" assumes regions with identical geographic identifier are identical, and builds up the the correspondence for regions with unmatched geographic identifiers.
#' @param tolerance tolerance for `estimate_tongen_correspondence` in metres, default value is 50 metres.
#' @param area_mismatch_cutoff discard areas returned by `estimate_tongfen_correspondence` with area mismatch (log ratio) greater than cutoff.
#' @param quiet suppress download progress output, default is `FALSE`
#' @param refresh optional character, refresh data cache for this call, (default `FALSE`)
#' @return dataframe with the multi-census correspondence file
#' @export
#'
#' @examples
#' # Get correspondance files between CTs in 2006 and 2016 censuses in Vancouver CMA
#' \dontrun{
#' correspondence <- get_tongfen_correspondence_ca_census(geo_datasets=c('CA06','CA16'),
#' regions=list(CMA="59933"),level='CT')
#'}
get_tongfen_correspondence_ca_census <- function(geo_datasets, regions, level="CT", method="statcan",
tolerance = 50, area_mismatch_cutoff = 0.1,
quiet = FALSE, refresh = FALSE) {
geo_datasets <- normalize_datasets(geo_datasets)
if (method=="statcan") {
assert(level %in% c("DB","DA","CT"),"Level has to be one of DB, DA, or CT when using method = 'statcan'.")
assert(length(setdiff(geo_datasets, c("CA21","CA16","CA11","CA06","CA01")))==0,
"Method 'statcan' only works for census years 2001 through 2021.")
} else if (method=="estimate") {
} else if (method=="identifier") {
} else {
stop(paste0("Unknown method ",method,", has to be one of 'statcan', 'estimate', 'identifier'"))
}
use_cache <- !refresh
data <- lapply(geo_datasets,function(g_ds){
cancensus::get_census(dataset=g_ds, regions=regions, level=level, geo_format='sf',
labels="short", quiet=quiet, use_cache = use_cache) %>%
mutate(!!paste0("GeoUID",g_ds):=.data$GeoUID)
}) %>%
setNames(geo_datasets)
if (method=="statcan") {
statcan_level <- level
if (!(statcan_level %in% c("DB","DA"))) statcan_level <- "DA"
geo_years <- geo_datasets %>% years_from_datasets()
years<-as.integer(geo_years)
all_geo_years=seq(min(years),max(years),5)
all_geo_datasets <- geo_dataset_for_years(all_geo_years)
for (g_ds in setdiff(all_geo_datasets,geo_datasets)) {
data[[g_ds]] <- cancensus::get_census(dataset=g_ds, regions=regions, level=level, geo_format='sf',
labels="short", quiet=quiet, use_cache = use_cache) %>%
mutate(!!paste0("GeoUID",g_ds):=.data$GeoUID)
}
prefix=paste0(statcan_level,"UID")
if (level=="CT") {
c_links <- geo_datasets %>%
lapply(function(ds){
da_column <- ds %>% years_from_datasets() %>% paste0("DAUID",.)
match_column <- ds %>% paste0("GeoUID",.)
cancensus::get_census(dataset=ds,regions=regions,level="DA",use_cache = use_cache,quiet=quiet) %>%
select(.data$GeoUID,.data$CT_UID) %>%
rename(!!match_column:=.data$CT_UID,
!!da_column:=.data$GeoUID)
}) %>%
setNames(geo_datasets)
} else if (level %in% c("DB","DA")){
c_links <- all_geo_datasets %>%
lapply(function(ds){
year <- years_from_datasets(ds)
base_column <- paste0(prefix,year)
match_column <- paste0("GeoUID",ds)
data[[ds]] %>%
st_set_geometry(NULL) %>%
select_at(match_column) %>%
mutate(!!base_column:=!!as.name(match_column))
}) %>%
setNames(all_geo_datasets)
} else {
stop("Oops, should have caught this earlier.")
}
correspondence_years=all_geo_years[-1]
correspondence <- correspondence_years %>%
lapply(function(year){
c <- get_single_correspondence_ca_census_for(year,statcan_level) %>%
select(-.data$flag)
previous_year <- all_geo_years[which(all_geo_years==year)-1]
ds1 <- all_geo_datasets[all_geo_years==year]
ds2 <- all_geo_datasets[all_geo_years==previous_year]
if (!is.null(ds1) && length(ds1)>0) {
match_column <- intersect(names(c),names(c_links[[ds1]]))
if (!is.null(match_column)) {
c <- c %>%
inner_join(c_links[[ds1]],by=match_column) %>%
select(-all_of(match_column)) %>%
unique()
}
}
if (!is.null(ds2) && length(ds2)>0) {
match_column <- intersect(names(c),names(c_links[[ds2]]))
if (!is.null(match_column)) {
c <- c %>%
inner_join(c_links[[ds2]],by=match_column) %>%
select(-all_of(match_column)) %>%
unique()
}
}
c %>%
mutate(TongfenMethod="statcan")
}) %>%
aggregate_correspondences() %>%
select(c(paste0("GeoUID",geo_datasets),"TongfenMethod")) %>%
unique() %>%
get_tongfen_correspondence()
#setNames(correspondence_years)
} else {
geo_identifiers <- paste0("GeoUID",geo_datasets)
correspondence <- estimate_tongfen_correspondence(data,
geo_identifiers,
method = method,
tolerance=200,
computation_crs=3347)
}
correspondence
}
#' Togfen data from several Canadian censuses
#'
#' @description
#' \lifecycle{maturing}
#'
#' Get data from several Candian censuses on a common geography. Requires sf and cancensus package to be available
#'
#' @param regions census region list, should be inclusive list of GeoUIDs across censuses
#' @param meta metadata for the census veraiables to aggregate, for example as returned
#' by \code{meta_for_ca_census_vectors}.
#' @param level aggregation level to return data on (default is "CT")
#' @param method tongfen method, options are "statcan" (the default), "estimate", "identifier".
#' * "statcan" method builds up the common geography using Statistics Canada correspondence files, at this point
#' this method only works for "DB", "DA" and "CT" levels.
#' * "estimate" uses `estimate_tongfen_correspondence` to build up the common geography from scratch based on geographies.
#' * "identifier" assumes regions with identical geographic identifier are identical, and builds up the the correspondence for regions with unmatched geographic identifiers.
#' @param base_geo base census year to build up common geography from, `NULL` (the default) to not return
#' any geographi data
#' @param na.rm logical, determines how NA values should be treated when aggregating variables
#' @param tolerance tolerance for `estimate_tongen_correspondence` in metres, default value is 50 metres,
#' only used when method is 'estimate' or 'identifier'
#' @param area_mismatch_cutoff discard areas returned by `estimate_tongfen_correspondence` with area mismatch (log ratio) greater than cutoff,
#' only used when method is 'estimate' or 'identifier'
#' @param quiet suppress download progress output, default is `FALSE`
#' @param refresh optional character, refresh data cache for this call, (default `FALSE`)
#' @param crs optional CRS to transform data to, and use for spatial intersections if method is
#' 'identifier' or 'estimate'
#' @param data_transform optional transform function to be applied to census data after being returned from cancensus
#' @return dataframe with variables on common geography
#' @export
#'
#' @examples
#' # Get rent data for census years 2001 through 2016
#' \dontrun{
#' rent_variables <- c(rent_2001="v_CA01_1667",rent_2016="v_CA16_4901",
#' rent_2011="v_CA11N_2292",rent_2006="v_CA06_2050")
#' meta <- meta_for_ca_census_vectors(rent_variables)
#'
#' regions=list(CMA="59933")
#' rent_data <- get_tongfen_ca_census(regions=regions, meta=meta, quiet=TRUE,
#' method="estimate", level="CT", base_geo = "CA16")
#'
#'}
get_tongfen_ca_census <- function(regions,meta,level="CT",method="statcan",
base_geo=NULL,na.rm=FALSE,
tolerance = 50,
area_mismatch_cutoff = 0.1,
quiet=FALSE,
refresh=FALSE,
crs=NULL,
data_transform=function(d)d) {
use_cache <- !refresh
geo_datasets <- meta$geo_dataset %>% unique() %>% sort()
meta <- meta %>% add_census_ca_base_variables()
data <- lapply(geo_datasets,function(g_ds){
vectors <- meta %>%
filter(.data$geo_dataset == g_ds,
.data$type != "Base") %>%
pull(.data$variable) %>%
as.character()
c <- cancensus::get_census(dataset=g_ds, regions=regions,
vectors=vectors,
level=level, geo_format='sf',
labels="short", quiet=quiet, use_cache = use_cache) %>%
mutate(!!paste0("GeoUID",g_ds):=.data$GeoUID)
if (!is.null(crs)) c <- c %>% sf::st_transform(crs)
c
}) %>%
setNames(geo_datasets)
if (length(geo_datasets)==1) {
# no need to tongfen
aggregated_data <- data[[1]]
} else {
correspondence <- get_tongfen_correspondence_ca_census(geo_datasets = geo_datasets,
regions = regions,
level = level,
method = method,
tolerance = tolerance,
area_mismatch_cutoff = area_mismatch_cutoff,
quiet = quiet,
refresh = refresh)
aggregated_data <- tongfen_aggregate(data,correspondence,meta)
}
aggregated_data %>%
rename_with_meta(meta)
}
#' @import dplyr
#' @importFrom rlang .data
NULL
if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.