# Region <- "STE"
START <- Sys.time() knitr::opts_knit$set(root.dir = "..") knitr::opts_chunk$set(echo = TRUE, print_chunk = TRUE)
suppressMessages({ library(readxl) library(testthat) library(magrittr) library(hutils) library(data.table) library(ASGS) library(Census2016.DataPack) })
# Ensure we are working with pristine tables if (!isTRUE(getOption("knitr.in.progress"))) { AllTables <- Filter(function(x) is.data.table(get(x)), ls()) if (length(AllTables) != 0) { if (exists("table_killer")) { menu_selection <- 1 } else { menu_selection <- menu(c("Remove all tables.", "Cancel."), title = "Data tables detected. This script will save all tables in the workspace to the package. Remove current tables from workspace?") } } if (exists("menu_selection")) { if (menu_selection %in% c(0, 2)) { stop("Operation cancelled.") } else { rm(list = AllTables) } } }
if (!isNamespaceLoaded("Census2016.DataPack")) { load("../data/SA16_decoder.rda") }
Metadata <- read_excel("./data-raw/Metadata/Metadata_2016_TSP_DataPack.xlsx", sheet = "Cell descriptors information", skip = 12) %>% as.data.table %>% .[, varI := as.integer(gsub("^T", "", Sequential))] %>% setnames("DataPack file", "DataPackFile") %>% .[, TableNo := as.integer(gsub("[^0-9]", "", DataPackFile))] %>% .[]
.short2long <- function(short) { coalesce(Metadata[match(x = short, Short)][["Long"]], short) } expect_equal(.short2long(c("squierl", "Tot_persons_C11_F", "Tot_C16_P")), c("squierl", "Total_persons_2011_Census_Females", "Total_2016_Census_Persons")) short2long <- function(.data) { noms <- names(.data) setnames(.data, noms, .short2long(noms)) }
Region_key <- fread(paste0('./data-raw/data/', Region, '/AUST/2016Census_T01_AUS_', Region, '.csv')) %>% names %>% extract2(1)
grep <- function(..., perl, fixed) base::grep(..., perl = missing(fixed), fixed = !missing(fixed)) grepl <- function(..., perl, fixed) base::grepl(..., perl = missing(fixed), fixed = !missing(fixed)) gsub <- function(..., perl, fixed) base::gsub(..., perl = missing(fixed), fixed = !missing(fixed)) sub <- function(..., perl, fixed) base::sub(..., perl = missing(fixed), fixed = !missing(fixed))
# Possible future issue with R 3.5 stopifnot(sub("a_([a-z])", "\\U\\1\\E", "a_b") == "B", sub("a_([A-Z])", "\\L\\1\\E", "a_B") == "b")
#' @return Wide as-is data table of all tables. freadG <- function(g, ...) { stopifnot(length(g) == 1) g0 <- paste0(if (g < 10) "0", g) file_list <- list.files(path = file.path('./data-raw/data/', Region, 'AUST'), # Files may be of the form ..G08_AUS.. or ..G09B_AUS.. pattern = paste0('2016Census_T', g0, '[A-Z]?_AUS_', Region, '\\.csv$'), full.names = TRUE) if (length(file_list) > 1) { file_list %>% lapply(fread, key = Region_key, ...) %>% Reduce(f = function(X, Y) X[Y]) } else { fread(file = file_list[1], ...) } %>% short2long }
freadT <- function(g, vi = NULL, value.name = "persons") { gmolten <- freadG(g, logical01=FALSE) %>% melt.data.table(id.vars = Region_key, variable.factor = FALSE, value.name = value.name) # Lots of duplicate table names across g's Metadata <- Metadata[TableNo == g] if (all(.subset2(gmolten, "variable") %chin% .subset2(Metadata, "Long"))) { og <- gmolten[Metadata, on = "variable==Long", nomatch=0L] } else if (all(.subset2(gmolten, "variable") %chin% .subset2(Metadata, "Short"))) { # Want the Long form to be 'variable' og <- gmolten[Metadata, on = "variable==Short", nomatch=0L] og <- drop_col(og, "variable") setnames(og, "Long", "variable") } else { stop("Unable to decode table names.") } if (is.null(vi)) { vi <- range(og[["varI"]]) } og %>% drop_cols(setdiff(names(Metadata), "varI")) %>% .[varI %between% vi] %>% extract_Year %>% .[] }
do_dots <- function(...) { eval(substitute(alist(...))) }
splitW <- function(DT, ..., Vs, # elict the rows of DT # from length(Vs) elicit = TRUE) { lower_caps <- function(v) { # Not_stated can be at the start of a code, # e.g. Table 15 v <- gsub("(?<!(^))(?:Not_stated)", "na", v) v <- gsub("(?:None)", "0", v) v <- gsub("Information_and_Communication_Technology", "Information_and_communication_technology", v) v <- gsub("_ICT_", "_ict_", v) v } reraise_caps <- function(v) { v <- gsub("Information_and_communication_technology", "Information_and_Communication_Technology", v) v <- gsub("_ict_", "_ICT_", v) v <- gsub("_", " ", v) v } if (missing(Vs)) { dotsL <- length(match.call(expand.dots = FALSE)$...) Vs <- vapply(seq_len(dotsL), function(x) deparse(do_dots(...)[[x]]), character(1)) v <- .subset2(DT, "variable") # Assume a new level occurs with each capital letter, # except the following v <- lower_caps(v) level <- nchar(gsub("[^A-Z]", "", v)) n_levels <- uniqueN(level) } if (elicit) { out <- DT[which(level == length(Vs))] } else { out <- DT if (n_levels > 1L) { to_print <- copy(out) to_print[, level := level] %>% unique(by = "level") %>% print stop("Cannot print: levels appear to be different.") } if (length(Vs) != n_levels) { stop("length(Vs) = ", length(Vs), " != n_levels = ", n_levels, ".") } } out[, variable := lower_caps(variable)] out[, (Vs) := tstrsplit(.SD[[1]], split = "_(?=[A-Z])", perl = TRUE), .SDcols = "variable" ][, (Vs) := lapply(.SD, reraise_caps), .SDcols = Vs ] # for (vv in names(out)) { # if ("Total" %fin% out[[vv]]) { # out <- out[out[[vv]] != "Total"] # } # } out }
.extract_Year <- function(v) { if (all(grepl("^.+_(2006|2011|2016)_Census.*$", v))) { out <- as.integer(sub("^.*_(2[0-9]{3})_Census.*$", "\\1", v)) } else if (all(grepl("^(2006|2011|2016)_Census", v))) { out <- as.integer(substr(v, 0, 4)) } else { first_bad <- v[nor(grepl("^(2006|2011|2016)_Census", v), grepl("^.+_(2006|2011|2016)_Census.*$", v))] %>% head(1) print(first_bad) stop("Unable to detect census year", call. = FALSE) } } extract_Year <- function(.data) { stopifnot("variable" %in% names(.data)) .data[, Year := .extract_Year(variable)] } expect_equal(.extract_Year("Total_persons_2006_Census_Males"), 2006L) expect_equal(.extract_Year("2006_Census_15_19_years_Married_Females"), 2006L)
.extract_Sex <- function(v) { req_regex <- ".*(Males|Females)" substr(sub(req_regex, "\\1", v), 0, 1) } expect_equal(.extract_Sex("Total_persons_2006_Census_Males"), "M") extract_Sex <- function(.data) { req_regex <- ".*(Males|Females)" stopifnot("variable" %in% names(.data), all(grepl(req_regex, .data[["variable"]]))) .data[, Sex := .extract_Sex(variable)] }
`%npin%` <- function(x, y) { !grepl(y, x) }
.extract_Age <- function(v, orderedFactor = TRUE) { out <- if_else(grepl("^.*?_([0-9]{2})_([0-9]{2})_years$", v), sub("^.*?_([0-9]{2})_([0-9]{2})_years", "\\1-\\2", v), if_else(grepl("^.*Census_[0-9]{1,2}_[0-9]{1,2}_years.*", v), sub("^.*Census_([0-9]{1,2})_([0-9]{1,2})_years.*$", "\\1-\\2", v), if_else(grepl("years_and_over", v), sub("^.*[^0-9]([0-9]+)_years_and_over.*$", "\\1+", v), if_else(grepl("^.*Age_(?:group(?:_of_parent)?|years)_([0-9]{1,2})_([0-9]{1,2})_years.*", v), sub("^.*Age_(?:group(?:_of_parent)?|years)_([0-9]{1,2})_([0-9]{1,2})_years.*", "\\1-\\2", v), if_else(grepl("_[0-9]{1,2}_[0-9]{1,2}_years$", v), sub("^.*([0-9]{1,2})_([0-9]{1,2})_years$", "\\1-\\2", v), v))))) i0 <- grepl("_([0-9]{2})_([0-9]{2})_years_[A-Z]", v) out[i0] <- sub("^.*_([0-9]{2})_([0-9]{2})_years_[A-Z].*$", "\\1-\\2", v[i0]) if (any(grepl("_", out))) { stop("\n\t", paste0(head(unique(grep("_", out, value = TRUE))), collapse = "\n\t"), " present.") } if (orderedFactor) { out <- factor(out, levels = unique(out), ordered = TRUE) } out } expect_equal(.extract_Age("Age_group_85_years_and_over_2006_Census_Males", FALSE), "85+") expect_equal(.extract_Age("2016_Census_80_84_years_Never_married_Females", FALSE), "80-84") expect_equal(.extract_Age("2006_Census_Buddhism_0_14_years", FALSE), "0-14") expect_equal(.extract_Age("Age_group_0_4_years_2006_Census_Persons", FALSE), "0-4") expect_equal(.extract_Age("2006_Census_Age_group_of_parent_15_19_years_Number_of_children_ever_born_No_children", FALSE), "15-19") expect_equal(.extract_Age("2006_Census_Persons_Postgraduate_Degree_Level_15_19_years", FALSE), "15-19") expect_equal(.extract_Age("2006_Census_Persons_15_19_years_Employed_Hours_worked_not_stated", FALSE), "15-19") local({ input <- c("Age_group_0_4_years_2006_Census_Persons", "Age_group_5_14_years_2016_Census_Persons", "Age_group_35_44_years_2011_Census_Persons", "Age_group_55_64_years_2011_Census_Persons", "Age_group_85_years_and_over_2006_Census_Persons") output <- .extract_Age(input) # i.e. ascending order, not expect_false(is.unsorted(output)) }) extract_Age <- function(.data) { .data[, Age := .extract_Age(variable)] }
extract_CountryOfBirth <- function(DT) { DT %>% # Set not stated -> NA .[variable %npin% "Country_of_birth_not_stated", CountryOfBirth := sub("^(.*)_(2006|2011|2016)_Census_(?:Persons|Males|Females)$", "\\1", variable)] %>% decode_country %>% .[] }
decodeRegion <- function(DT) { out <- DT decoder <- switch(Region, "SA2" = SA16_decoder[ASGS_Structure == Region, .(SA2_MAIN16 = Census_Code_2016, SA2_NAME16 = Census_Name_2016)], "SA3" = SA16_decoder[ASGS_Structure == Region, .(SA3_MAIN16 = Census_Code_2016, SA3_NAME16 = Census_Name_2016)], "SA4" = SA16_decoder[ASGS_Structure == Region, .(SA4_MAIN16 = Census_Code_2016, SA4_NAME16 = Census_Name_2016)], "STE" = SA16_decoder[ASGS_Structure == Region, .(STE_MAIN16 = Census_Code_2016, STE_NAME16 = Census_Name_2016)], "LGA" = { NonABS_decoder[ASGS_Structure == Region, .(Census_Code_2016, LGA_NAME16 = Census_Name_2016)] }, "CED" = { NonABS_decoder[ASGS_Structure == Region, .(Census_Code_2016, CED_NAME16 = Census_Name_2016)] }, "SED" = { NonABS_decoder[ASGS_Structure == Region, .(Census_Code_2016, SED_NAME16 = Census_Name_2016)] }, "SSC" = { NonABS_decoder[ASGS_Structure == Region, .(Census_Code_2016, SSC_NAME16 = Census_Name_2016)] }, NULL) %>% unique %>% setnames(1, Region_key) decoded_out <- decoder[out, on = Region_key] decoded_out[, (Region_key) := NULL] decoded_out }
.decode_country <- function(x, to = "alpha-3") { xna <- is.na(x) x[!xna] <- trimws(x[!xna]) stopifnot(to == "alpha-3", names(ISO3166)[1] == "name") nom <- ISO3166[["name"]] a3 <- ISO3166[["alpha_3"]] keep <- and(nchar(x, keepNA = FALSE) == 3L, x == toupper(x)) res <- a3[pmatch(x, nom, duplicates.ok = TRUE)] res[keep] <- x[keep] if (anyNA(res[!is.na(x)])) { print(x[is.na(res[!is.na(x)])]) stop("Country code could not be decoded.") } res[xna] <- NA_character_ res } decode_country <- function(out) { stopifnot(is.data.table(out), "CountryOfBirth" %in% names(out)) out[, CountryOfBirth := trimws(gsub("_", " ", CountryOfBirth))] out[, CountryOfBirth := gsub("(?<!(F))Ye?a?r.*arrival.ns$", "", CountryOfBirth, ignore.case = TRUE)] out[CountryOfBirth %pin% "(Born )?[Ee]lsewhere", CountryOfBirth := NA_character_] out[CountryOfBirth == "Country of birth not stated", CountryOfBirth := NA_character_] out[CountryOfBirth %pin% "Bosnia", CountryOfBirth := "Bosnia and Herzegovina"] out[CountryOfBirth %pin% "Korea", CountryOfBirth := "KOR"] out[CountryOfBirth %pin% "Macedonia", CountryOfBirth := "Macedonia"] out[CountryOfBirth %pin% c("United Kingdom", "N[a-z]+n Ireland", "England", "Scotland", "Wales"), CountryOfBirth := "United Kingdom"] out[CountryOfBirth %pin% "SE Europe nfd", CountryOfBirth := NA_character_] out[CountryOfBirth %pin% "China excl? SARs (and )?Tai?w", CountryOfBirth := "CHN"] out[CountryOfBirth %pin% "Ho?ng Ko?ng", CountryOfBirth := "Hong Kong"] out[CountryOfBirth == "Vietnam", CountryOfBirth := "VNM"] # Negative lookbehind due to FYROM (Macedonia) out[CountryOfBirth %pin% "FYROM", CountryOfBirth := "MKD"] out[CountryOfBirth %pin% "China excl? SARs Tai?w", CountryOfBirth := "CHN"] out[CountryOfBirth %pin% c("USA", "United States"), CountryOfBirth := "USA"] out[CountryOfBirth %pin% "Ho?ng Ko?ng", CountryOfBirth := "Hong Kong"] out[, CountryOfBirth := .decode_country(CountryOfBirth)] out[] }
Mop <- function(DT, value.name = "persons", suborder = NULL) { # Check if any variables have already been used. # If so, stop; else, add to varIused varIBeingUsed <- .subset2(DT, "varI") stopifnot(is.null(varIused) || !any(varIBeingUsed %in% varIused)) drop_col(DT, "varI") stopifnot(value.name %in% names(DT), "Year" %in% names(DT), # Should be Completed 'MaxSchooling' %notin% names(DT)) # Check for no duplicates, except for value.name if (anyDuplicated(DT, by = setdiff(names(DT), c(value.name, "variable")))) { print(duplicated_rows(DT, by = setdiff(names(DT), c(value.name, "variable")))) stop("Duplicated rows.") } # Are any columns constant? uv <- vapply(DT, uniqueN, integer(1)) if (any(uv == 1)) { print(names(uv)[uv == 1]) stop("Constant columns in DT.") } out <- DT %>% drop_col("variable") %>% setcolorder(sort(names(.))) %>% set_cols_first(c(Region_key, "Year")) %>% setorderv(setdiff(names(.), value.name)) %>% set_cols_last(value.name) %>% .[] if ("CountryOfBirth" %chin% names(out)) { if (!all(coalesce(nchar(out[["CountryOfBirth"]]), 3L) == 3L)) { stop("CountryOfBirth must be ISO-3166-2") } } if ("Dwelling" %chin% names(out)) { out[, PrivateDwelling := Dwelling == "Private"] out[, Dwelling := NULL] } if ("HasChildren" %chin% names(out)) { setnames(out, "HasChildren", "HasChild") } if ("HasChildrenUnder15" %chin% names(out)) { setnames(out, "HasChildrenUnder15", "HasChildUnder15") } if ("YearOfArrival" %chin% names(out)) { setnames(out, "YearOfArrival", "YearOfArrival.max") } if ("SchoolSector" %chin% names(out)) { out[SchoolSector == "Non_Govt", SchoolSector := "Non-government"] } if ("Ancestry" %chin% names(out)) { out[Ancestry == "Macdonian", Ancestry := "Macedonian"] if ("FatherBornAus" %chin% names(out) && "MotherBornAus" %chin% names(out)) { out[Ancestry == "Australian Birthplace not stated", FatherBornAus := NA] out[Ancestry == "Australian Birthplace not stated", MotherBornAus := NA] out[Ancestry == "Australian Birthplace not stated", Ancestry := "Australian"] out[Ancestry == "Australian Both parents born Australian", MotherBornAus := TRUE] out[Ancestry == "Australian Both parents born Australian", MotherBornAus := TRUE] out[Ancestry == "Australian Both parents born Australian", Ancestry := "Australian"] } } if ("MaxSchoolingCompleted" %chin% names(out)) { out[, "MaxSchoolingCompleted" := factor(MaxSchoolingCompleted, levels = c("Did not go to school", "Year 8 or below", "Year 8", "Year 9", "Year 10", "Year 11", "Year 12"), ordered = TRUE)] } if ("FamilyComposition" %chin% names(out)) { if (!OR("CoupleFamily" %in% names(out), uniqueN(out[["FamilyComposition"]]) > 3L)) { out[!grepl("Other", FamilyComposition), CoupleFamily := FamilyComposition %pin% "Couple"] # switch to the position of FamilyComposition set_colsuborder(out, c("CoupleFamily", "FamilyComposition")) out[, FamilyComposition := NULL] } } if ("IncomeTotPersonal.min" %chin% names(out)) { out[, IncomeTotPersonal.min := NULL] return(NULL) } if ("DwellingSubtype" %chin% names(out)) { out[DwellingSubtype == "4 or more storey block", DwellingSubtype := "4 storey block or more"] } if ("Aboriginal" %chin% names(out) && "TorresStraitIslander" %chin% names(out)) { c("Aboriginal", "Both Aboriginal & Torres Strait Islander", "Non-indigenous", "Torres Strait Islander") out[nor(Aboriginal, TorresStraitIslander), IndigenousStatus := "Non-indigenous"] out[`&`(Aboriginal, TorresStraitIslander), IndigenousStatus := "Both Aboriginal & Torres Strait Islander"] out[`&`(!Aboriginal, TorresStraitIslander), IndigenousStatus := "Torres Strait Islander"] out[`&`(Aboriginal, !TorresStraitIslander), IndigenousStatus := "Aboriginal"] out[, c("Aboriginal", "TorresStraitIslander") := NULL] } if ("Age.int" %chin% names(out)) { setnames(out, "Age.int", "Age.min") } for (j in names(DT)) { out[out[[j]] == "Other", (j) := "(Other)"] } apparent_population <- out[, list(ApparentPopulation = sum(eval(parse(text = value.name)))), keyby = "Year"] permitted_error <- switch(Region, "SA1" = 1.5e6, 0.5e6) expected_population <- fread("data-raw/data/AUST/2016Census_T01_AUS.csv", select = c("Tot_persons_C06_P", "Tot_persons_C11_P", "Tot_persons_C16_P")) %>% melt(measure.vars = names(.), value.name = "ExpectedPopulation") %>% .[, "Year" := 2000L + as.integer(sub("^Tot_persons_C(06|11|16)_P$", "\\1", variable))] %>% setkeyv("Year") %>% .[] pop_diff_by_Year <- apparent_population[expected_population, on = "Year"] %>% .[, difference := ApparentPopulation - ExpectedPopulation] differences <- .subset2(pop_diff_by_Year, "difference") switch( value.name, "persons" = { if (AND(!any(c("SpeaksEnglishOnly", "BornAust", "MaritalStatus", "MaxSchoolingCompleted", "OnlyEnglishSpokenHome") %in% names(DT)), any(abs(differences) > permitted_error))) { i_first_bad <- which(abs(differences) > permitted_error)[1] first_bad <- pop_diff_by_Year[i_first_bad] %>% .subset2("difference") Yr_first_bad <- switch(i_first_bad, 2006, 2011, 2016) text_apparent_population <- apparent_population[i_first_bad] %>% .subset2("ApparentPopulation") %>% prettyNum(big.mark = ",") if (first_bad < 0) { stop("Population undercounted in ", Yr_first_bad, ":\n\t", text_apparent_population, "\n\t", " (e = ", prettyNum(first_bad, big.mark = ","), ")", "\nShould you have used a different value.name?") } else { stop("Apparent population too high: ", text_apparent_population, " (e = ", prettyNum(first_bad, big.mark = ","), ")") } } }, "adults" = { adult_population <- lapply(dir(path = "data-raw/data/AUST/", pattern = "T29", full.names = TRUE), fread) %>% rbindlist(use.names = TRUE, fill = TRUE) %>% short2long %>% .[, .SD, .SDcols = grep("^Persons_aged_15_years.*Persons$", names(.))] %>% melt.data.table(measure.vars = names(.), value.name = "adults") %>% .[, Year := as.integer(sub("^.*(2006|2011|2016).*$", "\\1", variable))] comparison <- adult_population[apparent_population, on = "Year"] %>% .[, err := abs(ApparentPopulation - adults) / adults] if (any(comparison[["err"]] > 0.0004 * nrow(out))) { print(comparison[, .(apparent = prettyNum(ApparentPopulation, big.mark = ","), expected = prettyNum(adults, big.mark = ","))]) stop("Adults under or overcounted.") } }, "occupied_private_dwellings" = { # freadG(15) %$% sum(C16_T_T + C06_T_T + C11_T_T) exp_tot_dwellings <- 23190507 tot_dwellings <- sum(out[["occupied_private_dwellings"]]) err <- abs(tot_dwellings - exp_tot_dwellings) / exp_tot_dwellings if (err > 0.0004 * nrow(out)) { stop("Dwellings under or overcounted:\n", "\t", "Expected:\t", prettyNum(exp_tot_dwellings, big.mark = ","), "\n\t", "Result: \t", prettyNum(tot_dwellings, big.mark = ","), "\n") } }, "occupied_private_dwellings_being_rented" = { exp_tot_dwellings_2016 <- 2606630 tot_dwellings_2016 <- sum(out[Year == 2016L][["occupied_private_dwellings_being_rented"]]) err <- abs(tot_dwellings_2016 - exp_tot_dwellings_2016) / exp_tot_dwellings_2016 if (err > 0.0004 * nrow(out)) { stop("Dwellings under or overcounted:\n", "\t", "Expected:\t", prettyNum(exp_tot_dwellings, big.mark = ","), "\n\t", "Result: \t", prettyNum(tot_dwellings, big.mark = ","), "\n") } }) # Check for no duplicates, except for value.name if (anyDuplicated(out, by = setdiff(names(out), value.name))) { print(duplicated_rows(out, by = setdiff(names(out), value.name))) stop("2. Duplicated rows.") } if (!is.null(suborder)) { set_colsuborder(out, suborder) } set_cols_last(out, value.name) setorderv(out, setdiff(names(out), value.name)) out_noms <- names(out) object_name <- paste0(Region, "__", paste0(setdiff(out_noms, c(Region_key, value.name)), collapse = "_")) if (object_name %in% ls(envir = .GlobalEnv)) { object_name <- paste0(Region, "__", paste0(setdiff(out_noms, c(Region_key, "persons")), collapse = "_")) if (object_name %in% ls(envir = .GlobalEnv)) { stop("`", object_name, "` already defined.") } } decoded_out <- decodeRegion(out) if (!anyDuplicated(decoded_out, by = setdiff(names(decoded_out), value.name))) { out <- decoded_out } else { cat("\n\n\n\n\n") print(duplicated_rows(decoded_out, by = setdiff(names(decoded_out), value.name))) cat("\n\n\n\n\n") } assign(object_name, out, envir = .GlobalEnv) # Provided we have reached here, we can now modify varIused varIused <<- sort(c(varIused, unique(varIBeingUsed))) decoded_out[] }
iso3166_url <- "https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv" ISO3166 <- if (file.exists("data-raw/ISO-3166-countries.csv")) { fread("data-raw/ISO-3166-countries.csv") } else { fread(iso3166_url) %T>% fwrite("data-raw/ISO-3166-countries.csv") %>% .[] } setnames(ISO3166, "alpha-3", "alpha_3")
expect_equal(.decode_country(c("United Kingdom", "Australia", "Australia", NA)), c("GBR", "AUS", "AUS", NA))
varIused <- NULL
.region_persons <- freadT(1, c(1, 9), "persons") %>% .[variable %pin% "Persons$"] %>% Mop
freadT(1, c(1, 9), "persons") %>% .[variable %npin% "Persons$"] %>% extract_Sex %>% Mop
.region_by_age <- freadT(1, c(10, 108), "persons") %>% .[variable %pin% "Persons$"] %>% extract_Age %>% Mop
.region_by_age %>% .[Age >= "15-19"] %>% .[, .(adults = sum(persons)), keyby = c(names(.)[1], "Year")] %>% assign(x = paste0(Region, "__Year_adults"), envir = .GlobalEnv)
.region_by_age_sex <- freadT(1, c(10, 108), "persons") %>% .[variable %npin% "Persons$"] %>% extract_Sex %>% extract_Age %>% Mop
.region_by_age %>% .[Age >= "15-19"] %>% .[, varI := 0L] %>% setnames("persons", "adults") %>% .[, varI := NULL] %>% set_cols_last("adults") %>% assign(x = paste0(Region, "__Year_Age_adults"), envir = .GlobalEnv)
.region_by_age_sex %>% .[Age >= "15-19"] %>% .[, .(adults = sum(persons)), keyby = c(names(.)[1], "Year", "Sex")] %>% assign(x = paste0(Region, "__Year_Sex_adults"), envir = .GlobalEnv)
freadT(1, c(109, 117), "overseas_visitors") %>% .[variable %pin% "Persons$"] %>% Mop(value.name = "overseas_visitors")
freadT(1, c(109, 117), "overseas_visitors") %>% .[variable %npin% "Persons$"] %>% extract_Sex %>% Mop(value.name = "overseas_visitors")
provide_complementary_observation <- function(.data, vars = "Year", value.name = "persons", compl.var.name = "Other") { if (Region != "LGA") { meta_decoder <- SA16_decoder[ASGS_Structure == Region] } else { meta_decoder <- LGA_2016@data %>% as.data.table %>% .[, .(Census_Name_2016 = as.character(LGA_NAME16), Census_Code_2016 = paste0("LGA", (as.character(LGA_CODE16))))] } tempDT <- get(paste0(Region, "__", paste0(c(vars, if (value.name != "persons") value.name), collapse = "_"))) %>% .[meta_decoder, on = c(paste0(names(.)[1], "==Census_Name_2016")), nomatch=0L] %>% setnames("Census_Code_2016", Region_key) %>% setnames(value.name, "persons") .data %>% copy %>% setnames(value.name, "persons") %>% .[, .(tot_in_persons = sum(persons)), keyby = c(Region_key, vars)] %>% .[tempDT, on = c(Region_key, vars)] %>% .[, persons := persons - tot_in_persons] %>% setnames("persons", value.name) %>% .[, "variable" := compl.var.name] %>% .[, .SD, .SDcols = c(Region_key, vars, "variable", value.name)] %>% rbind(.data, use.names = TRUE, fill = TRUE) } freadT(1, c(118, 144)) %>% .[variable %pin% "Persons$"] %>% provide_complementary_observation %>% .[, Aboriginal := FALSE] %>% .[, TorresStraitIslander := FALSE] %>% .[variable %pin% "persons_Aboriginal", Aboriginal := TRUE] %>% .[variable %pin% "persons_Torres_Strait", TorresStraitIslander := TRUE] %>% .[variable %pin% "Both", c("Aboriginal", "TorresStraitIslander") := TRUE] %>% Mop
freadT(1, c(118, 144)) %>% .[variable %npin% "Persons$"] %>% extract_Sex %>% provide_complementary_observation(vars = c("Year", "Sex")) %>% .[, Aboriginal := FALSE] %>% .[, TorresStraitIslander := FALSE] %>% .[variable %pin% "persons_Aboriginal", Aboriginal := TRUE] %>% .[variable %pin% "persons_Torres_Strait", TorresStraitIslander := TRUE] %>% .[variable %pin% "Both", c("Aboriginal", "TorresStraitIslander") := TRUE] %>% Mop
varIused <- c(varIused, 145:153)
freadT(1, c(154, 171)) %>% .[variable %pin% "Persons$"] %>% .[, BornAust := grepl("Birthplace_Australia", variable)] %>% Mop
freadT(1, c(154, 171)) %>% .[variable %npin% "Persons$"] %>% .[, BornAust := grepl("Birthplace_Australia", variable)] %>% extract_Sex %>% Mop
freadT(1, c(172, 189)) %>% .[variable %pin% "Persons$"] %>% .[, SpeaksEnglishOnly := grepl("English_only", variable)] %>% Mop
freadT(1, c(172, 189)) %>% .[variable %npin% "Persons$"] %>% .[, SpeaksEnglishOnly := grepl("English_only", variable)] %>% extract_Sex %>% Mop
freadT(1, c(190, 199)) %>% .[variable %pin% "Persons$"] %>% provide_complementary_observation %>% .[, AustCitizen := variable != "Other"] %>% Mop
freadT(1, c(190, 199)) %>% .[variable %npin% "Persons$"] %>% extract_Sex %>% provide_complementary_observation %>% .[, AustCitizen := variable != "Other"] %>% Mop
melt_using_suffix <- function(DT, suffix, suffix_is = "Year", id.cols = Region_key) { prefixes <- sub(suffix, "", setdiff(names(DT), Region_key)) suffixes <- sub(paste0("^.*", suffix), "\\1", setdiff(names(DT), Region_key)) decode_suffix <- data.table(temp = seq_along(unique(suffixes)), new = unique(suffixes)) %>% .[, temp := as.character(temp)] %>% setnames(c("temp", "new"), c("_temp", suffix_is)) melt.data.table(DT, id.vars = id.cols, measure.vars = patterns(unique(prefixes)), value.name = unique(prefixes), variable.name = "_temp", variable.factor = FALSE, verbose = TRUE) %>% .[decode_suffix, on = "_temp"] %>% .[, "_temp" := NULL] %>% set_cols_first(c(Region_key, suffix_is)) %>% .[] }
Mop_multiple <- function(DT) { DT[, Year := as.integer(Year)] DTnoms <- copy(names(DT)) for (nom in DTnoms) { if (!(nom %chin% c(Region_key, "Year"))) { nom_ <- gsub("^([A-Z])", "\\L\\1\\E", x = gsub("_([A-za-z])", "\\U\\1\\E", nom, perl = TRUE), perl = TRUE) setnames(DT, nom, nom_) object_name <- paste0(Region, "__", "Year", "_", nom_) assign(object_name, value = decodeRegion(DT[, .SD, .SDcols = c(Region_key, "Year", nom_)]), envir = .GlobalEnv) } } }
freadG(2)[] %>% melt_using_suffix("_Census_year_(2006|2011|2016)") %>% Mop_multiple
freadT(3, vi = c(223, 1104))[] %>% .[variable %pin% "years_[0-9]{1,2}_(2006|2011|2016)"] %>% .[variable %pin% "Persons$"] %>% .[, Age.int := as.integer(sub("^.*years_([0-9]{1,2})_(2006|2011|2016).*$", "\\1", variable))] %>% provide_complementary_observation %>% Mop
freadT(3, vi = c(223, 1104))[] %>% .[variable %pin% "years_[0-9]{1,2}_(2006|2011|2016)"] %>% .[variable %npin% "Persons$"] %>% .[, Age.int := as.integer(sub("^.*years_([0-9]{1,2})_(2006|2011|2016).*$", "\\1", variable))] %>% extract_Sex %>% provide_complementary_observation %>% Mop
# Already used varIused <- unique(sort(c(varIused, seq(223, 1104))))
freadT(4, c(1114, 1737)) %>% .[variable %npin% "Persons$"] %>% .[variable %npin% "Total"] %>% .[or(grepl("[0-9]{1,2}_[0-9]{1,2}_years", variable), grepl("85_years", variable))] %>% .[, MaritalStatus := sub("^.*years(?:_and_over)?_([A-Z].+)_(Males|Females)$", "\\1", variable)] %>% .[MaritalStatus == "Never_married", MaritalStatus := "Never married"] %>% extract_Age %>% setnames("Age", "Age5yr") %>% extract_Sex %>% Mop
freadT(7, vi = c(2710, 3141), value.name = "females") %>% .[!grepl("Total", variable)] %>% extract_Age %>% .[variable %pin% "No_child", ChildrenEverBorn := 0L] %>% .[variable %pin% "One_child", ChildrenEverBorn := 1L] %>% .[variable %pin% "Two_child", ChildrenEverBorn := 2L] %>% .[variable %pin% "Three_child", ChildrenEverBorn := 3L] %>% .[variable %pin% "Four_child", ChildrenEverBorn := 4L] %>% .[variable %pin% "Five_child", ChildrenEverBorn := 5L] %>% .[variable %pin% "Six_or_more_child", ChildrenEverBorn := 6L] %>% setnames("Age", "Age5yr") %>% Mop(value.name = "females")
freadT(8, c(3142, 3483)) %>% .[variable %pin% "Persons$"] %>% .[variable %npin% "Total"] %>% .[, variable := sub("_20(06|11|16)_Census_", "_", variable)] %>% .[, variable := sub("_Persons", "", variable)] %>% setnames("variable", "CountryOfBirth") %>% decode_country %>% .[, .(persons = sum(persons), varI = first(varI)), keyby = c(Region_key, "Year", "CountryOfBirth")] %>% Mop
freadT(8, c(3142, 3483)) %>% .[variable %npin% "Persons$"] %>% .[variable %npin% "Total"] %>% .[, variable := sub("_20(06|11|16)_Census_", "_", variable)] %>% extract_Sex %>% .[, variable := sub("_(Males|Females)$", "", variable)] %>% setnames("variable", "CountryOfBirth") %>% decode_country %>% .[, .(persons = sum(persons), varI = first(varI)), keyby = c(Region_key, "Year", "CountryOfBirth", "Sex")] %>% Mop
freadT(10) %>% .[!grepl("Total", variable)] %>% .[variable %pin% "Persons$"] %>% .[grepl("Speaks_English_only", variable), LanguageSpoken := "English"] %>% .[variable %pin% "^Speaks_other_language_(.*)_20(06|11|16)_Census_.*$", LanguageSpoken := sub("^Speaks_other_language_(.*)_20(06|11|16)_Census_.*$", "\\1", variable)] %>% Mop
freadT(10) %>% .[!grepl("Total", variable)] %>% .[variable %npin% "Persons$"] %>% .[grepl("Speaks_English_only", variable), LanguageSpoken := "English"] %>% .[variable %pin% "^Speaks_other_language_(.*)_20(06|11|16)_Census_.*$", LanguageSpoken := sub("^Speaks_other_language_(.*)_20(06|11|16)_Census_.*$", "\\1", variable)] %>% extract_Sex %>% Mop
freadT(11) %>% .[!grepl("Total", variable)] %>% .[grepl("English_Very_well_or_well", variable), EnglishProficiency := "Speaks English well or very well"] %>% .[grepl("English_Not_well", variable), EnglishProficiency := "Speaks English not well or not at all"] %>% extract_Age %>% .[]
freadT(12) %>% .[!grepl("Total", variable)] %>% extract_Age %>% .[, Religion := gsub("^.*Census_([^0-9]+)_[0-9].*$", "\\1", variable)] %>% .[Religion == "Secular_Beliefs_and_Other_Spiritual_Beliefs_and_No_Religious_Affiliation", Religion := "No religion"] %>% .[Religion == "Religious_affiliation_not_stated", Religion := NA_character_] %>% .[Religion %pin% "Christ", Denomination := gsub("^Christ[a-z]+y_", "", Religion)] %>% .[Religion %pin% "Christ", Religion := "Christianity"] %>% .[Religion %enotin% c("Christianity", "Buddhism", "Hinduism", "Judaism", "Islam", "No religion"), Religion := "(Other)"] %>% .[] %>% .[variable %pin% "Presbyterian", Denomination := "Presbyterian Reformed"] %>% .[variable %pin% "Eastrn.Orthdox", Denomination := "Eastern Orthodox"] %>% .[variable %pin% "Orintal_Orthdx", Denomination := "Oriental Orthodox"] %>% .[variable %pin% "Sevnth.dy", Denomination := "Seventh Day Adventist"] %>% .[variable %pin% "Othr_Protestnt", Denomination := "Protestant (Other)"] %>% .[variable %pin% "Jeh.*v.*ahs.*Witn.*s.*es", Denomination := "Jehovah's Witnesses"] %>% .[variable %pin% "Oth.?r_Christian", Denomination := "(Other)"] %>% .[variable %pin% "Sikhism", Denomination := "Sikhism"] %>% .[variable %pin% "Aust.*Abor.*Trad.*Rel", Denomination := "Australian Aboriginal Traditional"] %>% .[variable %pin% "Other_Religions_Other_Religious_Groups", Denomination := "(Other)"] %>% .[variable %pin% "SB_OSB_NRA_SB", Denomination := "Secular beliefs"] %>% .[variable %pin% "SB_OSB_NRA_OSB", Denomination := "Other spiritual beliefs"] %>% .[, Religion := gsub("_", " ", Religion)] %>% .[, Denomination := gsub("_", " ", Denomination)] %>% .[Denomination == "Christian nfd", Denomination := NA_character_] %>% .[Denomination == "Churches of Christ", Denomination := "Churches Of Christ"] %>% .[Denomination == "Other Protestant", Denomination := "Protestant (Other)"] %>% .[Denomination == "Seventh day Adventist", Denomination := "Seventh Day Adventist"] %>% Mop
freadT(13, value.name = "students") %>% .[variable %pin% "Persons$"] %>% .[!grepl("^Total", variable)] %>% .[if_else(variable %pin% c("^(?:Technical|University)", "^Other_type", "^Infants_Primary", "^Secondary"), grepl("Total", variable), TRUE)] %>% .[variable %pin% "Pre.school", EduInstitutionType := "Pre-school"] %>% .[variable %pin% "Infants.Primary",EduInstitutionType := "Infants/Primary"] %>% .[variable %pin% "^Secondary", EduInstitutionType := "Secondary"] %>% .[variable %pin% "^Tec.*Furt", EduInstitutionType := "Technical or Further Educational Institution"] %>% .[variable %pin% "^Uni", EduInstitutionType := "University or other tertiary"] %>% .[variable %pin% "^Oth", EduInstitutionType := NA_character_] %>% .[, EduInstitutionType := factor(EduInstitutionType, levels = c(NA, "Infants/Primary", "Pre-school", "Secondary", "Technical or Further Educational Institution", "University or other tertiary"), ordered = TRUE)] %>% .[, .(students = sum(students)), keyby = c(Region_key, "Year", "EduInstitutionType")] %>% Mop("students")
freadT(13, value.name = "students") %>% .[!(variable %pin% "Persons$")] %>% .[!grepl("^Total", variable)] %>% .[if_else(variable %pin% c("^(?:Technical|University)", "^Other_type", "^Infants_Primary", "^Secondary"), grepl("Total", variable), TRUE)] %>% .[variable %pin% "Pre.school", EduInstitutionType := "Pre-school"] %>% .[variable %pin% "Infants.Primary",EduInstitutionType := "Infants/Primary"] %>% .[variable %pin% "^Secondary", EduInstitutionType := "Secondary"] %>% .[variable %pin% "^Tec.*Furt", EduInstitutionType := "Technical or Further Educational Institution"] %>% .[variable %pin% "^Uni", EduInstitutionType := "University or other tertiary"] %>% .[variable %pin% "^Oth", EduInstitutionType := NA_character_] %>% .[, EduInstitutionType := factor(EduInstitutionType, levels = c(NA, "Infants/Primary", "Pre-school", "Secondary", "Technical or Further Educational Institution", "University or other tertiary"), ordered = TRUE)] %>% extract_Sex %>% .[, .(students = sum(students)), keyby = c(Region_key, "Year", "EduInstitutionType", "Sex")] %>% Mop("students")
.extract_FamilyComposition <- function(v) { outputs <- c("Couple family with no children", "Couple family with children", "One parent family with children", "Other family", "Lone-person household", "Group household") underscores <- gsub("[^A-Za-z]", "_", gsub("One parent family with children", "One parent family", outputs, fixed = TRUE)) regex <- paste0("(?:", paste0(underscores, collapse = ")|(?:"), ")") underscore_out <- sub(paste0("^.*?(", regex, ").*?$"), "\\1", v, perl = TRUE) outputs[match(underscore_out, underscores)] } extract_FamilyComposition <- function(.data) { outputs <- c("Couple family with no children", "Couple family with children", "One parent family with children", "Other family", "Lone-person household", "Group household") underscores <- gsub("[^A-Za-z]", "_", outputs) req_regex <- paste0("(?:", paste0(underscores, collapse = ")|(?:"), ")") stopifnot("variable" %in% names(.data), all(grepl(req_regex, .data[["variable"]]))) .data[, FamilyComposition := .extract_FamilyComposition(variable)] .data } expect_equal(.extract_FamilyComposition("2006_Census_Separate_house_Family_households_Couple_family_with_no_children"), "Couple family with no children")
freadT(14, value.name = "occupied_private_dwellings") %>% .[!(variable %pin% "Total$")] %$% .[, isFamily := variable %pin% "Family_households"] %>% .[(isFamily), FamilyComposition := gsub("_", " ", gsub("^.*Family_households_", "", variable))] %>% .[not(isFamily), FamilyComposition := sub("^.*((?:Lone_person)|(?:Group)|(?:Other)).*$", "\\1", variable)] %>% .[, FamilyComposition := gsub("_", " ", FamilyComposition)] %>% .[FamilyComposition == "Other", FamilyComposition := "Other family"] %>% .[FamilyComposition == "Group", FamilyComposition := "Group household"] %>% .[FamilyComposition == "Lone person", FamilyComposition := "Lone-person household"] %>% .[]
#' @param x Values to be matched against #' @param y Permitted decoded values. #' @param extract1 A regex with a capturing group. The part of \code{x} to be retained. #' @param y.complete Must all values in y match with at least one value in \code{x}? #' @return For every value in x, the closest match in y. zmatch <- function(x, y, extract1 = "^(.*)$", delete.penalty = 0.01, sub.penalty = 0.2, y.complete = TRUE) { x <- sub(extract1, "\\1", x, perl = TRUE) Y <- gsub("[^A-Za-z]+", "_", y) distance_matrix <- adist(x, Y, costs = list(deletions = delete.penalty, substitutions = sub.penalty)) indexes <- apply(distance_matrix, 1, which.min) out <- y[indexes] if (y.complete && any(y[!is.na(y)] %notin% out)) { stop("Not all y were matched.\n\t", paste0(unique(y[y %notin% out]), sep = "\n\t")) } out }
permitted_DwellingStructures <- c(NA_character_, "(Other)", "Flat/apartment", "Semi-detached, row/terrace house, townhouse etc.", "Separate house")
test_that("zmatch", { x0 <- 'Semi_detached_row_or_terrace_house_townhouse_etc_with_Two_or_more_storeys_' expect_equal(zmatch(x0, permitted_DwellingStructures, y.complete = FALSE), permitted_DwellingStructures[4]) })
freadT(15, value.name = "occupied_private_dwellings") %>% .[variable %pin% paste0("^(20(06|11|16))_Census_", c("Separate_house", "Semi_detached_row_or_terrace_house_townhouse_etc_with_Total", "Flat_unit_or_apartment_Total", "Other_dwelling_Total", "Dwelling_structure_not_stated"), "_Total$")] %>% .[, DwellingStructure := zmatch(variable, permitted_DwellingStructures, extract1 = "^(?:20(?:06|11|16))_Census_(.*)(?:_Total)?_Total$")] %>% # .[, DwellingStructure := sub("^(?:20(?:06|11|16))_Census_(.*)(?:_Total)?_Total$", "\\1", variable)] %>% .[variable %pin% "Dwelling_structure_not_stated", DwellingStructure := NA_character_] %>% .[] %>% Mop("occupied_private_dwellings")
freadT(15, value.name = "occupied_private_dwellings") %>% .[variable %pin% paste0("^(20(06|11|16))_Census_", c("Separate_house", "Semi_detached_row_or_terrace_house_townhouse_etc_with_Total", "Flat_unit_or_apartment_Total", "Other_dwelling_Total", "Dwelling_structure_not_stated"), "_Number_of_persons_usually_resident")] %>% .[, DwellingStructure := zmatch(variable, permitted_DwellingStructures, extract1 = "^(?:20(?:06|11|16))_Census_(.*)(?:_Total)?_Total$")] %>% .[variable %pin% "Dwelling_structure_not_stated", DwellingStructure := NA_character_] %>% .[] %>% .[, UsualResidents.min := zmatch(variable, c("One", "Two", "Three", "Four", "Five", "Six"), extract1 = "^.*persons_usually_resident_(.*)$")] %>% .[, UsualResidents.min := as.integer(match(UsualResidents.min, table = c("One", "Two", "Three", "Four", "Five", "Six")))] %>% Mop("occupied_private_dwellings")
freadT(18, value.name = "occupied_private_dwellings") %>% .[!grepl("^Total", variable)] %>% .[variable %pin% "^(?:20(?:06|11|16))_Census_(?!Total).*Dwelling_structure_"] %>% .[, DwellingStructure := zmatch(variable, permitted_DwellingStructures, extract1 = "^(?:20(?:06|11|16))_Census_.*Dwelling_structure_(.*)(?:_Total)?(?:_Total)?$", delete.penalty = 0.05)] %>% .[grepl("Dwelling_structure_Not_stated", variable, ignore.case = TRUE), DwellingStructure := NA_character_] %>% .[!grepl("20.._Census_Total_Dwelling_structure_Separate_house", variable)] %>% .[] %>% .[, TenureType := sub("^(?:20(?:06|11|16))_Census_(.*)_Dwelling_structure_.*$", "\\1", variable)] %>% .[TenureType != "Rented_Total"] %>% .[TenureType == "Tenure_type_not_stated", TenureType := NA_character_] %>% .[, TenureType := gsub("_", " ", TenureType)] %>% .[] %>% Mop("occupied_private_dwellings")
permitted_Landlords <- c(NA, "(Other)", "Co-op/church group etc", "Person not in same household", "Real estate agent", "State or territory housing authority")
freadT(19, value.name = "occupied_private_dwellings_being_rented") %>% .[!(variable %pin% "_Total$")] %>% .[!grepl("^20.._Census_Total", variable)] %>% .[, variable := gsub("Housing_co_operative_community_church_group", "Co-op/church group etc", variable)] %>% .[, variable := gsub("State_territory", "State or territory", variable)] %>% .[!grepl("Landlord_type_Not_stated", variable), Landlord := zmatch(variable, permitted_Landlords, extract1 = "^.*Landlord_type_(.*)$")] %>% .[variable %pin% "Housing.co.operative", Landlord := "Co-op/church group etc"] %>% .[variable %pin% "^20.._Census_([0-9]+)_.*$", Rent.min := 52.5 * as.integer(sub("^20.._Census_([0-9]+)_.*$", "\\1", variable))] %>% .[, Rent.min := as.integer(Rent.min)] %>% .[] %>% Mop("occupied_private_dwellings_being_rented")
freadT(20, value.name = "occupied_private_dwellings_being_rented") %>% rbind(freadT(21, value.name = "occupied_private_dwellings_being_rented")) %>% .[!(variable %pin% "_Total$")] %>% .[!(variable %pin% "20.._Census_Total")] %>% .[variable %pin% "^20.._Census_([0-9]+)_.*$", Rent.min := 52.5 * as.integer(sub("^20.._Census_([0-9]+)_.*$", "\\1", variable))] %>% .[, Rent.min := as.integer(Rent.min)] %>% .[variable %pin% "Couple_family", FamilyComposition := "Couple family"] %>% .[variable %pin% "One.parent", FamilyComposition := "One-parent family"] %>% .[, HasDependantStudent := not(variable %pin% "no_dependent_students$")] %>% .[, HasChild := not(variable %pin% "With_no_children(_under_15)?")] %>% .[!grepl("With_no_children$", variable), HasNonDependentChild := variable %pin% "With_non_dependent_children_only"] %>% .[] %>% Mop("occupied_private_dwellings_being_rented")
freadT(22, value.name = "families_with_children") %>% rbind(freadT(23, value.name = "families_with_children")) %>% .[!grepl("Total", variable)] %>% .[, CoupleFamily := grepl("Couple_family", variable)] %>% .[, Children := match(zmatch(variable, y = c("One", "Two", "Three", "Four"), extract1 = "^.*family_with_(.*)_child(ren)?$"), table = c("One", "Two", "Three", "Four", "Five"))] %>% .[!grepl("Partial_income_stated|All_incomes_not_stated|Negative", variable), TotalFamilyIncome.min := 52L * as.integer(sub("^.*Census_([0-9]+)_([0-9]+|or_more)_(Couple|One_parent).*$", "\\1", variable))] %>% .[grepl("Negative", variable), TotalFamilyIncome.min := -1L] %>% .[, list(families_with_children = sum(families_with_children), # hack to allow Mop() to proceed varI = first(varI)), keyby = c(Region_key, "Year", "CoupleFamily", "Children", "TotalFamilyIncome.min")] %>% Mop("families_with_children")
freadT(24, value.name = "occupied_private_dwellings_being_rented") %>% .[!grepl("Total", variable)] %>% # Mistake in data pack 150-224 should be 200-224 .[, variable := sub("Rent_150_224", "Rent_200_224", variable)] %>% # If any income not stated, just set it as NA: .[, variable := sub("Partial_income_stated", "All_incomes_not_stated", variable)] %>% .[, .(occupied_private_dwellings_being_rented = sum(occupied_private_dwellings_being_rented), varI = first(varI)), keyby = c(Region_key, "Year", "variable")] %>% .[variable %pin% "20.._Census_[0-9]", TotalHouseholdIncome.min := 52L * as.integer(sub("^.*Census_([0-9]+)_.*$", "\\1", variable))] %>% .[grepl("Negative", variable), TotalHouseholdIncome.min := -1L] %>% .[variable %pin% "^20.._Census_.*_Rent_[0-9]", Rent.min := 52.5 * as.integer(sub("^20.._Census_.*_Rent_([0-9]+).*$", "\\1", variable))] %>% .[, Rent.min := as.integer(Rent.min)] %>% Mop("occupied_private_dwellings_being_rented")
# Not usable freadT(25, value.name = "families") %>% .[!grepl("Total", variable)] %>% .[, FamilyComposition := zmatch(variable, c("Couple family", "One-parent family"))] %>% .[] %>% .[, HasChildren := !grepl("family_with_no_children_(?!under_15)", variable)] %>% .[, HasChildrenUnder15 := HasChildren & !grepl("no_children_under_15|non_dependent", variable)] %>% .[, HasDependentStudents := grepl("((?<!(?:no))_dependent_students)", variable)] %>% .[variable %pin% "Mortgage_repayment_[0-9]", MortgageRepayment.min := as.integer(sub("^.*Mortgage_repayment_([0-9]+).*$", "\\1", variable))] %>% .[] %>% Mop("families")
# Selected (percent and persons in same table) fread29 <- suppressWarnings(freadT(29)) adult_population <- fread29 %>% .[variable %pin% "^Persons_aged_15_years_and_over.*Persons$"] %>% .[, .(tot_adults = persons), keyby = c(Region_key, "Year")]
# Labour_force_status fread29 %>% .[endsWith(variable, "Persons")] %>% .[variable %pin% c("^Labour_force_status", "^Not_in_the_labour_force")] %>% setnames("persons", "adults") %>% .[!grepl("Labour_force_status_Total", variable)] %>% .[variable %pin% "^Labour_force_status", LabourForceStatus := zmatch(variable, c("Employed (full-time)", "Employed (part-time)", "Employed (away from work)", "Unemployed (looking for work)"), extract1 = "^Labour_force_status_(.*)_20.*$")] %>% .[variable %pin% "^Not_in_the_labour_force", LabourForceStatus := "Not in the labour force"] %>% # Need to account for 'not applicable' to ensure totals # sum to adult population .[adult_population, on = c(Region_key, "Year")] %>% .[, residual_adults := tot_adults - sum(adults), keyby = c(Region_key, "Year")] %>% .[, .SD, .SDcols = c(Region_key, "varI", "Year", "LabourForceStatus", "adults", "residual_adults")] %>% melt.data.table(measure.vars = grep("adults", names(.), value = TRUE), value.name = "adults") %>% .[variable == "residual_adults", LabourForceStatus := NA_character_] %>% unique(by = c(Region_key, "Year", "LabourForceStatus")) %>% Mop("adults")
adult_population <- fread29 %>% .[variable %pin% "^Persons_aged_15_years_and_over.*(Males|Females)$"] %>% extract_Sex %>% .[, .(tot_adults = persons), keyby = c(Region_key, "Sex", "Year")] fread29 %>% .[!endsWith(variable, "Persons")] %>% extract_Sex %>% .[variable %pin% c("^Labour_force_status", "^Not_in_the_labour_force")] %>% setnames("persons", "adults") %>% .[!grepl("Labour_force_status_Total", variable)] %>% .[variable %pin% "^Labour_force_status", LabourForceStatus := zmatch(variable, c("Employed (full-time)", "Employed (part-time)", "Employed (away from work)", "Unemployed (looking for work)"), extract1 = "^Labour_force_status_(.*)_20.*$")] %>% .[variable %pin% "^Not_in_the_labour_force", LabourForceStatus := "Not in the labour force"] %>% # Need to account for 'not applicable' to ensure totals # sum to adult population .[adult_population, on = c(Region_key, "Year", "Sex")] %>% .[, residual_adults := tot_adults - sum(adults), keyby = c(Region_key, "Year", "Sex")] %>% .[, .SD, .SDcols = c(Region_key, "varI", "Year", "LabourForceStatus", "Sex", "adults", "residual_adults")] %>% melt.data.table(measure.vars = grep("adults", names(.), value = TRUE), value.name = "adults") %>% .[variable == "residual_adults", LabourForceStatus := NA_character_] %>% unique(by = c(Region_key, "Year", "Sex", "LabourForceStatus")) %>% Mop("adults")
fread29 %>% .[endsWith(variable, "Persons")] %>% .[startsWith(variable, "Percent")] %>% setnames("persons", "value") %>% .[, variable := sub("^Percent_(.*)_20...*$", "\\1", variable)] %>% .[, variable := gsub("_([A-Za-z])", "\\U\\1\\E", variable)] %>% .[, variable := sub("^([A-Z])", "\\L\\1\\E", variable)] %>% dcast.data.table(formula = as.formula(paste0(Region_key, " + Year", "~", "variable")), value.var = "value") %>% .[] %>% { dot <- . for (nom in setdiff(names(dot), c(Region_key, "Year"))) { out <- dot[, .SD, .SDcols = c(Region_key, "Year", nom)] Mop(out, value.name = nom) # assign(paste0(Region, "__Year_", nom), # value = decodeRegion(dot[, .SD, .SDcols = c(Region_key, "Year", nom)]), # envir = .GlobalEnv) } }
fread29 %>% .[!endsWith(variable, "Persons")] %>% extract_Sex %>% .[startsWith(variable, "Percent")] %>% setnames("persons", "value") %>% .[, variable := sub("^Percent_(.*)_20...*$", "\\1", variable)] %>% .[, variable := gsub("_([A-Za-z])", "\\U\\1\\E", variable)] %>% .[, variable := sub("^([A-Z])", "\\L\\1\\E", variable)] %>% dcast.data.table(formula = as.formula(paste0(Region_key, " + Year + Sex", "~", "variable")), value.var = "value") %>% .[] %>% { dot <- . for (nom in setdiff(names(dot), c(Region_key, "Year", "Sex"))) { out <- dot[, .SD, .SDcols = c(Region_key, "Year", "Sex", nom)] Mop(out, value.name = nom) # assign(paste0(Region, "__Year_Sex_", nom), # value = decodeRegion(dot[, .SD, .SDcols = c(Region_key, "Year", "Sex", nom)]), # envir = .GlobalEnv) } }
fread29 %>% .[endsWith(variable, "Persons")] %>% .[startsWith(variable, "Non_school_qualification")] %>% .[, variable := sub("^Non_school_qualifications?_(.*)_20.._Census_Persons$", "\\1", variable)] %>% .[!endsWith(variable, "Total")] %>% .[, NonSchoolQualification := gsub("_", " ", sub("Certificate_Certificate", "Certificate", sub("_(Degree_)?Level", "", sub("(Certificate_Level_){2}", "Certificate_Level_", variable))))] %>% .[, NonSchoolQualification := factor(NonSchoolQualification, levels = c("Certificate nfd", "Certificate I and II Level", "Certificate III and IV Level", "Advanced Diploma and Diploma", "Bachelor", "Graduate Diploma and Graduate Certificate", "Postgraduate"), ordered = TRUE)] %>% provide_complementary_observation %>% .[] %>% Mop
fread29 %>% .[!endsWith(variable, "Persons")] %>% extract_Sex %>% .[startsWith(variable, "Non_school_qualification")] %>% .[, variable := sub("^Non_school_qualifications?_(.*)_20.._Census_(Males|Females)$", "\\1", variable)] %>% .[!endsWith(variable, "Total")] %>% .[, NonSchoolQualification := gsub("_", " ", sub("Certificate_Certificate", "Certificate", sub("_(Degree_)?Level", "", sub("(Certificate_Level_){2}", "Certificate_Level_", variable))))] %>% .[, NonSchoolQualification := factor(NonSchoolQualification, levels = c("Certificate nfd", "Certificate I and II Level", "Certificate III and IV Level", "Advanced Diploma and Diploma", "Bachelor", "Graduate Diploma and Graduate Certificate", "Postgraduate"), ordered = TRUE)] %>% provide_complementary_observation(vars = c("Year", "Sex")) %>% .[] %>% Mop
fread29 %>% .[endsWith(variable, "Persons")] %>% .[startsWith(variable, "Migration")] %>% .[grepl("1_year_ago", variable)] %>% .[, SameAddress1YearAgo := grepl("Lived_at_same_address_1_year_ago", variable)] %>% .[] %>% provide_complementary_observation %>% Mop
fread29 %>% .[endsWith(variable, "Persons")] %>% .[startsWith(variable, "Migration")] %>% .[grepl("5_years_ago", variable)] %>% .[, SameAddress5YearsAgo := grepl("Lived_at_same_address_5_years_ago", variable)] %>% .[] %>% provide_complementary_observation %>% Mop
fread29 %>% .[!endsWith(variable, "Persons")] %>% extract_Sex %>% .[startsWith(variable, "Migration")] %>% .[grepl("1_year_ago", variable)] %>% .[, SameAddress1YearAgo := grepl("Lived_at_same_address_1_year_ago", variable)] %>% .[] %>% provide_complementary_observation(vars = c("Year", "Sex")) %>% Mop
fread29 %>% .[!endsWith(variable, "Persons")] %>% extract_Sex %>% .[startsWith(variable, "Migration")] %>% .[grepl("5_years_ago", variable)] %>% .[, SameAddress5YearsAgo := grepl("Lived_at_same_address_5_years_ago", variable)] %>% .[] %>% provide_complementary_observation(vars = c("Year", "Sex")) %>% Mop
freadT(30, value.name = "families") %>% .[!endsWith(variable, "Total")] %>% .[!grepl("Census_Total", variable)] %>% .[grepl("Couple|Parent", variable, ignore.case = TRUE), FamilyComposition := zmatch(variable, y = c("Couple", "One parent"))] %>% .[] %>% .[variable %pin% "Family_income_[0-9]", TotalFamilyIncome.min := 52L * as.integer(sub("^.*Family_income_([0-9]+)_.*$", "\\1", variable))] %>% .[variable %pin% "Negative_Nil_income$", TotalFamilyIncome.min := -1L] %>% .[, variable := sub("_Family_income.*$", "", variable)] %>% .[!endsWith(variable, "Total")] %>% .[!grepl("Labour_force(_status)?_not_stated", variable), c("FullTimeWorkers", "PartTimeWorkers", "AwayFromWork", "NotWorking") := 0L] %>% .[variable %pin% "(One_e|E)mployed_full_time", FullTimeWorkers := 1L] %>% .[variable %pin% "[oO]ne(_employed)?_part_time", PartTimeWorkers := 1L] %>% .[variable %pin% "Employed_part_time", PartTimeWorkers := 1L] %>% .[variable %pin% "one_away_from_work", AwayFromWork := 1L] %>% .[variable %pin% "Employed_away_from_work$", AwayFromWork := 1L] %>% .[variable %pin% "one_not_working", NotWorking := 1L] %>% .[variable %pin% "Both_employed_full_time", FullTimeWorkers := 2L] %>% .[variable %pin% "Both_employed_part_time", PartTimeWorkers := 2L] %>% .[variable %pin% "Both_employed_away_from_work", AwayFromWork := 2L] %>% .[variable %pin% "Both_not_working", NotWorking := 2L] %>% .[variable %pin% "Unemployed", NotWorking := 1L] %>% .[] %>% .[grepl("children", variable), HasChildren := !grepl("with_no_children", variable)] %>% .[] %>% Mop("families", suborder = c("CoupleFamily", "HasChild", "TotalFamilyIncome.min", "FullTimeWorkers", "PartTimeWorkers", "AwayFromWork", 'NotWorking', "families") ) %>% .[]
freadT(31, value.name = "adults") %>% .[grepl("20.._Census_Persons", variable)] %>% .[!grepl("Total$", variable)] %>% .[!grepl("20.._Census_Persons_Total", variable)] %>% extract_Age %>% .[, variable := sub("20.._Census_Persons_(.*?)_[1-9].*", "\\1", variable)] %>% .[] %>% .[variable != "Certificate_Level_Total"] %>% # Set both to NA .[variable == "Level_of_education_inadequately_described", variable := "Level_of_education_not_stated"] %>% .[variable == "Level_of_education_not_stated", adults := sum(adults), by = c(Region_key, "Year", "Age")] %>% unique(by = setdiff(names(.), "varI")) %>% .[!grepl("Level_of_education_not_stated", variable), NonSchoolQualification := zmatch(variable, c("Certificate nfd", "Certificate I and II Level", "Certificate III and IV Level", "Advanced Diploma and Diploma", "Bachelor", "Graduate Diploma and Graduate Certificate", "Postgraduate"))] %>% .[, NonSchoolQualification := factor(NonSchoolQualification, levels = c("Certificate nfd", "Certificate I and II Level", "Certificate III and IV Level", "Advanced Diploma and Diploma", "Bachelor", "Graduate Diploma and Graduate Certificate", "Postgraduate"), ordered = TRUE)] %>% .[] %>% Mop("adults")
freadT(31, value.name = "adults") %>% .[grepl("20.._Census_(Males|Females)", variable)] %>% extract_Sex %>% .[!grepl("Total$", variable)] %>% .[!grepl("20.._Census_(Males|Females)_Total", variable)] %>% extract_Age %>% .[, variable := sub("20.._Census_(?:Males|Females)_(.*?)_[1-9].*", "\\1", variable)] %>% .[] %>% .[variable != "Certificate_Level_Total"] %>% # Set both to NA .[variable == "Level_of_education_inadequately_described", variable := "Level_of_education_not_stated"] %>% .[variable == "Level_of_education_not_stated", adults := sum(adults), by = c(Region_key, "Year", "Age", "Sex")] %>% unique(by = setdiff(names(.), "varI")) %>% .[!grepl("Level_of_education_not_stated", variable), NonSchoolQualification := zmatch(variable, c("Certificate nfd", "Certificate I and II Level", "Certificate III and IV Level", "Advanced Diploma and Diploma", "Bachelor", "Graduate Diploma and Graduate Certificate", "Postgraduate"))] %>% .[, NonSchoolQualification := factor(NonSchoolQualification, levels = c("Certificate nfd", "Certificate I and II Level", "Certificate III and IV Level", "Advanced Diploma and Diploma", "Bachelor", "Graduate Diploma and Graduate Certificate", "Postgraduate"), ordered = TRUE)] %>% .[] %>% Mop("adults")
freadT(32, value.name = "adults") %>% .[grepl("20.._Census_(Persons)", variable)] %>% .[!grepl("Total$", variable)] %>% .[!grepl("20.._Census_(Persons)_Total", variable)] %>% extract_Age %>% .[, variable := sub("20.._Census_(?:Persons)_(.*?)_[1-9].*", "\\1", variable)] %>% .[] %>% # Set both to NA .[variable == "Field_of_study_inadequately_described", variable := "Field_of_study_not_stated"] %>% .[variable == "Field_of_study_not_stated", adults := sum(adults), by = c(Region_key, "Year", "Age")] %>% unique(by = setdiff(names(.), "varI")) %>% .[, FOE := gsub("_", " ", variable)] %>% .[] %>% Mop("adults")
freadT(32, value.name = "adults") %>% .[grepl("20.._Census_(Males|Females)", variable)] %>% extract_Sex %>% .[!grepl("Total$", variable)] %>% .[!grepl("20.._Census_(Males|Females)_Total", variable)] %>% extract_Age %>% .[, variable := sub("20.._Census_(?:Males|Females)_(.*?)_[1-9].*", "\\1", variable)] %>% # Set both to NA .[variable == "Level_of_education_inadequately_described", variable := "Level_of_education_not_stated"] %>% .[variable == "Level_of_education_not_stated", adults := sum(adults), by = c(Region_key, "Year", "Age", "Sex")] %>% unique(by = setdiff(names(.), "varI")) %>% .[, FOE := gsub("_", " ", variable)] %>% .[] %>% Mop("adults")
freadT(33, value.name = "adults") %>% .[!grepl("Males|Females", variable)] %>% .[!grepl("Labour_force_status_Total", variable)] %>% .[!grepl("20.._Census_Persons_Total", variable)] %>% .[!endsWith(variable, "Total")] %>% extract_Age %>% setnames("Age", "Age5yr") %>% .[] %>% .[, variable := sub("^.*_([A-Z][^A-Z]+)$", "\\1", variable)] %>% .[] %>% .[variable %enotin% "Total_labour_force"] %>% .[] %>% .[variable %ein% "Worked_full_time", LabourForceStatus := "Employed (full-time)"] %>% .[variable %ein% "Worked_part_time", LabourForceStatus := "Employed (part-time)"] %>% .[variable %ein% "Away_from_work",LabourForceStatus := "Employed (away from work)"] %>% .[variable %ein% "Hours_worked_not_stated",LabourForceStatus := "Employed (hours worked not stated)"] %>% .[variable %ein% "Looking_for_full_time_work",LabourForceStatus := "Unemployed (looking for full-time work)"] %>% .[variable %ein% "Looking_for_part_time_work",LabourForceStatus := "Unemployed (looking for part-time work)"] %>% .[variable %ein% "Not_in_the_labour_force",LabourForceStatus := "Not in the labour force"] %>% .[] %>% Mop("adults")
freadT(33, value.name = "adults") %>% .[grepl("Males|Females", variable)] %>% extract_Sex %>% .[!grepl("Labour_force_status_Total", variable)] %>% .[!grepl("20.._Census_(Males|Females)_Total", variable)] %>% .[!endsWith(variable, "Total")] %>% extract_Age %>% setnames("Age", "Age5yr") %>% .[] %>% .[, variable := sub("^.*_([A-Z][^A-Z]+)$", "\\1", variable)] %>% .[] %>% .[variable %enotin% "Total_labour_force"] %>% .[] %>% .[variable %ein% "Worked_full_time", LabourForceStatus := "Employed (full-time)"] %>% .[variable %ein% "Worked_part_time", LabourForceStatus := "Employed (part-time)"] %>% .[variable %ein% "Away_from_work",LabourForceStatus := "Employed (away from work)"] %>% .[variable %ein% "Hours_worked_not_stated",LabourForceStatus := "Employed (hours worked not stated)"] %>% .[variable %ein% "Looking_for_full_time_work",LabourForceStatus := "Unemployed (looking for full-time work)"] %>% .[variable %ein% "Looking_for_part_time_work",LabourForceStatus := "Unemployed (looking for part-time work)"] %>% .[variable %ein% "Not_in_the_labour_force",LabourForceStatus := "Not in the labour force"] %>% .[] %>% Mop("adults")
freadT(34, value.name = "workers") %>% .[!grepl("Males|Females", variable)] %>% .[, variable := sub("_20.._Census_Persons$", "", variable)] %>% .[variable != "Total"] %>% .[variable != "Inadequately_described_Not_stated", IndustryOfEmployment := gsub("_", " ", variable)] %>% Mop("workers")
freadT(34, value.name = "workers") %>% .[grepl("Males|Females", variable)] %>% extract_Sex %>% .[, variable := sub("_20.._Census_(Males|Females)$", "", variable)] %>% .[variable != "Total"] %>% .[variable != "Inadequately_described_Not_stated", IndustryOfEmployment := gsub("_", " ", variable)] %>% Mop("workers")
freadT(35, value.name = "workers") %>% .[!grepl("Males|Females", variable)] %>% .[, variable := sub("_20...*", "", variable)] %>% .[variable != "Total"] %>% .[variable != "Inadequately_described_Not_stated", Occupation := gsub("_", " ", variable)] %>% .[] %>% Mop("workers")
freadT(35, value.name = "workers") %>% .[grepl("Males|Females", variable)] %>% extract_Sex %>% .[, variable := sub("_20...*", "", variable)] %>% .[variable != "Total"] %>% .[variable != "Inadequately_described_Not_stated", Occupation := gsub("_", " ", variable)] %>% .[] %>% Mop("workers")
document <- function(the_table) { stopifnot(dir.exists("R")) file.R <- file.path("R", paste0(the_table, ".R")) if (file.exists(file.R)) { file.remove(file.R) } DT <- get(the_table) value_name <- names(DT)[ncol(DT)] cat(paste0("#' @title ", paste0(names(DT)[-c(1:2, ncol(DT))], collapse = ", "), " by ", Region, ", Year"), "\n", file = file.R) cat(paste0("#' @description ", if (value_name == tolower(value_name) && value_name != "unemployment") { paste0("Number of ", value_name) } else { # 'medianIncomeHouse' --> 'Median income house' sub("^(.)", "\\U\\1\\E", paste0(unlist(strsplit(value_name, split = "(?<=[a-z](?=(?:[A-Z])))", perl = TRUE)))) }, paste0(names(DT)[-c(1:2, ncol(DT))], collapse = ", "), " by ", Region, ", Year"), "\n", file = file.R, append = TRUE) cat(paste0("#' @format ", prettyNum(nrow(DT), big.mark = ","), " observations and ", ncol(DT), " variables."), "\n", "\n", file = file.R, append = TRUE) cat(paste0('"', the_table, '"'), file = file.R, append = TRUE) cat("\n", file = file.R, append = TRUE) }
# Print side-effect ok stopifnot(dir.exists("data")) ced_tbls <- tables(silent = TRUE) %>% use_series("NAME") %>% grep(pattern = paste0("^", Region), x = ., value = TRUE) suppressMessages({library(Census2016.spec)}) test_check(data_list = mget(ced_tbls), show.progress = FALSE) data_path <- "data" provide.dir("tsv") provide.dir(file.path("tsv", Region)) tsv_path <- file.path("tsv", Region) try({ prior_data_size <- lapply(list.files(path = data_path, full.names = TRUE), file.info) %>% rbindlist %$% sum(size) %>% divide_by(1024^2) %>% round(2) prior_region_data_size <- lapply(list.files(path = data_path, pattern = Region, full.names = TRUE), file.info) %>% rbindlist %$% sum(size) %>% divide_by(1024^2) %>% round(2) }, silent = TRUE) region_dtas <- c(list.files(path = "data/", pattern = Region, full.names = TRUE), list.files(path = file.path("data-raw", "data", Region), pattern = paste0(Region, ".*csv$"), full.names = TRUE)) vapply(region_dtas, file.remove, logical(1)) %>% all %>% stopifnot library(foreach) library(doParallel) cl <- makeCluster(8) registerDoParallel(cl, cores = 8) foreach(tbl = ced_tbls, .inorder = FALSE, .export = c(ced_tbls, data_path)) %dopar% { save(list = tbl, file = file.path(data_path, paste0(tbl, ".rda")), compress = "gzip", compression_level = 9) } stopImplicitCluster() stopCluster(cl) fwrite <- function(...) data.table::fwrite(..., sep = "\t") for (tbl in ced_tbls) { file.tsv <- file.path(tsv_path, paste0(tbl, ".tsv")) .ced_tbl <- get(tbl) fwrite(.ced_tbl, file = file.tsv) switch (ceiling(file.size(file.tsv) / (95 * 1024^2)), { NULL }, { DT <- .ced_tbl lastv <- .subset2(DT, last(names(DT))) if (is.integer(lastv) && min(lastv) == 0L) { DT <- DT[lastv > 0L] } NN <- nrow(DT) DT1 <- DT[seq_len(NN %/% 2)] DT2 <- DT[-seq_len(NN %/% 2)] file.remove(file.tsv) fwrite(DT1, sub("\\.tsv$", "1.tsv", file.tsv)) fwrite(DT2, sub("\\.tsv$", "2.tsv", file.tsv)) }, { DT <- .ced_tbl lastv <- .subset2(DT, last(names(DT))) if (is.integer(lastv) && min(lastv) == 0L) { DT <- DT[lastv > 0L] } NN <- nrow(DT) DT1 <- DT[seq_len(NN %/% 2)] DT2 <- DT[-seq_len(NN %/% 2)] DT11 <- DT1[seq_len(nrow(DT1) %/% 2)] DT12 <- DT1[-seq_len(nrow(DT1) %/% 2)] DT21 <- DT2[seq_len(nrow(DT2) %/% 2)] DT22 <- DT2[-seq_len(nrow(DT2) %/% 2)] stopifnot(nrow(DT11) + nrow(DT12) + nrow(DT21) + nrow(DT22) == nrow(DT)) file.remove(file.tsv) fwrite(DT11, sub("\\.tsv$", "11.tsv", file.tsv)) fwrite(DT12, sub("\\.tsv$", "12.tsv", file.tsv)) fwrite(DT21, sub("\\.tsv$", "21.tsv", file.tsv)) fwrite(DT22, sub("\\.tsv$", "22.tsv", file.tsv)) }) } region_dtas <- list.files(path = "data/", pattern = Region, full.names = TRUE) # tools::resaveRdaFiles(paths = region_dtas) current_region_data_size <- lapply(list.files(path = data_path, pattern = Region, full.names = TRUE), file.info) %>% rbindlist %$% sum(size) %>% divide_by(1024^2) %>% round(2) current_data_size <- lapply(list.files(path = data_path, full.names = TRUE), file.info) %>% rbindlist %$% sum(size) %>% divide_by(1024^2) %>% round(2) cat("\n") cat("Region:\t", prior_region_data_size, "MB ===> ", current_region_data_size, "MB\n") cat("Total: \t", prior_data_size, "MB ===> ", current_data_size, "MB\t")
# Print side-effect ok ced_tbls <- tables(silent = TRUE) %>% use_series("NAME") %>% grep(pattern = paste0("^", Region), x = ., value = TRUE) if (dir.exists("data")) { lapply(list.files(path = "data/", full.names = TRUE), file.info) %>% rbindlist %$% sum(size) %>% divide_by(1024^2) %>% cat region_dtas <- list.files(path = "data/", pattern = Region, full.names = TRUE) vapply(region_dtas, file.remove, logical(1)) %>% all %>% stopifnot } else { provide.dir("data") } for (tbl in ced_tbls) { document(tbl) save(list = tbl, file = file.path("data", paste0(tbl, ".rda"))) } region_dtas <- list.files(path = "data/", pattern = Region, full.names = TRUE) size_of_data <- lapply(list.files(path = "data/", pattern = Region, full.names = TRUE), file.info) %>% rbindlist %$% sum(size) %>% divide_by(1024^2) cat("Size before resave:\t", round(size_of_data, 1), "MB") # Size not large enough to justify resaving. # # tools::resaveRdaFiles(paths = region_dtas) # size_of_data <- # lapply(list.files(path = "data/", # pattern = Region, # full.names = TRUE), # file.info) %>% # rbindlist %$% # sum(size) %>% # divide_by(1024^2) # # cat("Size after resave:\t", round(size_of_data, 1), "MB")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.