In HughParsonage/Census2016.DataPack.TimeSeries: Provides ABS Time-Series Datapacks

# Region <- "STE"

START <- Sys.time()
knitr::opts_knit$set(root.dir = "..")
knitr::opts_chunk$set(echo = TRUE, 
                      print_chunk = TRUE)

suppressMessages({
library(readxl)
library(testthat)
library(magrittr)
library(hutils)
library(data.table)
library(ASGS)
library(Census2016.DataPack)
})

# Ensure we are working with pristine tables
if (!isTRUE(getOption("knitr.in.progress"))) {
  AllTables <- Filter(function(x) is.data.table(get(x)), ls())
  if (length(AllTables) != 0) {
    if (exists("table_killer")) {
      menu_selection <- 1
    } else {
      menu_selection <- menu(c("Remove all tables.",
                               "Cancel."),
                             title = "Data tables detected. This script will save all tables in the workspace to the package. Remove current tables from workspace?")
    }
  }

  if (exists("menu_selection")) {
    if (menu_selection %in% c(0, 2)) {
      stop("Operation cancelled.")
    } else {
      rm(list = AllTables)
    }
  }
}

if (!isNamespaceLoaded("Census2016.DataPack")) {
  load("../data/SA16_decoder.rda")
}

Metadata <-
  read_excel("./data-raw/Metadata/Metadata_2016_TSP_DataPack.xlsx",
             sheet = "Cell descriptors information", 
             skip = 12) %>%
  as.data.table %>%
  .[, varI := as.integer(gsub("^T", "", Sequential))] %>%
  setnames("DataPack file", "DataPackFile") %>%
  .[, TableNo := as.integer(gsub("[^0-9]", "", DataPackFile))] %>%
  .[]

.short2long <- function(short) {
  coalesce(Metadata[match(x = short, Short)][["Long"]], short)
}

expect_equal(.short2long(c("squierl", "Tot_persons_C11_F", "Tot_C16_P")),
             c("squierl", 
               "Total_persons_2011_Census_Females",
               "Total_2016_Census_Persons"))

short2long <- function(.data) {
  noms <- names(.data)
  setnames(.data, noms, .short2long(noms))
}

Region_key <- 
  fread(paste0('./data-raw/data/', Region, '/AUST/2016Census_T01_AUS_', Region, '.csv')) %>%
  names %>%
  extract2(1)

grep <- function(..., perl, fixed) base::grep(..., perl = missing(fixed), fixed = !missing(fixed))
grepl <- function(..., perl, fixed) base::grepl(..., perl = missing(fixed), fixed = !missing(fixed))
gsub <- function(..., perl, fixed) base::gsub(..., perl = missing(fixed), fixed = !missing(fixed))
sub <- function(..., perl, fixed) base::sub(..., perl = missing(fixed), fixed = !missing(fixed))

# Possible future issue with R 3.5
stopifnot(sub("a_([a-z])", "\\U\\1\\E", "a_b") == "B",
          sub("a_([A-Z])", "\\L\\1\\E", "a_B") == "b")

#' @return Wide as-is data table of all tables.
freadG <- function(g, ...) {
  stopifnot(length(g) == 1)
  g0 <- paste0(if (g < 10) "0", g)
  file_list <-
    list.files(path = file.path('./data-raw/data/', Region, 'AUST'),
               # Files may be of the form ..G08_AUS.. or ..G09B_AUS..
               pattern = paste0('2016Census_T', g0, '[A-Z]?_AUS_', Region, '\\.csv$'),
               full.names = TRUE)
  if (length(file_list) > 1) {
    file_list %>%
    lapply(fread, key = Region_key, ...) %>%
    Reduce(f = function(X, Y) X[Y])
  } else {
    fread(file = file_list[1], ...)
  } %>% 
    short2long
}

freadT <- function(g, vi = NULL, value.name = "persons") {
  gmolten <- 
    freadG(g, logical01=FALSE) %>%
    melt.data.table(id.vars = Region_key,
                    variable.factor = FALSE,
                    value.name = value.name)

  # Lots of duplicate table names across g's
  Metadata <- Metadata[TableNo == g]

  if (all(.subset2(gmolten, "variable") %chin% .subset2(Metadata, "Long"))) {
    og <- gmolten[Metadata, on = "variable==Long", nomatch=0L]
  } else if (all(.subset2(gmolten, "variable") %chin% .subset2(Metadata, "Short"))) {
    # Want the Long form to be 'variable'
    og <- gmolten[Metadata, on = "variable==Short", nomatch=0L]
    og <- drop_col(og, "variable")
    setnames(og, "Long", "variable")
  } else {
    stop("Unable to decode table names.")
  }

  if (is.null(vi)) {
    vi <- range(og[["varI"]])
  }

  og %>%
    drop_cols(setdiff(names(Metadata), "varI")) %>%
    .[varI %between% vi] %>%
    extract_Year %>%
    .[]
}

do_dots <- function(...) {
  eval(substitute(alist(...)))
}

splitW <- function(DT, ...,
                   Vs,
                   # elict the rows of DT
                   # from length(Vs)
                   elicit = TRUE) {
  lower_caps <- function(v) {
    # Not_stated can be at the start of a code,
    # e.g. Table 15
    v <- gsub("(?<!(^))(?:Not_stated)", "na", v)
    v <- gsub("(?:None)", "0", v)
    v <- gsub("Information_and_Communication_Technology", 
              "Information_and_communication_technology",
              v)
    v <- gsub("_ICT_", "_ict_", v)
    v
  }

  reraise_caps <- function(v) {
    v <- gsub("Information_and_communication_technology",
              "Information_and_Communication_Technology", 
              v)
    v <- gsub("_ict_", "_ICT_", v)

    v <- gsub("_", " ", v)
    v
  }

  if (missing(Vs)) {
    dotsL <- length(match.call(expand.dots = FALSE)$...)
    Vs <- vapply(seq_len(dotsL),
                 function(x) deparse(do_dots(...)[[x]]),
                 character(1))
    v <- .subset2(DT, "variable")
    # Assume a new level occurs with each capital letter, 
    # except the following
    v <- lower_caps(v)
    level <- nchar(gsub("[^A-Z]", "", v))
    n_levels <- uniqueN(level)
  }

  if (elicit) {
    out <- DT[which(level == length(Vs))]
  } else {
    out <- DT
    if (n_levels > 1L) {
      to_print <- copy(out)
      to_print[, level := level] %>%
        unique(by = "level") %>%
        print
      stop("Cannot print: levels appear to be different.")
    }
    if (length(Vs) != n_levels) {
      stop("length(Vs) = ", length(Vs), " != n_levels = ", n_levels, ".")
    }
  }


  out[, variable := lower_caps(variable)]
  out[,
      (Vs) := tstrsplit(.SD[[1]], split = "_(?=[A-Z])", perl = TRUE),
      .SDcols = "variable"
      ][,        
        (Vs) := lapply(.SD, reraise_caps),
        .SDcols = Vs
        ]

  # for (vv in names(out)) {
  #   if ("Total" %fin% out[[vv]]) {
  #     out <- out[out[[vv]] != "Total"]
  #   }
  # }
  out

}

.extract_Year <- function(v) {
  if (all(grepl("^.+_(2006|2011|2016)_Census.*$", v))) {
    out <- as.integer(sub("^.*_(2[0-9]{3})_Census.*$", "\\1", v))
  } else if (all(grepl("^(2006|2011|2016)_Census", v))) {
    out <- as.integer(substr(v, 0, 4))
  } else {
    first_bad <- 
      v[nor(grepl("^(2006|2011|2016)_Census", v),
            grepl("^.+_(2006|2011|2016)_Census.*$", v))] %>%
      head(1)
    print(first_bad)
    stop("Unable to detect census year", call. = FALSE)
  }

}

extract_Year <- function(.data) {
  stopifnot("variable" %in% names(.data))
  .data[, Year := .extract_Year(variable)]
}

expect_equal(.extract_Year("Total_persons_2006_Census_Males"), 2006L)
expect_equal(.extract_Year("2006_Census_15_19_years_Married_Females"), 2006L)

.extract_Sex <- function(v) {
  req_regex <- ".*(Males|Females)"
  substr(sub(req_regex, "\\1", v), 0, 1)
}

expect_equal(.extract_Sex("Total_persons_2006_Census_Males"), "M")
extract_Sex <- function(.data) {
  req_regex <- ".*(Males|Females)"
  stopifnot("variable" %in% names(.data),
            all(grepl(req_regex, .data[["variable"]])))
  .data[, Sex := .extract_Sex(variable)]
}

`%npin%` <- function(x, y) {
  !grepl(y, x)
}

.extract_Age <- function(v, orderedFactor = TRUE) {
  out <-
    if_else(grepl("^.*?_([0-9]{2})_([0-9]{2})_years$", v),
            sub("^.*?_([0-9]{2})_([0-9]{2})_years", "\\1-\\2", v),
            if_else(grepl("^.*Census_[0-9]{1,2}_[0-9]{1,2}_years.*", v),
                    sub("^.*Census_([0-9]{1,2})_([0-9]{1,2})_years.*$", "\\1-\\2", v),
                    if_else(grepl("years_and_over", v),
                            sub("^.*[^0-9]([0-9]+)_years_and_over.*$", "\\1+", v),
                            if_else(grepl("^.*Age_(?:group(?:_of_parent)?|years)_([0-9]{1,2})_([0-9]{1,2})_years.*", v),
                                    sub("^.*Age_(?:group(?:_of_parent)?|years)_([0-9]{1,2})_([0-9]{1,2})_years.*",
                                        "\\1-\\2",
                                        v),
                                    if_else(grepl("_[0-9]{1,2}_[0-9]{1,2}_years$", v),
                                            sub("^.*([0-9]{1,2})_([0-9]{1,2})_years$", "\\1-\\2", v),
                                            v)))))
  i0 <- grepl("_([0-9]{2})_([0-9]{2})_years_[A-Z]", v)
  out[i0] <- sub("^.*_([0-9]{2})_([0-9]{2})_years_[A-Z].*$", "\\1-\\2", v[i0])


  if (any(grepl("_", out))) {
    stop("\n\t", paste0(head(unique(grep("_", out, value = TRUE))), collapse = "\n\t"), " present.")
  }

  if (orderedFactor) {
    out <- factor(out, levels = unique(out), ordered = TRUE)
  }
  out
}
expect_equal(.extract_Age("Age_group_85_years_and_over_2006_Census_Males", FALSE), "85+")
expect_equal(.extract_Age("2016_Census_80_84_years_Never_married_Females", FALSE), "80-84")
expect_equal(.extract_Age("2006_Census_Buddhism_0_14_years", FALSE), "0-14")
expect_equal(.extract_Age("Age_group_0_4_years_2006_Census_Persons", FALSE), "0-4")
expect_equal(.extract_Age("2006_Census_Age_group_of_parent_15_19_years_Number_of_children_ever_born_No_children", FALSE), "15-19")
expect_equal(.extract_Age("2006_Census_Persons_Postgraduate_Degree_Level_15_19_years", FALSE), "15-19")
expect_equal(.extract_Age("2006_Census_Persons_15_19_years_Employed_Hours_worked_not_stated", FALSE), "15-19")

local({
  input <- c("Age_group_0_4_years_2006_Census_Persons",
             "Age_group_5_14_years_2016_Census_Persons",
             "Age_group_35_44_years_2011_Census_Persons",
             "Age_group_55_64_years_2011_Census_Persons",
             "Age_group_85_years_and_over_2006_Census_Persons")
  output <- .extract_Age(input) 

  # i.e. ascending order, not 
  expect_false(is.unsorted(output))
})

extract_Age <- function(.data) {
  .data[, Age := .extract_Age(variable)]
}

extract_CountryOfBirth <- function(DT) {
  DT %>%
    # Set not stated -> NA
    .[variable %npin% "Country_of_birth_not_stated",
      CountryOfBirth := sub("^(.*)_(2006|2011|2016)_Census_(?:Persons|Males|Females)$",
                            "\\1",
                            variable)] %>%
    decode_country %>%
    .[]
}

decodeRegion <- function(DT) {
  out <- DT
  decoder <- 
    switch(Region,
           "SA2" = SA16_decoder[ASGS_Structure == Region,
                                .(SA2_MAIN16 = Census_Code_2016,
                                  SA2_NAME16 = Census_Name_2016)],
           "SA3" = SA16_decoder[ASGS_Structure == Region,
                                .(SA3_MAIN16 = Census_Code_2016,
                                  SA3_NAME16 = Census_Name_2016)],
           "SA4" = SA16_decoder[ASGS_Structure == Region,
                                .(SA4_MAIN16 = Census_Code_2016,
                                  SA4_NAME16 = Census_Name_2016)],
           "STE" = SA16_decoder[ASGS_Structure == Region,
                                .(STE_MAIN16 = Census_Code_2016,
                                  STE_NAME16 = Census_Name_2016)],
           "LGA" = {
             NonABS_decoder[ASGS_Structure == Region,
                            .(Census_Code_2016, LGA_NAME16 = Census_Name_2016)]
           },
           "CED" = {
             NonABS_decoder[ASGS_Structure == Region,
                            .(Census_Code_2016, CED_NAME16 = Census_Name_2016)]
           },
           "SED" = {
             NonABS_decoder[ASGS_Structure == Region,
                            .(Census_Code_2016, SED_NAME16 = Census_Name_2016)]
           },
           "SSC" = {
             NonABS_decoder[ASGS_Structure == Region,
                            .(Census_Code_2016, SSC_NAME16 = Census_Name_2016)]
           },
           NULL) %>%
    unique %>%
    setnames(1, Region_key)

  decoded_out <- decoder[out, on = Region_key]
  decoded_out[, (Region_key) := NULL]
  decoded_out
}

.decode_country <- function(x, to = "alpha-3") {
  xna <- is.na(x)
  x[!xna] <- trimws(x[!xna])
  stopifnot(to == "alpha-3",
            names(ISO3166)[1] == "name")
  nom <- ISO3166[["name"]]
  a3 <- ISO3166[["alpha_3"]]
  keep <- and(nchar(x, keepNA = FALSE) == 3L,
              x == toupper(x))

  res <- a3[pmatch(x, nom, duplicates.ok = TRUE)]
  res[keep] <- x[keep]
  if (anyNA(res[!is.na(x)])) {
    print(x[is.na(res[!is.na(x)])])
    stop("Country code could not be decoded.")
  }
  res[xna] <- NA_character_
  res
}

decode_country <- function(out) {
  stopifnot(is.data.table(out),
            "CountryOfBirth" %in% names(out))
  out[, CountryOfBirth := trimws(gsub("_", " ", CountryOfBirth))]
  out[, CountryOfBirth := gsub("(?<!(F))Ye?a?r.*arrival.ns$", "", CountryOfBirth,
                               ignore.case = TRUE)]
  out[CountryOfBirth %pin% "(Born )?[Ee]lsewhere", CountryOfBirth := NA_character_]
  out[CountryOfBirth == "Country of birth not stated", CountryOfBirth := NA_character_]
  out[CountryOfBirth %pin% "Bosnia", CountryOfBirth := "Bosnia and Herzegovina"]
  out[CountryOfBirth %pin% "Korea", CountryOfBirth := "KOR"]
  out[CountryOfBirth %pin% "Macedonia", CountryOfBirth := "Macedonia"]
  out[CountryOfBirth %pin% c("United Kingdom",
                             "N[a-z]+n Ireland",
                             "England",
                             "Scotland",
                             "Wales"),
      CountryOfBirth := "United Kingdom"]
  out[CountryOfBirth %pin% "SE Europe nfd", CountryOfBirth := NA_character_]
  out[CountryOfBirth %pin% "China excl? SARs (and )?Tai?w",  CountryOfBirth := "CHN"]
  out[CountryOfBirth %pin% "Ho?ng Ko?ng",             CountryOfBirth := "Hong Kong"]
  out[CountryOfBirth == "Vietnam", CountryOfBirth := "VNM"]
  # Negative lookbehind due to FYROM (Macedonia)


  out[CountryOfBirth %pin% "FYROM",                   CountryOfBirth := "MKD"]
  out[CountryOfBirth %pin% "China excl? SARs Tai?w",  CountryOfBirth := "CHN"]
  out[CountryOfBirth %pin% c("USA", "United States"), CountryOfBirth := "USA"]
  out[CountryOfBirth %pin% "Ho?ng Ko?ng",             CountryOfBirth := "Hong Kong"]
  out[, CountryOfBirth := .decode_country(CountryOfBirth)]
  out[]
}

Mop <- function(DT, value.name = "persons", suborder = NULL) {
  # Check if any variables have already been used.
  # If so, stop; else, add to varIused
  varIBeingUsed <- .subset2(DT, "varI")
  stopifnot(is.null(varIused) || !any(varIBeingUsed %in% varIused))
  drop_col(DT, "varI")

  stopifnot(value.name %in% names(DT),
            "Year" %in% names(DT),
            # Should be Completed
            'MaxSchooling' %notin% names(DT))

  # Check for no duplicates, except for value.name
  if (anyDuplicated(DT, by = setdiff(names(DT), c(value.name, "variable")))) {
    print(duplicated_rows(DT, by = setdiff(names(DT), c(value.name, "variable"))))
    stop("Duplicated rows.")
  }

  # Are any columns constant?
  uv <- vapply(DT, uniqueN, integer(1))
  if (any(uv == 1)) {
    print(names(uv)[uv == 1])
    stop("Constant columns in DT.")
  }

  out <- 
    DT %>%
    drop_col("variable") %>%
    setcolorder(sort(names(.))) %>%
    set_cols_first(c(Region_key, "Year")) %>%
    setorderv(setdiff(names(.), value.name)) %>%
    set_cols_last(value.name) %>%
    .[]

  if ("CountryOfBirth" %chin% names(out)) {
    if (!all(coalesce(nchar(out[["CountryOfBirth"]]), 3L) == 3L)) {
      stop("CountryOfBirth must be ISO-3166-2")
    }
  }

  if ("Dwelling" %chin% names(out)) {
    out[, PrivateDwelling := Dwelling == "Private"]
    out[, Dwelling := NULL]
  }

  if ("HasChildren" %chin% names(out)) {
    setnames(out, "HasChildren", "HasChild")
  }
  if ("HasChildrenUnder15" %chin% names(out)) {
    setnames(out, "HasChildrenUnder15", "HasChildUnder15")
  }
  if ("YearOfArrival" %chin% names(out)) {
    setnames(out, "YearOfArrival", "YearOfArrival.max")
  }

  if ("SchoolSector" %chin% names(out)) {
    out[SchoolSector == "Non_Govt",
        SchoolSector := "Non-government"]
  }

  if ("Ancestry" %chin% names(out)) {
    out[Ancestry == "Macdonian", Ancestry := "Macedonian"]
    if ("FatherBornAus" %chin% names(out) &&
        "MotherBornAus" %chin% names(out)) {
      out[Ancestry == "Australian Birthplace not stated", FatherBornAus := NA]
      out[Ancestry == "Australian Birthplace not stated", MotherBornAus := NA]
      out[Ancestry == "Australian Birthplace not stated", Ancestry := "Australian"]
      out[Ancestry == "Australian Both parents born Australian", MotherBornAus := TRUE]
      out[Ancestry == "Australian Both parents born Australian", MotherBornAus := TRUE]
      out[Ancestry == "Australian Both parents born Australian", Ancestry := "Australian"]
    }
  }

  if ("MaxSchoolingCompleted" %chin% names(out)) {
    out[,
        "MaxSchoolingCompleted" := factor(MaxSchoolingCompleted,
                                          levels = c("Did not go to school", 
                                                     "Year 8 or below",
                                                     "Year 8",
                                                     "Year 9",
                                                     "Year 10",
                                                     "Year 11",
                                                     "Year 12"),
                                          ordered = TRUE)]
  }

  if ("FamilyComposition" %chin% names(out)) {
    if (!OR("CoupleFamily" %in% names(out),
            uniqueN(out[["FamilyComposition"]]) > 3L)) {
      out[!grepl("Other", FamilyComposition),
          CoupleFamily := FamilyComposition %pin% "Couple"]
      # switch to the position of FamilyComposition
      set_colsuborder(out, c("CoupleFamily", "FamilyComposition"))
      out[, FamilyComposition := NULL]
    }
  }

  if ("IncomeTotPersonal.min" %chin% names(out)) {
    out[, IncomeTotPersonal.min := NULL]
    return(NULL)
  }

  if ("DwellingSubtype" %chin% names(out)) {
    out[DwellingSubtype == "4 or more storey block",
        DwellingSubtype := "4 storey block or more"]
  }

  if ("Aboriginal" %chin% names(out) &&
      "TorresStraitIslander" %chin% names(out)) {
    c("Aboriginal", "Both Aboriginal & Torres Strait Islander", "Non-indigenous", 
"Torres Strait Islander")
    out[nor(Aboriginal, TorresStraitIslander), IndigenousStatus := "Non-indigenous"]
    out[`&`(Aboriginal, TorresStraitIslander), IndigenousStatus := "Both Aboriginal & Torres Strait Islander"]
    out[`&`(!Aboriginal, TorresStraitIslander), IndigenousStatus := "Torres Strait Islander"]
    out[`&`(Aboriginal, !TorresStraitIslander), IndigenousStatus := "Aboriginal"]
    out[, c("Aboriginal", "TorresStraitIslander") := NULL]
  }

  if ("Age.int" %chin% names(out)) {
    setnames(out, "Age.int", "Age.min")
  }

  for (j in names(DT)) {
   out[out[[j]] == "Other", (j) := "(Other)"]
  }

  apparent_population <- 
      out[, list(ApparentPopulation = sum(eval(parse(text = value.name)))), keyby = "Year"]

  permitted_error <- switch(Region,
                            "SA1" = 1.5e6,
                            0.5e6)

  expected_population <-
    fread("data-raw/data/AUST/2016Census_T01_AUS.csv",
          select = c("Tot_persons_C06_P", "Tot_persons_C11_P", "Tot_persons_C16_P")) %>%
    melt(measure.vars = names(.), value.name = "ExpectedPopulation") %>%
    .[, "Year" := 2000L + as.integer(sub("^Tot_persons_C(06|11|16)_P$", "\\1", variable))] %>%
    setkeyv("Year") %>%
    .[]

  pop_diff_by_Year <- 
    apparent_population[expected_population, on = "Year"] %>%
    .[, difference := ApparentPopulation - ExpectedPopulation]

  differences <- .subset2(pop_diff_by_Year, "difference")

  switch(
    value.name,
    "persons" = {
      if (AND(!any(c("SpeaksEnglishOnly", 
                     "BornAust",
                     "MaritalStatus",
                     "MaxSchoolingCompleted",
                     "OnlyEnglishSpokenHome") %in% names(DT)),
              any(abs(differences) > permitted_error))) {

        i_first_bad <- which(abs(differences) > permitted_error)[1]

        first_bad <-
          pop_diff_by_Year[i_first_bad] %>%
          .subset2("difference")

        Yr_first_bad <-
          switch(i_first_bad,
                 2006,
                 2011,
                 2016)

        text_apparent_population <-
          apparent_population[i_first_bad] %>%
          .subset2("ApparentPopulation") %>%
          prettyNum(big.mark = ",")


        if (first_bad < 0) {
          stop("Population undercounted in ", Yr_first_bad, ":\n\t",
               text_apparent_population, "\n\t",
               " (e = ", prettyNum(first_bad, big.mark = ","), ")",
               "\nShould you have used a different value.name?")
        } else {
          stop("Apparent population too high: ", text_apparent_population,
               " (e = ", prettyNum(first_bad, big.mark = ","), ")")
        }
      }
    },

    "adults" = {
      adult_population <-
        lapply(dir(path = "data-raw/data/AUST/",
                   pattern = "T29",
                   full.names = TRUE),
               fread) %>%
        rbindlist(use.names = TRUE, fill = TRUE) %>%
        short2long %>%
        .[, .SD, .SDcols = grep("^Persons_aged_15_years.*Persons$", names(.))] %>%
        melt.data.table(measure.vars = names(.),
                        value.name = "adults") %>%
        .[, Year := as.integer(sub("^.*(2006|2011|2016).*$", "\\1", variable))]

      comparison <- 
        adult_population[apparent_population, on = "Year"] %>%
        .[, err := abs(ApparentPopulation - adults) / adults]

      if (any(comparison[["err"]] > 0.0004 * nrow(out))) {
        print(comparison[, .(apparent = prettyNum(ApparentPopulation, big.mark = ","),
                             expected = prettyNum(adults, big.mark = ","))])
        stop("Adults under or overcounted.")
      }




    },


    "occupied_private_dwellings" = {
      # freadG(15) %$% sum(C16_T_T + C06_T_T + C11_T_T)
      exp_tot_dwellings <- 23190507

      tot_dwellings <- sum(out[["occupied_private_dwellings"]])

      err <- abs(tot_dwellings - exp_tot_dwellings) / exp_tot_dwellings
      if (err > 0.0004 * nrow(out)) {
        stop("Dwellings under or overcounted:\n",
             "\t",
             "Expected:\t", prettyNum(exp_tot_dwellings, big.mark = ","),
             "\n\t",
             "Result:  \t",  prettyNum(tot_dwellings, big.mark = ","),
             "\n")
      }
    },
    "occupied_private_dwellings_being_rented" = {
      exp_tot_dwellings_2016 <- 2606630
      tot_dwellings_2016 <-
        sum(out[Year == 2016L][["occupied_private_dwellings_being_rented"]])

      err <- abs(tot_dwellings_2016 - exp_tot_dwellings_2016) / exp_tot_dwellings_2016
      if (err > 0.0004 * nrow(out)) {
        stop("Dwellings under or overcounted:\n",
             "\t",
             "Expected:\t", prettyNum(exp_tot_dwellings, big.mark = ","),
             "\n\t",
             "Result:  \t",  prettyNum(tot_dwellings, big.mark = ","),
             "\n")
      }

    })

  # Check for no duplicates, except for value.name
  if (anyDuplicated(out, by = setdiff(names(out), value.name))) {
    print(duplicated_rows(out, by = setdiff(names(out), value.name)))
    stop("2. Duplicated rows.")
  }
  if (!is.null(suborder)) {
    set_colsuborder(out, suborder)
  }

  set_cols_last(out, value.name)
  setorderv(out, setdiff(names(out), value.name))

  out_noms <- names(out)

  object_name <- 
    paste0(Region,
           "__",
           paste0(setdiff(out_noms, c(Region_key, value.name)),
                  collapse = "_"))

  if (object_name %in% ls(envir = .GlobalEnv)) {
    object_name <- 
      paste0(Region,
             "__",
             paste0(setdiff(out_noms, c(Region_key, "persons")),
                    collapse = "_"))

    if (object_name %in% ls(envir = .GlobalEnv)) {
      stop("`", object_name, "` already defined.")
    }
  }

  decoded_out <- decodeRegion(out)
  if (!anyDuplicated(decoded_out, by = setdiff(names(decoded_out), value.name))) {
    out <- decoded_out 
  } else {
    cat("\n\n\n\n\n")
    print(duplicated_rows(decoded_out, by = setdiff(names(decoded_out), value.name)))
    cat("\n\n\n\n\n")
  }

  assign(object_name, out, envir = .GlobalEnv)
  # Provided we have reached here, we can now modify varIused
  varIused <<- sort(c(varIused, unique(varIBeingUsed)))
  decoded_out[]
}

iso3166_url <-
  "https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv"

ISO3166 <-
  if (file.exists("data-raw/ISO-3166-countries.csv")) {
    fread("data-raw/ISO-3166-countries.csv")
  } else {
    fread(iso3166_url) %T>%
      fwrite("data-raw/ISO-3166-countries.csv") %>%
      .[]
  }
setnames(ISO3166, "alpha-3", "alpha_3")

expect_equal(.decode_country(c("United Kingdom", "Australia", "Australia",
                               NA)), 
             c("GBR", "AUS", "AUS",
               NA))

varIused <- NULL

G01

Persons

.region_persons <- 
  freadT(1, c(1, 9), "persons") %>%
  .[variable %pin% "Persons$"] %>%
  Mop

Persons by gender

freadT(1, c(1, 9), "persons") %>%
  .[variable %npin% "Persons$"] %>%
  extract_Sex %>%
  Mop

Persons by age

.region_by_age <- 
  freadT(1, c(10, 108), "persons") %>%
  .[variable %pin% "Persons$"] %>%
  extract_Age %>%
  Mop

.region_by_age %>%
  .[Age >= "15-19"] %>%
  .[, .(adults = sum(persons)), keyby = c(names(.)[1], "Year")] %>%
  assign(x = paste0(Region, "__Year_adults"), envir = .GlobalEnv)

Persons by age by sex

.region_by_age_sex <- 
  freadT(1, c(10, 108), "persons") %>%
  .[variable %npin% "Persons$"] %>%
  extract_Sex %>%
  extract_Age %>%
  Mop

.region_by_age %>%
  .[Age >= "15-19"] %>%
  .[, varI := 0L] %>%
  setnames("persons", "adults") %>%
  .[, varI := NULL] %>%
  set_cols_last("adults") %>%
  assign(x = paste0(Region, "__Year_Age_adults"), envir = .GlobalEnv)

.region_by_age_sex %>%
  .[Age >= "15-19"] %>%
  .[, .(adults = sum(persons)), keyby = c(names(.)[1], "Year", "Sex")] %>%
  assign(x = paste0(Region, "__Year_Sex_adults"), envir = .GlobalEnv)

Overseas visitors

freadT(1, c(109, 117), "overseas_visitors") %>%
  .[variable %pin% "Persons$"] %>%
  Mop(value.name = "overseas_visitors")

Overseas visitors by Sex

freadT(1, c(109, 117), "overseas_visitors") %>%
  .[variable %npin% "Persons$"] %>%
  extract_Sex %>%
  Mop(value.name = "overseas_visitors")

ATSI

provide_complementary_observation <- function(.data, vars = "Year",
                                              value.name = "persons",
                                              compl.var.name = "Other") {
  if (Region != "LGA") {
    meta_decoder <- SA16_decoder[ASGS_Structure == Region]
  } else {
    meta_decoder <-
      LGA_2016@data %>%
      as.data.table %>%
      .[, .(Census_Name_2016 = as.character(LGA_NAME16),
            Census_Code_2016 = paste0("LGA", (as.character(LGA_CODE16))))]
  }

  tempDT <- 
    get(paste0(Region, 
               "__",
               paste0(c(vars, if (value.name != "persons") value.name),
                      collapse = "_"))) %>%
    .[meta_decoder,
      on = c(paste0(names(.)[1], "==Census_Name_2016")),
      nomatch=0L] %>%
    setnames("Census_Code_2016", Region_key) %>%
    setnames(value.name, "persons")

  .data %>%
    copy %>%
    setnames(value.name, "persons") %>%
    .[, .(tot_in_persons = sum(persons)), keyby = c(Region_key, vars)] %>%
    .[tempDT, on = c(Region_key, vars)] %>%
    .[, persons := persons - tot_in_persons] %>%
    setnames("persons", value.name) %>%
    .[, "variable" := compl.var.name] %>%
    .[, .SD, .SDcols = c(Region_key, vars, "variable", value.name)] %>%
    rbind(.data, use.names = TRUE, fill = TRUE)
}


freadT(1, c(118, 144)) %>%
  .[variable %pin% "Persons$"] %>%
  provide_complementary_observation %>%
  .[, Aboriginal := FALSE] %>%
  .[, TorresStraitIslander := FALSE] %>%
  .[variable %pin% "persons_Aboriginal", Aboriginal := TRUE] %>%
  .[variable %pin% "persons_Torres_Strait", TorresStraitIslander := TRUE] %>%
  .[variable %pin% "Both", c("Aboriginal", "TorresStraitIslander") := TRUE] %>%
  Mop

freadT(1, c(118, 144)) %>%
  .[variable %npin% "Persons$"] %>%
  extract_Sex %>%
  provide_complementary_observation(vars = c("Year", "Sex")) %>%

  .[, Aboriginal := FALSE] %>%
  .[, TorresStraitIslander := FALSE] %>%
  .[variable %pin% "persons_Aboriginal", Aboriginal := TRUE] %>%
  .[variable %pin% "persons_Torres_Strait", TorresStraitIslander := TRUE] %>%
  .[variable %pin% "Both", c("Aboriginal", "TorresStraitIslander") := TRUE] %>%
  Mop

varIused <- c(varIused, 145:153)

freadT(1, c(154, 171)) %>%
  .[variable %pin% "Persons$"] %>%
  .[, BornAust := grepl("Birthplace_Australia", variable)] %>%
  Mop

freadT(1, c(154, 171)) %>%
  .[variable %npin% "Persons$"] %>%
  .[, BornAust := grepl("Birthplace_Australia", variable)] %>%
  extract_Sex %>%
  Mop

freadT(1, c(172, 189)) %>%
  .[variable %pin% "Persons$"] %>%
  .[, SpeaksEnglishOnly := grepl("English_only", variable)] %>%
  Mop

freadT(1, c(172, 189)) %>%
  .[variable %npin% "Persons$"] %>%
  .[, SpeaksEnglishOnly := grepl("English_only", variable)] %>%
  extract_Sex %>%
  Mop

freadT(1, c(190, 199)) %>%
  .[variable %pin% "Persons$"] %>%
  provide_complementary_observation %>%
  .[, AustCitizen := variable != "Other"] %>%
  Mop

freadT(1, c(190, 199)) %>%
  .[variable %npin% "Persons$"] %>%
  extract_Sex %>%
  provide_complementary_observation %>%
  .[, AustCitizen := variable != "Other"] %>%
  Mop

melt_using_suffix <- function(DT, suffix, suffix_is = "Year", id.cols = Region_key) {
   prefixes <- sub(suffix, "", setdiff(names(DT), Region_key))
   suffixes <- sub(paste0("^.*", suffix), "\\1", setdiff(names(DT), Region_key))

   decode_suffix <- data.table(temp = seq_along(unique(suffixes)),
                               new = unique(suffixes)) %>%
     .[, temp := as.character(temp)] %>%
     setnames(c("temp", "new"), c("_temp", suffix_is))

   melt.data.table(DT, 
                   id.vars = id.cols,
                   measure.vars = patterns(unique(prefixes)),
                   value.name = unique(prefixes),
                   variable.name = "_temp",
                   variable.factor = FALSE,
                   verbose = TRUE) %>%
     .[decode_suffix, on = "_temp"] %>%
     .[, "_temp" := NULL] %>%
     set_cols_first(c(Region_key, suffix_is)) %>%
     .[]
}

Mop_multiple <- function(DT) {
  DT[, Year := as.integer(Year)]
  DTnoms <- copy(names(DT))
  for (nom in DTnoms) {
    if (!(nom %chin% c(Region_key, "Year"))) {
      nom_ <-
        gsub("^([A-Z])", "\\L\\1\\E",
             x = gsub("_([A-za-z])", "\\U\\1\\E",
                      nom,
                      perl = TRUE),
             perl = TRUE)

      setnames(DT, nom, nom_)

      object_name <- 
        paste0(Region,
               "__",
               "Year",
               "_",
               nom_)

      assign(object_name,
             value = decodeRegion(DT[, .SD, .SDcols = c(Region_key, "Year", nom_)]),
             envir = .GlobalEnv)
    }
  }
}

T02

freadG(2)[] %>%
  melt_using_suffix("_Census_year_(2006|2011|2016)") %>%
  Mop_multiple

T03

freadT(3, vi = c(223, 1104))[] %>%
  .[variable %pin% "years_[0-9]{1,2}_(2006|2011|2016)"] %>%
  .[variable %pin% "Persons$"] %>%
  .[, Age.int := as.integer(sub("^.*years_([0-9]{1,2})_(2006|2011|2016).*$", "\\1", variable))] %>%
  provide_complementary_observation %>%
  Mop

freadT(3, vi = c(223, 1104))[] %>%
  .[variable %pin% "years_[0-9]{1,2}_(2006|2011|2016)"] %>%
  .[variable %npin% "Persons$"] %>%
  .[, Age.int := as.integer(sub("^.*years_([0-9]{1,2})_(2006|2011|2016).*$", "\\1", variable))] %>%
  extract_Sex %>%
  provide_complementary_observation %>%
  Mop

Grouped age

# Already used
varIused <- unique(sort(c(varIused, seq(223, 1104))))

freadT(4, c(1114, 1737)) %>%
  .[variable %npin% "Persons$"] %>%
  .[variable %npin% "Total"] %>%
  .[or(grepl("[0-9]{1,2}_[0-9]{1,2}_years", variable),
       grepl("85_years", variable))] %>%
  .[, MaritalStatus := sub("^.*years(?:_and_over)?_([A-Z].+)_(Males|Females)$", "\\1", variable)] %>%
  .[MaritalStatus == "Never_married", MaritalStatus := "Never married"] %>%
  extract_Age %>%
  setnames("Age", "Age5yr") %>%
  extract_Sex %>%
  Mop

freadT(7, vi = c(2710, 3141), value.name = "females") %>%
  .[!grepl("Total", variable)] %>%
  extract_Age %>%
  .[variable %pin% "No_child",    ChildrenEverBorn := 0L] %>%
  .[variable %pin% "One_child",   ChildrenEverBorn := 1L] %>%
  .[variable %pin% "Two_child",   ChildrenEverBorn := 2L] %>%
  .[variable %pin% "Three_child", ChildrenEverBorn := 3L] %>%
  .[variable %pin% "Four_child",  ChildrenEverBorn := 4L] %>%
  .[variable %pin% "Five_child",  ChildrenEverBorn := 5L] %>%
  .[variable %pin% "Six_or_more_child", ChildrenEverBorn := 6L] %>%
  setnames("Age", "Age5yr") %>%
  Mop(value.name = "females")

freadT(8, c(3142, 3483)) %>%
  .[variable %pin% "Persons$"] %>%
  .[variable %npin% "Total"] %>%
  .[, variable := sub("_20(06|11|16)_Census_", "_", variable)] %>%
  .[, variable := sub("_Persons", "", variable)] %>%
  setnames("variable", "CountryOfBirth") %>%
  decode_country %>%
  .[, .(persons = sum(persons),
        varI = first(varI)), keyby = c(Region_key, "Year", "CountryOfBirth")] %>%
  Mop

freadT(8, c(3142, 3483)) %>%
  .[variable %npin% "Persons$"] %>%
  .[variable %npin% "Total"] %>%
  .[, variable := sub("_20(06|11|16)_Census_", "_", variable)] %>%
  extract_Sex %>%
  .[, variable := sub("_(Males|Females)$", "", variable)] %>%
  setnames("variable", "CountryOfBirth") %>%
  decode_country %>%
  .[, .(persons = sum(persons),
        varI = first(varI)), keyby = c(Region_key, "Year", "CountryOfBirth", "Sex")] %>%
  Mop

Ancestory

freadT(10) %>%
  .[!grepl("Total", variable)] %>%
  .[variable %pin% "Persons$"] %>%
  .[grepl("Speaks_English_only", variable),
    LanguageSpoken := "English"] %>%
  .[variable %pin% "^Speaks_other_language_(.*)_20(06|11|16)_Census_.*$",
    LanguageSpoken := sub("^Speaks_other_language_(.*)_20(06|11|16)_Census_.*$", "\\1", variable)] %>%
  Mop

freadT(10) %>%
  .[!grepl("Total", variable)] %>%
  .[variable %npin% "Persons$"] %>%
  .[grepl("Speaks_English_only", variable),
    LanguageSpoken := "English"] %>%
  .[variable %pin% "^Speaks_other_language_(.*)_20(06|11|16)_Census_.*$",
    LanguageSpoken := sub("^Speaks_other_language_(.*)_20(06|11|16)_Census_.*$", "\\1", variable)] %>%
  extract_Sex %>%
  Mop

freadT(11) %>%
  .[!grepl("Total", variable)] %>%
  .[grepl("English_Very_well_or_well", variable), EnglishProficiency := "Speaks English well or very well"] %>%
  .[grepl("English_Not_well", variable), EnglishProficiency := "Speaks English not well or not at all"] %>%
  extract_Age %>%
  .[]

freadT(12) %>%
  .[!grepl("Total", variable)] %>%
  extract_Age %>%
  .[, Religion := gsub("^.*Census_([^0-9]+)_[0-9].*$", "\\1", variable)] %>%
  .[Religion == "Secular_Beliefs_and_Other_Spiritual_Beliefs_and_No_Religious_Affiliation",
    Religion := "No religion"] %>%
  .[Religion == "Religious_affiliation_not_stated",
    Religion := NA_character_] %>%
  .[Religion %pin% "Christ", Denomination := gsub("^Christ[a-z]+y_", "", Religion)] %>%
  .[Religion %pin% "Christ", Religion := "Christianity"] %>%
  .[Religion %enotin% c("Christianity", "Buddhism", "Hinduism", "Judaism", "Islam", "No religion"),
    Religion := "(Other)"] %>%
  .[] %>%
  .[variable %pin% "Presbyterian",
   Denomination := "Presbyterian Reformed"] %>%
  .[variable %pin% "Eastrn.Orthdox",
    Denomination := "Eastern Orthodox"] %>%
  .[variable %pin% "Orintal_Orthdx",
    Denomination := "Oriental Orthodox"] %>%
  .[variable %pin% "Sevnth.dy",
    Denomination := "Seventh Day Adventist"] %>%
  .[variable %pin% "Othr_Protestnt",
    Denomination := "Protestant (Other)"] %>%
  .[variable %pin% "Jeh.*v.*ahs.*Witn.*s.*es",
    Denomination := "Jehovah's Witnesses"] %>%
  .[variable %pin% "Oth.?r_Christian",
    Denomination := "(Other)"] %>%
  .[variable %pin% "Sikhism",
    Denomination := "Sikhism"] %>%
  .[variable %pin% "Aust.*Abor.*Trad.*Rel",
    Denomination := "Australian Aboriginal Traditional"] %>%
  .[variable %pin% "Other_Religions_Other_Religious_Groups",
    Denomination := "(Other)"] %>%
  .[variable %pin% "SB_OSB_NRA_SB",
    Denomination := "Secular beliefs"] %>%
  .[variable %pin% "SB_OSB_NRA_OSB",
    Denomination := "Other spiritual beliefs"] %>%
  .[, Religion := gsub("_", " ", Religion)] %>%
  .[, Denomination := gsub("_", " ", Denomination)] %>%
  .[Denomination == "Christian nfd",
    Denomination := NA_character_] %>%
  .[Denomination == "Churches of Christ",
    Denomination := "Churches Of Christ"] %>%
  .[Denomination == "Other Protestant",
    Denomination := "Protestant (Other)"] %>%
  .[Denomination == "Seventh day Adventist",
    Denomination := "Seventh Day Adventist"] %>%
  Mop

freadT(13, value.name = "students") %>%
  .[variable %pin% "Persons$"] %>%
  .[!grepl("^Total", variable)] %>%
  .[if_else(variable %pin% c("^(?:Technical|University)",
                             "^Other_type",
                             "^Infants_Primary",
                             "^Secondary"),
            grepl("Total", variable),
            TRUE)] %>%
  .[variable %pin% "Pre.school",     EduInstitutionType := "Pre-school"] %>%
  .[variable %pin% "Infants.Primary",EduInstitutionType := "Infants/Primary"] %>%
  .[variable %pin% "^Secondary",     EduInstitutionType := "Secondary"] %>%
  .[variable %pin% "^Tec.*Furt",     EduInstitutionType := "Technical or Further Educational Institution"] %>%
  .[variable %pin% "^Uni",           EduInstitutionType := "University or other tertiary"] %>%
  .[variable %pin% "^Oth",           EduInstitutionType := NA_character_] %>%
    .[, EduInstitutionType := factor(EduInstitutionType,
                                   levels = c(NA,
                                              "Infants/Primary",
                                              "Pre-school",
                                              "Secondary",
                                              "Technical or Further Educational Institution",
                                              "University or other tertiary"),
                                   ordered = TRUE)] %>%
  .[, .(students = sum(students)), keyby = c(Region_key, "Year", "EduInstitutionType")] %>%
  Mop("students")

freadT(13, value.name = "students") %>%
  .[!(variable %pin% "Persons$")] %>%
  .[!grepl("^Total", variable)] %>%
  .[if_else(variable %pin% c("^(?:Technical|University)",
                             "^Other_type",
                             "^Infants_Primary",
                             "^Secondary"),
            grepl("Total", variable),
            TRUE)] %>%
  .[variable %pin% "Pre.school",     EduInstitutionType := "Pre-school"] %>%
  .[variable %pin% "Infants.Primary",EduInstitutionType := "Infants/Primary"] %>%
  .[variable %pin% "^Secondary",     EduInstitutionType := "Secondary"] %>%
  .[variable %pin% "^Tec.*Furt",     EduInstitutionType := "Technical or Further Educational Institution"] %>%
  .[variable %pin% "^Uni",           EduInstitutionType := "University or other tertiary"] %>%
  .[variable %pin% "^Oth",           EduInstitutionType := NA_character_] %>%
  .[, EduInstitutionType := factor(EduInstitutionType,
                                   levels = c(NA,
                                              "Infants/Primary",
                                              "Pre-school",
                                              "Secondary",
                                              "Technical or Further Educational Institution",
                                              "University or other tertiary"),
                                   ordered = TRUE)] %>%
  extract_Sex %>%
  .[, .(students = sum(students)), keyby = c(Region_key, "Year", "EduInstitutionType", "Sex")] %>%
  Mop("students")

.extract_FamilyComposition <- function(v) {
  outputs <- 
    c("Couple family with no children",
      "Couple family with children", 
      "One parent family with children",
      "Other family",
      "Lone-person household", 
      "Group household")

  underscores <- gsub("[^A-Za-z]", "_",
                      gsub("One parent family with children",
                           "One parent family", 
                           outputs,
                           fixed = TRUE))

  regex <-
    paste0("(?:",
           paste0(underscores, 
                  collapse = ")|(?:"),
           ")")

  underscore_out <- 
    sub(paste0("^.*?(", regex, ").*?$"), 
        "\\1",
        v,
        perl = TRUE)

  outputs[match(underscore_out, underscores)]
}

extract_FamilyComposition <- function(.data) {
  outputs <- 
    c("Couple family with no children",
      "Couple family with children", 
      "One parent family with children",
      "Other family",
      "Lone-person household", 
      "Group household")

  underscores <- gsub("[^A-Za-z]", "_", outputs)

  req_regex <-
    paste0("(?:",
           paste0(underscores, 
                  collapse = ")|(?:"),
           ")")

  stopifnot("variable" %in% names(.data),
            all(grepl(req_regex, .data[["variable"]])))
  .data[, FamilyComposition := .extract_FamilyComposition(variable)]
  .data
}

expect_equal(.extract_FamilyComposition("2006_Census_Separate_house_Family_households_Couple_family_with_no_children"), 
             "Couple family with no children")

freadT(14, value.name =  "occupied_private_dwellings") %>%
  .[!(variable %pin% "Total$")] %$%
  .[, isFamily := variable %pin% "Family_households"] %>%
  .[(isFamily),
    FamilyComposition := gsub("_", " ",
                              gsub("^.*Family_households_",
                                   "",
                                   variable))] %>%
  .[not(isFamily),
    FamilyComposition := sub("^.*((?:Lone_person)|(?:Group)|(?:Other)).*$",
                             "\\1",
                             variable)] %>%
  .[, FamilyComposition := gsub("_", " ", FamilyComposition)] %>%
  .[FamilyComposition == "Other", FamilyComposition := "Other family"] %>%
  .[FamilyComposition == "Group", FamilyComposition := "Group household"] %>%
  .[FamilyComposition == "Lone person", FamilyComposition := "Lone-person household"] %>%
  .[]

#' @param x Values to be matched against
#' @param y Permitted decoded values. 
#' @param extract1 A regex with a capturing group. The part of \code{x} to be retained.
#' @param y.complete Must all values in y match with at least one value in \code{x}?
#' @return For every value in x, the closest match in y.
zmatch <- function(x, y, extract1 = "^(.*)$", delete.penalty = 0.01, sub.penalty = 0.2, y.complete = TRUE) {
  x <- sub(extract1, "\\1", x, perl = TRUE)
  Y <- gsub("[^A-Za-z]+", "_", y)
  distance_matrix <-
    adist(x, Y,
          costs = list(deletions = delete.penalty,
                       substitutions = sub.penalty))

  indexes <- apply(distance_matrix, 1, which.min)
  out <- y[indexes]

  if (y.complete && any(y[!is.na(y)] %notin% out)) {
    stop("Not all y were matched.\n\t",
         paste0(unique(y[y %notin% out]), sep = "\n\t"))
  }

  out
}

permitted_DwellingStructures <-
  c(NA_character_,
    "(Other)",
    "Flat/apartment",
    "Semi-detached, row/terrace house, townhouse etc.", 
    "Separate house")

test_that("zmatch", {
  x0 <- 'Semi_detached_row_or_terrace_house_townhouse_etc_with_Two_or_more_storeys_'
  expect_equal(zmatch(x0, permitted_DwellingStructures, y.complete = FALSE), 
               permitted_DwellingStructures[4])
})

freadT(15, value.name = "occupied_private_dwellings") %>%
  .[variable %pin% paste0("^(20(06|11|16))_Census_",
                          c("Separate_house", 
                            "Semi_detached_row_or_terrace_house_townhouse_etc_with_Total", 
                            "Flat_unit_or_apartment_Total",
                            "Other_dwelling_Total",
                            "Dwelling_structure_not_stated"),
                          "_Total$")] %>%
  .[, DwellingStructure := zmatch(variable, 
                                  permitted_DwellingStructures,
                                  extract1 = "^(?:20(?:06|11|16))_Census_(.*)(?:_Total)?_Total$")] %>% 
  # .[, DwellingStructure := sub("^(?:20(?:06|11|16))_Census_(.*)(?:_Total)?_Total$", "\\1", variable)] %>% 
  .[variable %pin% "Dwelling_structure_not_stated", DwellingStructure := NA_character_] %>%
  .[] %>%
  Mop("occupied_private_dwellings")

freadT(15, value.name = "occupied_private_dwellings") %>%
  .[variable %pin% paste0("^(20(06|11|16))_Census_",
                          c("Separate_house", 
                            "Semi_detached_row_or_terrace_house_townhouse_etc_with_Total", 
                            "Flat_unit_or_apartment_Total",
                            "Other_dwelling_Total",
                            "Dwelling_structure_not_stated"),
                          "_Number_of_persons_usually_resident")] %>%
  .[, DwellingStructure := zmatch(variable, 
                                  permitted_DwellingStructures,
                                  extract1 = "^(?:20(?:06|11|16))_Census_(.*)(?:_Total)?_Total$")] %>% 
  .[variable %pin% "Dwelling_structure_not_stated", DwellingStructure := NA_character_] %>%
  .[] %>%
   .[,
    UsualResidents.min := zmatch(variable, 
                                 c("One", "Two", "Three", "Four", "Five", "Six"),
                                 extract1 = "^.*persons_usually_resident_(.*)$")] %>%
  .[,
    UsualResidents.min := as.integer(match(UsualResidents.min,
                                           table = c("One", "Two", "Three", "Four", "Five", "Six")))] %>%
  Mop("occupied_private_dwellings")

freadT(18, value.name = "occupied_private_dwellings") %>% 
  .[!grepl("^Total", variable)] %>%
  .[variable %pin% "^(?:20(?:06|11|16))_Census_(?!Total).*Dwelling_structure_"] %>%
  .[, DwellingStructure := zmatch(variable, 
                                  permitted_DwellingStructures,
                                  extract1 = "^(?:20(?:06|11|16))_Census_.*Dwelling_structure_(.*)(?:_Total)?(?:_Total)?$",
                                  delete.penalty = 0.05)] %>% 
  .[grepl("Dwelling_structure_Not_stated", variable, ignore.case = TRUE),
    DwellingStructure := NA_character_] %>%
  .[!grepl("20.._Census_Total_Dwelling_structure_Separate_house", variable)] %>%
  .[] %>%
  .[, TenureType := sub("^(?:20(?:06|11|16))_Census_(.*)_Dwelling_structure_.*$",
                        "\\1",
                        variable)] %>%
  .[TenureType != "Rented_Total"] %>%
  .[TenureType == "Tenure_type_not_stated", TenureType := NA_character_] %>%
  .[, TenureType := gsub("_", " ", TenureType)] %>%
  .[] %>%
  Mop("occupied_private_dwellings")

permitted_Landlords <-
  c(NA, "(Other)", "Co-op/church group etc", "Person not in same household", 
    "Real estate agent", "State or territory housing authority")

freadT(19, value.name = "occupied_private_dwellings_being_rented") %>%
  .[!(variable %pin% "_Total$")] %>%
  .[!grepl("^20.._Census_Total", variable)] %>%
  .[, variable := gsub("Housing_co_operative_community_church_group", "Co-op/church group etc", variable)] %>%
  .[, variable := gsub("State_territory", "State or territory", variable)] %>%
  .[!grepl("Landlord_type_Not_stated", variable),
    Landlord := zmatch(variable, permitted_Landlords, extract1 = "^.*Landlord_type_(.*)$")] %>%
  .[variable %pin% "Housing.co.operative",
    Landlord := "Co-op/church group etc"] %>%
  .[variable %pin% "^20.._Census_([0-9]+)_.*$",
    Rent.min := 52.5 * as.integer(sub("^20.._Census_([0-9]+)_.*$", "\\1", variable))] %>%
  .[, Rent.min := as.integer(Rent.min)] %>%
  .[] %>%
  Mop("occupied_private_dwellings_being_rented")

freadT(20, value.name = "occupied_private_dwellings_being_rented") %>%
  rbind(freadT(21, value.name = "occupied_private_dwellings_being_rented")) %>%
  .[!(variable %pin% "_Total$")] %>%
  .[!(variable %pin% "20.._Census_Total")] %>%
  .[variable %pin% "^20.._Census_([0-9]+)_.*$",
    Rent.min := 52.5 * as.integer(sub("^20.._Census_([0-9]+)_.*$", "\\1", variable))] %>%
  .[, Rent.min := as.integer(Rent.min)] %>%

  .[variable %pin% "Couple_family", FamilyComposition := "Couple family"] %>%
  .[variable %pin% "One.parent", FamilyComposition := "One-parent family"] %>%

  .[, HasDependantStudent := not(variable %pin% "no_dependent_students$")] %>%
  .[, HasChild := not(variable %pin% "With_no_children(_under_15)?")] %>%
  .[!grepl("With_no_children$", variable),
    HasNonDependentChild := variable %pin% "With_non_dependent_children_only"] %>%

  .[] %>%
  Mop("occupied_private_dwellings_being_rented")

freadT(22, value.name = "families_with_children") %>%
  rbind(freadT(23, value.name = "families_with_children")) %>%
  .[!grepl("Total", variable)] %>%
  .[, CoupleFamily := grepl("Couple_family", variable)] %>%
  .[, Children := match(zmatch(variable,
                               y = c("One", "Two", "Three", "Four"),
                               extract1 = "^.*family_with_(.*)_child(ren)?$"), 
                        table = c("One", "Two", "Three", "Four", "Five"))] %>%
  .[!grepl("Partial_income_stated|All_incomes_not_stated|Negative", variable),
    TotalFamilyIncome.min := 52L * as.integer(sub("^.*Census_([0-9]+)_([0-9]+|or_more)_(Couple|One_parent).*$", "\\1", variable))] %>%
  .[grepl("Negative", variable), TotalFamilyIncome.min := -1L] %>%
  .[, list(families_with_children = sum(families_with_children),
           # hack to allow Mop() to proceed
           varI = first(varI)),
    keyby = c(Region_key, "Year", "CoupleFamily", "Children", "TotalFamilyIncome.min")] %>%
  Mop("families_with_children")

freadT(24, value.name = "occupied_private_dwellings_being_rented") %>%
  .[!grepl("Total", variable)] %>%
  # Mistake in data pack 150-224 should be 200-224
  .[, variable := sub("Rent_150_224", "Rent_200_224", variable)] %>%
  # If any income not stated, just set it as NA:
  .[, variable := sub("Partial_income_stated", "All_incomes_not_stated", variable)] %>%
  .[, 
    .(occupied_private_dwellings_being_rented = sum(occupied_private_dwellings_being_rented),
      varI = first(varI)),
    keyby = c(Region_key, "Year", "variable")] %>%
  .[variable %pin% "20.._Census_[0-9]",
    TotalHouseholdIncome.min := 52L * as.integer(sub("^.*Census_([0-9]+)_.*$", "\\1", variable))] %>%
  .[grepl("Negative", variable), TotalHouseholdIncome.min := -1L] %>%
  .[variable %pin% "^20.._Census_.*_Rent_[0-9]",
    Rent.min := 52.5 * as.integer(sub("^20.._Census_.*_Rent_([0-9]+).*$", "\\1", variable))] %>%
  .[, Rent.min := as.integer(Rent.min)] %>%
  Mop("occupied_private_dwellings_being_rented")

# Not usable
freadT(25, value.name = "families") %>%
  .[!grepl("Total", variable)] %>%
  .[, FamilyComposition := zmatch(variable,
                                  c("Couple family", "One-parent family"))] %>%
  .[] %>%
  .[, HasChildren := !grepl("family_with_no_children_(?!under_15)", variable)] %>%
  .[, HasChildrenUnder15 := HasChildren & !grepl("no_children_under_15|non_dependent", variable)] %>%
  .[, HasDependentStudents := grepl("((?<!(?:no))_dependent_students)", variable)] %>%
  .[variable %pin% "Mortgage_repayment_[0-9]",
    MortgageRepayment.min := as.integer(sub("^.*Mortgage_repayment_([0-9]+).*$", "\\1", variable))] %>%
  .[] %>%
  Mop("families")

# Selected (percent and persons in same table)
fread29 <- suppressWarnings(freadT(29))

adult_population <-
  fread29 %>%
  .[variable %pin% "^Persons_aged_15_years_and_over.*Persons$"] %>%
  .[, .(tot_adults = persons), keyby = c(Region_key, "Year")]

# Labour_force_status
fread29 %>%
  .[endsWith(variable, "Persons")] %>%
  .[variable %pin% c("^Labour_force_status", 
                     "^Not_in_the_labour_force")] %>%
  setnames("persons", "adults") %>%
  .[!grepl("Labour_force_status_Total", variable)] %>%
  .[variable %pin% "^Labour_force_status",
     LabourForceStatus := zmatch(variable, 
                                 c("Employed (full-time)",
                                   "Employed (part-time)",
                                   "Employed (away from work)",
                                   "Unemployed (looking for work)"),
                                 extract1 = "^Labour_force_status_(.*)_20.*$")] %>%
  .[variable %pin% "^Not_in_the_labour_force",
    LabourForceStatus := "Not in the labour force"] %>%

  # Need to account for 'not applicable' to ensure totals 
  # sum to adult population
  .[adult_population, on = c(Region_key, "Year")] %>%
  .[, residual_adults := tot_adults - sum(adults), keyby = c(Region_key, "Year")] %>%
  .[, .SD, .SDcols = c(Region_key,
                       "varI",
                       "Year", "LabourForceStatus", "adults", "residual_adults")] %>%
  melt.data.table(measure.vars = grep("adults", names(.), value = TRUE),
                  value.name = "adults") %>%
  .[variable == "residual_adults", LabourForceStatus := NA_character_] %>%
  unique(by = c(Region_key, "Year", "LabourForceStatus")) %>%
  Mop("adults")

adult_population <-
  fread29 %>%
  .[variable %pin% "^Persons_aged_15_years_and_over.*(Males|Females)$"] %>%
  extract_Sex %>%
  .[, .(tot_adults = persons), keyby = c(Region_key, "Sex", "Year")]

fread29 %>%
  .[!endsWith(variable, "Persons")] %>%
  extract_Sex %>%
  .[variable %pin% c("^Labour_force_status", 
                     "^Not_in_the_labour_force")] %>%
  setnames("persons", "adults") %>%
  .[!grepl("Labour_force_status_Total", variable)] %>%
  .[variable %pin% "^Labour_force_status",
     LabourForceStatus := zmatch(variable, 
                                 c("Employed (full-time)",
                                   "Employed (part-time)",
                                   "Employed (away from work)",
                                   "Unemployed (looking for work)"),
                                 extract1 = "^Labour_force_status_(.*)_20.*$")] %>%
  .[variable %pin% "^Not_in_the_labour_force",
    LabourForceStatus := "Not in the labour force"] %>%

  # Need to account for 'not applicable' to ensure totals 
  # sum to adult population
  .[adult_population, on = c(Region_key, "Year", "Sex")] %>%
  .[, residual_adults := tot_adults - sum(adults), keyby = c(Region_key, "Year", "Sex")] %>%
  .[, .SD, .SDcols = c(Region_key,
                       "varI",
                       "Year", "LabourForceStatus", "Sex", "adults", "residual_adults")] %>%
  melt.data.table(measure.vars = grep("adults", names(.), value = TRUE),
                  value.name = "adults") %>%
  .[variable == "residual_adults", LabourForceStatus := NA_character_] %>%
  unique(by = c(Region_key, "Year", "Sex", "LabourForceStatus")) %>%
  Mop("adults")

fread29 %>%
  .[endsWith(variable, "Persons")] %>%
  .[startsWith(variable, "Percent")] %>%
  setnames("persons", "value") %>%
  .[, variable := sub("^Percent_(.*)_20...*$", "\\1", variable)] %>%
  .[, variable := gsub("_([A-Za-z])", "\\U\\1\\E", variable)] %>%
  .[, variable := sub("^([A-Z])", "\\L\\1\\E", variable)] %>%
  dcast.data.table(formula = as.formula(paste0(Region_key, " + Year", "~", "variable")),
                   value.var = "value") %>%
  .[] %>%

  {
    dot <- .
    for (nom in setdiff(names(dot), c(Region_key, "Year"))) {
      out <- dot[, .SD, .SDcols = c(Region_key, "Year", nom)]
      Mop(out, value.name = nom)
      # assign(paste0(Region, "__Year_", nom),
      #        value = decodeRegion(dot[, .SD, .SDcols = c(Region_key, "Year", nom)]),
      #        envir = .GlobalEnv)
    }
  }

fread29 %>%
  .[!endsWith(variable, "Persons")] %>%
  extract_Sex %>%
  .[startsWith(variable, "Percent")] %>%
  setnames("persons", "value") %>%
  .[, variable := sub("^Percent_(.*)_20...*$", "\\1", variable)] %>%
  .[, variable := gsub("_([A-Za-z])", "\\U\\1\\E", variable)] %>%
  .[, variable := sub("^([A-Z])", "\\L\\1\\E", variable)] %>%
  dcast.data.table(formula = as.formula(paste0(Region_key, " + Year + Sex", "~", "variable")),
                   value.var = "value") %>%
  .[] %>%

  {
    dot <- .
    for (nom in setdiff(names(dot), c(Region_key, "Year", "Sex"))) {
      out <- dot[, .SD, .SDcols = c(Region_key, "Year", "Sex", nom)]
      Mop(out, value.name = nom)
      # assign(paste0(Region, "__Year_Sex_", nom),
      #        value = decodeRegion(dot[, .SD, .SDcols = c(Region_key, "Year", "Sex", nom)]),
      #        envir = .GlobalEnv)
    }

  }

fread29 %>%
  .[endsWith(variable, "Persons")] %>%
  .[startsWith(variable, "Non_school_qualification")] %>%
  .[, variable := sub("^Non_school_qualifications?_(.*)_20.._Census_Persons$", "\\1", variable)] %>%
  .[!endsWith(variable, "Total")] %>%
  .[, NonSchoolQualification := gsub("_", " ", 
                                     sub("Certificate_Certificate",
                                         "Certificate",
                                         sub("_(Degree_)?Level", "", 
                                             sub("(Certificate_Level_){2}",
                                                 "Certificate_Level_",
                                                 variable))))] %>%
  .[, NonSchoolQualification := factor(NonSchoolQualification,
                                       levels = c("Certificate nfd",
                                                  "Certificate I and II Level",
                                                  "Certificate III and IV Level",
                                                  "Advanced Diploma and Diploma",
                                                  "Bachelor",
                                                  "Graduate Diploma and Graduate Certificate",
                                                  "Postgraduate"),
                                       ordered = TRUE)] %>%
  provide_complementary_observation %>%
  .[] %>%
  Mop

fread29 %>%
  .[!endsWith(variable, "Persons")] %>%
  extract_Sex %>%
  .[startsWith(variable, "Non_school_qualification")] %>%
  .[, variable := sub("^Non_school_qualifications?_(.*)_20.._Census_(Males|Females)$", "\\1", variable)] %>%
  .[!endsWith(variable, "Total")] %>%
  .[, NonSchoolQualification := gsub("_", " ", 
                                     sub("Certificate_Certificate",
                                         "Certificate",
                                         sub("_(Degree_)?Level", "", 
                                             sub("(Certificate_Level_){2}",
                                                 "Certificate_Level_",
                                                 variable))))] %>%
  .[, NonSchoolQualification := factor(NonSchoolQualification,
                                       levels = c("Certificate nfd",
                                                  "Certificate I and II Level",
                                                  "Certificate III and IV Level",
                                                  "Advanced Diploma and Diploma",
                                                  "Bachelor",
                                                  "Graduate Diploma and Graduate Certificate",
                                                  "Postgraduate"),
                                       ordered = TRUE)] %>%
  provide_complementary_observation(vars = c("Year", "Sex")) %>%
  .[] %>%
  Mop

fread29 %>%
  .[endsWith(variable, "Persons")] %>%
  .[startsWith(variable, "Migration")] %>%
  .[grepl("1_year_ago", variable)] %>%
  .[, SameAddress1YearAgo := grepl("Lived_at_same_address_1_year_ago", variable)] %>%
  .[] %>%
  provide_complementary_observation %>%
  Mop

fread29 %>%
  .[endsWith(variable, "Persons")] %>%
  .[startsWith(variable, "Migration")] %>%
  .[grepl("5_years_ago", variable)] %>%
  .[, SameAddress5YearsAgo := grepl("Lived_at_same_address_5_years_ago", variable)] %>%
  .[] %>%
  provide_complementary_observation %>%
  Mop

fread29 %>%
  .[!endsWith(variable, "Persons")] %>%
  extract_Sex %>%
  .[startsWith(variable, "Migration")] %>%
  .[grepl("1_year_ago", variable)] %>%
  .[, SameAddress1YearAgo := grepl("Lived_at_same_address_1_year_ago", variable)] %>%
  .[] %>%
  provide_complementary_observation(vars = c("Year", "Sex")) %>%
  Mop

fread29 %>%
  .[!endsWith(variable, "Persons")] %>%
  extract_Sex %>%
  .[startsWith(variable, "Migration")] %>%
  .[grepl("5_years_ago", variable)] %>%
  .[, SameAddress5YearsAgo := grepl("Lived_at_same_address_5_years_ago", variable)] %>%
  .[] %>%
  provide_complementary_observation(vars = c("Year", "Sex")) %>%
  Mop

freadT(30, value.name = "families") %>%
  .[!endsWith(variable, "Total")] %>%
  .[!grepl("Census_Total", variable)] %>%
  .[grepl("Couple|Parent", variable, ignore.case = TRUE),
    FamilyComposition := zmatch(variable, y = c("Couple", "One parent"))] %>%
  .[] %>%
  .[variable %pin% "Family_income_[0-9]",
    TotalFamilyIncome.min := 52L * as.integer(sub("^.*Family_income_([0-9]+)_.*$", "\\1", variable))] %>%
  .[variable %pin% "Negative_Nil_income$", 
    TotalFamilyIncome.min := -1L] %>%
  .[, variable := sub("_Family_income.*$", "", variable)] %>%
  .[!endsWith(variable, "Total")] %>%
  .[!grepl("Labour_force(_status)?_not_stated", variable),
    c("FullTimeWorkers", 
      "PartTimeWorkers",
      "AwayFromWork", 
      "NotWorking") := 0L] %>%
  .[variable %pin% "(One_e|E)mployed_full_time", FullTimeWorkers := 1L] %>%
  .[variable %pin% "[oO]ne(_employed)?_part_time", PartTimeWorkers := 1L] %>%
  .[variable %pin% "Employed_part_time", PartTimeWorkers := 1L] %>%
  .[variable %pin% "one_away_from_work", AwayFromWork := 1L] %>%
  .[variable %pin% "Employed_away_from_work$", AwayFromWork := 1L] %>%
  .[variable %pin% "one_not_working", NotWorking := 1L] %>%
  .[variable %pin% "Both_employed_full_time", FullTimeWorkers := 2L] %>%
  .[variable %pin% "Both_employed_part_time", PartTimeWorkers := 2L] %>%
  .[variable %pin% "Both_employed_away_from_work", AwayFromWork := 2L] %>%
  .[variable %pin% "Both_not_working", NotWorking := 2L] %>%
  .[variable %pin% "Unemployed", NotWorking := 1L] %>%
  .[] %>%
  .[grepl("children", variable),
    HasChildren := !grepl("with_no_children", variable)] %>%
  .[] %>%
  Mop("families",
      suborder = c("CoupleFamily",
                   "HasChild",
                   "TotalFamilyIncome.min",
                   "FullTimeWorkers",
                   "PartTimeWorkers",
                   "AwayFromWork", 
                   'NotWorking',
                   "families")
      ) %>%
  .[]

freadT(31, value.name = "adults") %>%
  .[grepl("20.._Census_Persons", variable)] %>%
  .[!grepl("Total$", variable)] %>%
  .[!grepl("20.._Census_Persons_Total", variable)] %>%
  extract_Age %>%
  .[, variable := sub("20.._Census_Persons_(.*?)_[1-9].*", "\\1", variable)] %>%
  .[] %>%
  .[variable != "Certificate_Level_Total"] %>%

  # Set both to NA
  .[variable == "Level_of_education_inadequately_described",
    variable := "Level_of_education_not_stated"] %>%
  .[variable == "Level_of_education_not_stated",
    adults := sum(adults),
    by = c(Region_key, "Year", "Age")] %>%
  unique(by = setdiff(names(.), "varI")) %>%

  .[!grepl("Level_of_education_not_stated", variable),
    NonSchoolQualification := zmatch(variable,
                                     c("Certificate nfd", 
                                       "Certificate I and II Level",
                                       "Certificate III and IV Level", 
                                       "Advanced Diploma and Diploma",
                                       "Bachelor",
                                       "Graduate Diploma and Graduate Certificate", 
                                       "Postgraduate"))] %>%
  .[, NonSchoolQualification := factor(NonSchoolQualification,
                                       levels = c("Certificate nfd",
                                                  "Certificate I and II Level",
                                                  "Certificate III and IV Level",
                                                  "Advanced Diploma and Diploma",
                                                  "Bachelor",
                                                  "Graduate Diploma and Graduate Certificate",
                                                  "Postgraduate"),
                                       ordered = TRUE)] %>%
  .[] %>%
  Mop("adults")

freadT(31, value.name = "adults") %>%
  .[grepl("20.._Census_(Males|Females)", variable)] %>%
  extract_Sex %>%
  .[!grepl("Total$", variable)] %>%
  .[!grepl("20.._Census_(Males|Females)_Total", variable)] %>%
  extract_Age %>%
  .[, variable := sub("20.._Census_(?:Males|Females)_(.*?)_[1-9].*", "\\1", variable)] %>%
  .[] %>%
  .[variable != "Certificate_Level_Total"] %>%

  # Set both to NA
  .[variable == "Level_of_education_inadequately_described",
    variable := "Level_of_education_not_stated"] %>%
  .[variable == "Level_of_education_not_stated",
    adults := sum(adults),
    by = c(Region_key, "Year", "Age", "Sex")] %>%
  unique(by = setdiff(names(.), "varI")) %>%

  .[!grepl("Level_of_education_not_stated", variable),
    NonSchoolQualification := zmatch(variable,
                                     c("Certificate nfd", 
                                       "Certificate I and II Level",
                                       "Certificate III and IV Level", 
                                       "Advanced Diploma and Diploma",
                                       "Bachelor",
                                       "Graduate Diploma and Graduate Certificate", 
                                       "Postgraduate"))] %>%
  .[, NonSchoolQualification := factor(NonSchoolQualification,
                                       levels = c("Certificate nfd",
                                                  "Certificate I and II Level",
                                                  "Certificate III and IV Level",
                                                  "Advanced Diploma and Diploma",
                                                  "Bachelor",
                                                  "Graduate Diploma and Graduate Certificate",
                                                  "Postgraduate"),
                                       ordered = TRUE)] %>%
  .[] %>%
  Mop("adults")

freadT(32, value.name = "adults") %>%
  .[grepl("20.._Census_(Persons)", variable)] %>%
  .[!grepl("Total$", variable)] %>%
  .[!grepl("20.._Census_(Persons)_Total", variable)] %>%
  extract_Age %>%
  .[, variable := sub("20.._Census_(?:Persons)_(.*?)_[1-9].*", "\\1", variable)] %>%
  .[] %>%
  # Set both to NA
  .[variable == "Field_of_study_inadequately_described",
    variable := "Field_of_study_not_stated"] %>%
  .[variable == "Field_of_study_not_stated",
    adults := sum(adults),
    by = c(Region_key, "Year", "Age")] %>%
  unique(by = setdiff(names(.), "varI")) %>%
  .[, FOE := gsub("_", " ", variable)] %>%
  .[] %>%
  Mop("adults")

freadT(32, value.name = "adults") %>%
  .[grepl("20.._Census_(Males|Females)", variable)] %>%
  extract_Sex %>%
  .[!grepl("Total$", variable)] %>%
  .[!grepl("20.._Census_(Males|Females)_Total", variable)] %>%
  extract_Age %>%
  .[, variable := sub("20.._Census_(?:Males|Females)_(.*?)_[1-9].*", "\\1", variable)] %>%
  # Set both to NA
  .[variable == "Level_of_education_inadequately_described",
    variable := "Level_of_education_not_stated"] %>%
  .[variable == "Level_of_education_not_stated",
    adults := sum(adults),
    by = c(Region_key, "Year", "Age", "Sex")] %>%
  unique(by = setdiff(names(.), "varI")) %>%
  .[, FOE := gsub("_", " ", variable)] %>%
  .[] %>%
  Mop("adults")

freadT(33, value.name = "adults") %>%
  .[!grepl("Males|Females", variable)] %>%
  .[!grepl("Labour_force_status_Total", variable)] %>%
  .[!grepl("20.._Census_Persons_Total", variable)] %>%
  .[!endsWith(variable, "Total")] %>%
  extract_Age %>%
  setnames("Age", "Age5yr") %>%
  .[] %>%
  .[, variable := sub("^.*_([A-Z][^A-Z]+)$", "\\1", variable)] %>%
  .[] %>%
  .[variable %enotin% "Total_labour_force"] %>%
  .[] %>%
  .[variable %ein% "Worked_full_time", LabourForceStatus := "Employed (full-time)"] %>%
  .[variable %ein% "Worked_part_time", LabourForceStatus := "Employed (part-time)"] %>%
  .[variable %ein% "Away_from_work",LabourForceStatus := "Employed (away from work)"] %>%
  .[variable %ein% "Hours_worked_not_stated",LabourForceStatus := "Employed (hours worked not stated)"] %>%
  .[variable %ein% "Looking_for_full_time_work",LabourForceStatus := "Unemployed (looking for full-time work)"] %>%
  .[variable %ein% "Looking_for_part_time_work",LabourForceStatus := "Unemployed (looking for part-time work)"] %>%
  .[variable %ein% "Not_in_the_labour_force",LabourForceStatus := "Not in the labour force"] %>%
  .[] %>% 
  Mop("adults")

freadT(33, value.name = "adults") %>%
  .[grepl("Males|Females", variable)] %>%
  extract_Sex %>%
  .[!grepl("Labour_force_status_Total", variable)] %>%
  .[!grepl("20.._Census_(Males|Females)_Total", variable)] %>%
  .[!endsWith(variable, "Total")] %>%
  extract_Age %>%
  setnames("Age", "Age5yr") %>%
  .[] %>%
  .[, variable := sub("^.*_([A-Z][^A-Z]+)$", "\\1", variable)] %>%
  .[] %>%
  .[variable %enotin% "Total_labour_force"] %>%
  .[] %>%
  .[variable %ein% "Worked_full_time", LabourForceStatus := "Employed (full-time)"] %>%
  .[variable %ein% "Worked_part_time", LabourForceStatus := "Employed (part-time)"] %>%
  .[variable %ein% "Away_from_work",LabourForceStatus := "Employed (away from work)"] %>%
  .[variable %ein% "Hours_worked_not_stated",LabourForceStatus := "Employed (hours worked not stated)"] %>%
  .[variable %ein% "Looking_for_full_time_work",LabourForceStatus := "Unemployed (looking for full-time work)"] %>%
  .[variable %ein% "Looking_for_part_time_work",LabourForceStatus := "Unemployed (looking for part-time work)"] %>%
  .[variable %ein% "Not_in_the_labour_force",LabourForceStatus := "Not in the labour force"] %>%
  .[] %>% 
  Mop("adults")

freadT(34, value.name = "workers") %>%
  .[!grepl("Males|Females", variable)] %>%
  .[, variable := sub("_20.._Census_Persons$", "", variable)] %>%
  .[variable != "Total"] %>%
  .[variable != "Inadequately_described_Not_stated",
    IndustryOfEmployment := gsub("_", " ", variable)] %>%

  Mop("workers")

freadT(34, value.name = "workers") %>%
  .[grepl("Males|Females", variable)] %>%
  extract_Sex %>%
  .[, variable := sub("_20.._Census_(Males|Females)$", "", variable)] %>%
  .[variable != "Total"] %>%
  .[variable != "Inadequately_described_Not_stated",
    IndustryOfEmployment := gsub("_", " ", variable)] %>%
  Mop("workers")

freadT(35, value.name = "workers") %>%
  .[!grepl("Males|Females", variable)] %>%
  .[, variable := sub("_20...*", "", variable)] %>%
  .[variable != "Total"] %>%
  .[variable != "Inadequately_described_Not_stated", 
    Occupation := gsub("_", " ", variable)] %>%
  .[] %>%
  Mop("workers")

freadT(35, value.name = "workers") %>%
  .[grepl("Males|Females", variable)] %>%
  extract_Sex %>%
  .[, variable := sub("_20...*", "", variable)] %>%
  .[variable != "Total"] %>%
  .[variable != "Inadequately_described_Not_stated", 
    Occupation := gsub("_", " ", variable)] %>%
  .[] %>%
  Mop("workers")

document <- function(the_table) {
  stopifnot(dir.exists("R"))
  file.R <- file.path("R", paste0(the_table, ".R"))

  if (file.exists(file.R)) {
    file.remove(file.R)
  }

  DT <- get(the_table)
  value_name <- names(DT)[ncol(DT)]


  cat(paste0("#' @title ", paste0(names(DT)[-c(1:2, ncol(DT))],
                                  collapse = ", "),
             " by ", Region, ", Year"),
      "\n",
      file = file.R)

  cat(paste0("#' @description ",
             if (value_name == tolower(value_name) && 
                 value_name != "unemployment") {
               paste0("Number of ", value_name)
             } else {
               # 'medianIncomeHouse' --> 'Median income house'
               sub("^(.)",
                   "\\U\\1\\E",
                   paste0(unlist(strsplit(value_name, split = "(?<=[a-z](?=(?:[A-Z])))", perl = TRUE))))
             }, 
             paste0(names(DT)[-c(1:2, ncol(DT))],
                    collapse = ", "),
             " by ", Region, ", Year"),
      "\n",
      file = file.R, append = TRUE)

  cat(paste0("#' @format ",
             prettyNum(nrow(DT), big.mark = ","), 
             " observations and ", ncol(DT), " variables."),
      "\n",
      "\n",
      file = file.R, append = TRUE)

  cat(paste0('"', the_table, '"'), file = file.R, append = TRUE)
  cat("\n", file = file.R, append = TRUE)
}

# Print side-effect ok
stopifnot(dir.exists("data"))
ced_tbls <-
  tables(silent = TRUE) %>%
  use_series("NAME") %>%
  grep(pattern = paste0("^", Region), x = ., value = TRUE)

suppressMessages({library(Census2016.spec)})
test_check(data_list = mget(ced_tbls), show.progress = FALSE)

data_path <- "data"
provide.dir("tsv")
provide.dir(file.path("tsv", Region))
tsv_path <- file.path("tsv", Region)


try({
prior_data_size <- 
  lapply(list.files(path = data_path,
                    full.names = TRUE),
         file.info) %>%
  rbindlist %$%
  sum(size) %>%
  divide_by(1024^2) %>%
  round(2)

prior_region_data_size <-
  lapply(list.files(path = data_path,
                    pattern = Region,
                    full.names = TRUE),
         file.info) %>%
  rbindlist %$%
  sum(size) %>%
  divide_by(1024^2) %>%
  round(2)
}, silent = TRUE)

region_dtas <- c(list.files(path = "data/",
                            pattern = Region,
                            full.names = TRUE),
                 list.files(path = file.path("data-raw", "data", Region),
                            pattern = paste0(Region, ".*csv$"),
                            full.names = TRUE))

vapply(region_dtas,
       file.remove,
       logical(1)) %>%
  all %>%
  stopifnot

library(foreach)
library(doParallel)
cl <- makeCluster(8)
registerDoParallel(cl, cores = 8)

foreach(tbl = ced_tbls, .inorder = FALSE, .export = c(ced_tbls, data_path)) %dopar% {
  save(list = tbl,
       file = file.path(data_path, paste0(tbl, ".rda")),
       compress = "gzip",
       compression_level = 9)
}
stopImplicitCluster()
stopCluster(cl)


fwrite <- function(...) data.table::fwrite(..., sep = "\t")
for (tbl in ced_tbls) {

  file.tsv <- file.path(tsv_path, paste0(tbl, ".tsv"))

  .ced_tbl <- get(tbl)
  fwrite(.ced_tbl, file = file.tsv)

  switch (ceiling(file.size(file.tsv) / (95 * 1024^2)),
          {
            NULL
          },
          {
            DT <- .ced_tbl
            lastv <- .subset2(DT, last(names(DT)))
            if (is.integer(lastv) && min(lastv) == 0L) {
              DT <- DT[lastv > 0L]
            }
            NN <- nrow(DT)
            DT1 <- DT[seq_len(NN %/% 2)]
            DT2 <- DT[-seq_len(NN %/% 2)]
            file.remove(file.tsv)
            fwrite(DT1, sub("\\.tsv$", "1.tsv", file.tsv))
            fwrite(DT2, sub("\\.tsv$", "2.tsv", file.tsv))
          },
          {
            DT <- .ced_tbl
            lastv <- .subset2(DT, last(names(DT)))
            if (is.integer(lastv) && min(lastv) == 0L) {
              DT <- DT[lastv > 0L]
            }
            NN <- nrow(DT)
            DT1 <- DT[seq_len(NN %/% 2)]
            DT2 <- DT[-seq_len(NN %/% 2)]
            DT11 <- DT1[seq_len(nrow(DT1) %/% 2)]
            DT12 <- DT1[-seq_len(nrow(DT1) %/% 2)]
            DT21 <- DT2[seq_len(nrow(DT2) %/% 2)]
            DT22 <- DT2[-seq_len(nrow(DT2) %/% 2)]
            stopifnot(nrow(DT11) + nrow(DT12) +
                        nrow(DT21) + nrow(DT22) == nrow(DT))
            file.remove(file.tsv)
            fwrite(DT11, sub("\\.tsv$", "11.tsv", file.tsv))
            fwrite(DT12, sub("\\.tsv$", "12.tsv", file.tsv))
            fwrite(DT21, sub("\\.tsv$", "21.tsv", file.tsv))
            fwrite(DT22, sub("\\.tsv$", "22.tsv", file.tsv))
          })
}


region_dtas <- list.files(path = "data/",
                          pattern = Region,
                          full.names = TRUE)
# tools::resaveRdaFiles(paths = region_dtas)

current_region_data_size <-
  lapply(list.files(path = data_path,
                    pattern = Region,
                    full.names = TRUE),
         file.info) %>%
  rbindlist %$%
  sum(size) %>%
  divide_by(1024^2) %>%
  round(2)

current_data_size <- 
  lapply(list.files(path = data_path,
                    full.names = TRUE),
         file.info) %>%
  rbindlist %$%
  sum(size) %>%
  divide_by(1024^2) %>%
  round(2)

cat("\n")
cat("Region:\t", prior_region_data_size, "MB  ===> ", current_region_data_size, "MB\n")
cat("Total: \t", prior_data_size, "MB  ===> ", current_data_size, "MB\t")

# Print side-effect ok
ced_tbls <-
  tables(silent = TRUE) %>%
  use_series("NAME") %>%
  grep(pattern = paste0("^", Region), x = ., value = TRUE)

if (dir.exists("data")) {

  lapply(list.files(path = "data/",
                    full.names = TRUE),
         file.info) %>%
    rbindlist %$%
    sum(size) %>%
    divide_by(1024^2) %>%
    cat

  region_dtas <- list.files(path = "data/",
                            pattern = Region,
                            full.names = TRUE)

  vapply(region_dtas,
         file.remove,
         logical(1)) %>%
    all %>%
    stopifnot
} else {
  provide.dir("data")
}

for (tbl in ced_tbls) {
  document(tbl)
  save(list = tbl, file = file.path("data", paste0(tbl, ".rda")))
}

region_dtas <- list.files(path = "data/",
                          pattern = Region,
                          full.names = TRUE)
size_of_data <- 
  lapply(list.files(path = "data/",
                    pattern = Region,
                    full.names = TRUE),
         file.info) %>%
  rbindlist %$%
  sum(size) %>%
  divide_by(1024^2)

cat("Size before resave:\t", round(size_of_data, 1), "MB")
# Size not large enough to justify resaving.
# 
# tools::resaveRdaFiles(paths = region_dtas)
# size_of_data <- 
#   lapply(list.files(path = "data/",
#                     pattern = Region,
#                     full.names = TRUE),
#          file.info) %>%
#   rbindlist %$%
#   sum(size) %>%
#   divide_by(1024^2)
# 
# cat("Size after resave:\t", round(size_of_data, 1), "MB")