library(data.table)
library(magrittr)
library(tidyxl)
library(unpivotr)
library(hutils)
library(grattan)
clean_CAPITA_sheet <- function(input, debug = NULL) {
sheetDT <- as.data.table(input)
# tidy_xlsx() does not read in the columns but
# retains the original of the value for 'col'. So
# if column 10 was empty in the excel file, the
# col values will be 8 9 11 12
empty_columns <- setdiff(seq_len(max(sheetDT$col)),
unique(sheetDT$col))
# Goddammit Mark
if (identical(sheetDT[address == "E2"][["character"]], "Bott, Mark:")) {
empty_columns <- sheetDT[address == "E2"][["col"]][1]
}
# exclude empty columns (e.g. CPI_A has a comment to the right of the table
# which messes up the logic of numericness)
if (length(empty_columns) > 0) {
sheetDT <- sheetDT[col < min(empty_columns)]
}
type_by_row <-
sheetDT %>%
.[data_type != "blank"] %>%
.[, .N, by = c("data_type", "row")]
numeric_row <-
type_by_row %>%
.[, .(numeric = all(data_type %in% c("numeric", "date"))), keyby = row]
headers <-
as.data.table(sheetDT) %>%
.[row < min(numeric_row[(numeric)][["row"]]), .(row, col, character)] %>%
.[order(col)]
tryCatch({
raw_headers <-
headers[row == max(row), list(col, raw_name = character)]
},
warning = function(e) {
cat(debug)
e
})
numeric_cols <-
sheetDT[row >= which.max(numeric_row[["numeric"]]),
list(row, col, numeric)] %>%
melt.data.table(id.vars = c("col", "row")) %>%
.[complete.cases(.)] %>%
.[raw_headers, on = "col"]
# Are columns 1:2 just dates (i.e. the keys of the table)?
which_dates_cols <- unique(sheetDT[!is.na(date)][["col"]])
assign("the_sheet_DT", sheetDT, envir = .GlobalEnv)
if (identical(which_dates_cols, c(1L, 2L))) {
start_date_by_row <-
sheetDT %>%
.[col == 1] %>%
.[row >= which.max(numeric_row[["numeric"]]),
list(row, start_date = date)] %>%
.[complete.cases(.)]
end_date_by_row <-
sheetDT %>%
.[col == 2] %>%
.[row >= which.max(numeric_row[["numeric"]]),
list(row, end_date = date)] %>%
.[complete.cases(.)]
out <-
numeric_cols %>%
.[col > 2] %>%
.[start_date_by_row, on = "row"] %>%
.[end_date_by_row, on = "row"]
} else {
# i.e. one date to key the data
if (identical(which_dates_cols, 1L)) {
date_by_row <-
sheetDT %>%
.[col == 1] %>%
.[row >= which.max(numeric_row[["numeric"]]),
list(row, start_date = date)] %>%
.[, end_date := shift(start_date, type = "lead")] %>%
.[complete.cases(.)]
out <-
numeric_cols %>%
.[col > 1] %>%
.[date_by_row, on = "row"]
} else {
if (!is.null(debug)) {
cat(debug, "")
out <- data.table(col = NA_integer_)
}
}
}
out
}
if (!file.exists("data-raw/CAPITA/capita.tsv") ||
!file.exists("data-raw/CAPITA/capita-headers.tsv")) {
tidy_xlsx <- function (path, sheets = NA) {
# .Deprecated(msg = paste("'tidy_xlsx()' is deprecated.", "Use 'xlsx_cells()' or 'xlsx_formats()' instead.",
# sep = "\n"))
path <- tidyxl:::check_file(path)
all_sheets <- tidyxl:::utils_xlsx_sheet_files(path)
sheets <- tidyxl:::check_sheets(sheets, path)
formats <- tidyxl:::xlsx_formats_(path)
cells <- tidyxl:::xlsx_cells_(path, sheets$sheet_path, sheets$name,
sheets$comments_path)
cells$sheet <- factor(cells$sheet, levels = sheets$name)
cells_list <- split(cells, cells$sheet)
cells_list <- lapply(cells_list, function(x) x[, -1])
list(data = cells_list, formats = formats)
}
CAPITA <-
switch(basename(getwd()),
"grattan" = tidy_xlsx("./data-raw/CAPITA/CPS-v17-09-12.xlsx"),
"data-raw" = tidy_xlsx("./CAPITA/CPS-v17-09-12.xlsx"),
"CAPITA" = tidy_xlsx("CPS-v17-09-12.xlsx"))
CAPITA_data <- CAPITA[["data"]]
capita_tables <- list(length(CAPITA_data))
K <- seq_along(CAPITA_data)[names(CAPITA_data) != "Contents"]
capita <-
lapply(K, function(k) {
sheet_nom <- names(CAPITA_data)[k]
if (sheet_nom != "Contents") {
capita_tables[[k]] <-
clean_CAPITA_sheet(CAPITA_data[[k]], debug = k) %>%
.[, "sheet_name" := sheet_nom] %>%
.[, "k" := k]
}
}) %>%
rbindlist
get_headers <- function(input) {
sheetDT <- as.data.table(input)
sheetDT[row <= 7, .(row, col, character)] %>%
.[, is_raw_header := if_else(row == 7L, "raw", "above")] %>%
setorder(col, -row) %>%
.[, character_filled_right := zoo::na.locf(character, na.rm = FALSE), by = row] %>%
.[, character_filled_left := zoo::na.locf(character, na.rm = FALSE, fromLast = TRUE), by = row] %>%
.[, character_filled := coalesce(character_filled_right, character_filled_left)] %>%
.[, row_as_char := paste0("R", row)] %>%
dcast.data.table(col ~ row_as_char, value.var = "character_filled") %>%
setcolorder(rev(names(.))) %>%
setnames("R7", "raw_name")
}
capita_headers <-
lapply(K, function(k) {
sheet_nom <- names(CAPITA_data)[k]
if (sheet_nom != "Contents") {
capita_tables[[k]] <-
get_headers(CAPITA_data[[k]]) %>%
.[, "sheet_name" := sheet_nom] %>%
.[, "k" := k]
}
}) %>%
rbindlist(use.names = TRUE, fill = TRUE)
fwrite(capita_headers, "data-raw/CAPITA/capita-headers.tsv", sep = "\t")
# stopifnot(basename(getwd()) == "grattan")
# fwrite(capita, "data-raw/CAPITA/capita.tsv", sep = "\t")
} else {
capita <- fread("data-raw/CAPITA/capita.tsv", sep = "\t", na.strings = c("NA", ""))
capita_headers <- fread("data-raw/CAPITA/capita-headers.tsv", na.strings = c("NA", ""))
}
cols2COL <- function(x) {
w0 <- which(x <= 26)
w1 <- which(x >= 27)
x0 <- x[w0]
x1 <- x[w1]
out <- character(length(x))
out[w0] <- LETTERS[x0]
out[w1] <- paste0(LETTERS[x1 %/% 26], LETTERS[x1 %% 26])
out
}
capita[, COL := cols2COL(col)]
assert_drop60plus_ok <- function(DT) {
#' @return Asserts that the allowances for
#' Singles with dependants and singles (60 + and long-term recipient)
#' are identical
#'
#'
stopifnot("col" %in% names(DT))
Keeps <- DT[col %between% c(6, 7)]
Drops <- DT[col %between% c(9, 10)]
if (!identical(Keeps[["value"]],
Drops[["value"]])) {
stop("`With Dependents` not identical to `60+ and long term recipient`")
}
DT[!between(col, 9, 11)]
}
unemployment_annual_rates <-
capita[sheet_name == "Unemployment_A"] %>%
.[!grepl("^not_used", raw_name)] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %T>%
{stopifnot(all(.$raw_name == .$i.raw_name))} %>%
.[R1 == "Period" | R2 %pin% c("Period", "NewStart Allowance,")] %>%
assert_drop60plus_ok %>%
.[, .(fy_year = yr2fy(year(end_date)),
hasPartner = R3 %ein% "Married",
hasDependant = R4 %enotin% "No Dependents",
Component = R6,
value = round(value, 2))] %>%
unique %>%
dcast.data.table(... ~ Component, value.var = "value")
# Give up
unemployment_table_means_tests_annual <-
capita[sheet_name == "Unemployment_A"] %>%
.[!grepl("^not_used", raw_name)] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %T>%
{stopifnot(all(.$raw_name == .$i.raw_name))} %>%
.[R1 == "Period" | R1 == "Means Tests"] %>%
.[, fy_year := yr2fy(year(end_date))] %>%
drop_constant_cols %>%
.[]
unemployment_assets_tests <-
capita[sheet_name == "Unemployment_A"] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %>%
.[R2 == "Asset Cutout"] %>%
drop_constant_cols %>%
.[,
list(fy_year = date2fy(end_date),
hasPartner = R3 == "Married",
HomeOwner = R4 == "Home owner",
asset_cutout = as.integer(floor(value)))] %>%
setkey(fy_year, hasPartner, HomeOwner) %>%
.[]
unemployment_income_thresholds <-
capita[sheet_name == "Unemployment_A"] %>%
.[!grepl("^not_used", raw_name)] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %>%
.[R2 == "Income tests"] %>%
.[, i.raw_name := NULL] %>%
.[raw_name %chin% c("UnempThr1F", "UnempThr2F")] %>%
drop_constant_cols %>%
.[, .(fy_year = date2fy(end_date),
TaperNumber = as.integer(gsub("[^0-9]", "", raw_name)),
IncomeThreshold = as.integer(floor(value)))]
unemployment_income_tapers <-
capita[sheet_name == "Unemployment_A"] %>%
.[!grepl("^not_used", raw_name)] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %>%
.[R2 == "Income tests"] %>%
.[, i.raw_name := NULL] %>%
.[raw_name %chin% c("UnempTpr1", "UnempTpr2","UnempSingDepsTpr1", "UnempSingDepsTpr2")] %>%
drop_constant_cols %>%
.[, .(fy_year = date2fy(end_date),
R5,
TaperNumber = as.integer(R6 == "Upper") + 1L,
raw_name,
value)] %>%
.[, hasPartner := grepl("Couples", R5)] %>%
.[, hasDependant := R5 == "Singles with Dependents"] %>%
{
dot <- .
Couples_with_dependants <- dot[(hasPartner)]
Couples_with_dependants[, R5 := "Couples with Dependants"]
rbind(dot, Couples_with_dependants[, hasDependant := TRUE])
} %>%
{
dot <- .
Singles_without_dependants <- dot[R5 %ein% "Couples and Singles Without Deps"]
Singles_without_dependants[, R5 := "Singles without Dependants"]
Singles_without_dependants[, hasPartner := FALSE]
rbind(dot, Singles_without_dependants[, hasDependant := FALSE])
} %>%
.[order(fy_year, value, hasPartner, hasDependant, TaperNumber),
.(fy_year, hasPartner, hasDependant, TaperNumber, taper = value)]
unemployment_income_tests <-
unemployment_income_thresholds[unemployment_income_tapers,
on = c("fy_year", "TaperNumber")] %>%
dcast.data.table(fy_year + hasPartner + hasDependant ~ TaperNumber,
value.var = c("IncomeThreshold", "taper"),
sep = "_")
unemployment_rates_by_date <-
capita[sheet_name == "Unemployment"] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %>%
.[, (grep("raw_name", names(.), value = TRUE)) := NULL] %>%
drop_constant_cols %>%
.[R1 == "Period" | R2 %pin% c("Period", "NewStart Allowance,")] %>%
assert_drop60plus_ok %>%
.[col %notin% c(5, 8, 11, 14)] %>%
.[, .(Date = as.Date(end_date),
hasPartner = R3 %ein% "Married",
hasDependant = R4 %enotin% "No Dependents",
Component = R6,
value = round(value, 2))] %>%
unique %>%
dcast.data.table(Date + hasPartner + hasDependant ~ Component,
value.var = "value") %>%
.[, ES := coalesce(ES, 0.0)] %>%
setkey(hasPartner, hasDependant, Date)
unemployment_assets_tests_by_date <-
capita[sheet_name == "Unemployment"] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %>%
.[R1 == "Means Tests"] %>%
.[R2 == "Asset Cutout"] %>%
.[,
.(Date = as.Date(end_date),
hasPartner = R3 %ein% "Married",
HomeOwner = R4 %ein% "Home owner",
asset_cutout = as.integer(floor(value)))
] %>%
setkey(hasPartner, HomeOwner, Date)
unemployment_income_thresholds_by_date <-
capita[sheet_name == "Unemployment"] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %>%
.[R1 == "Means Tests"] %>%
.[R3 == "Income Thresholds"] %>%
.[,
.(Date = as.Date(end_date),
TaperNumber = as.integer(R5 %ein% "Upper") + 1L,
IncomeThreshold = round(value, 2))
]
unemployment_income_tapers_by_date <-
capita[sheet_name == "Unemployment"] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %>%
.[R1 == "Means Tests"] %>%
.[R3 == "Taper Rates"] %>%
.[, .(Date = as.Date(end_date),
R5,
TaperNumber = as.integer(R6 == "Upper") + 1L,
taper = round(value, 2))] %>%
.[, hasPartner := grepl("Couples", R5)] %>%
.[, hasDependant := R5 == "Singles with Dependents"] %>%
{
dot <- .
Couples_with_dependants <- dot[(hasPartner)]
Couples_with_dependants[, R5 := "Couples with Dependants"]
rbind(dot, Couples_with_dependants[, hasDependant := TRUE])
} %>%
{
dot <- .
Singles_without_dependants <- dot[R5 %ein% "Couples and Singles Without Deps"]
Singles_without_dependants[, R5 := "Singles without Dependants"]
Singles_without_dependants[, hasPartner := FALSE]
rbind(dot, Singles_without_dependants[, hasDependant := FALSE])
} %>%
.[, R5 := NULL]
unemployment_income_tests_by_date <-
unemployment_income_thresholds_by_date[unemployment_income_tapers_by_date,
on = c("Date", "TaperNumber")] %>%
dcast.data.table(Date + hasPartner + hasDependant ~ TaperNumber,
value.var = c("IncomeThreshold", "taper"),
sep = "_") %>%
setkeyv(c("hasPartner",
"hasDependant",
"Date"))
rent_assistance_table <-
capita[sheet_name == "RentA_A"] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %>%
drop_empty_cols %>%
drop_constant_cols %>%
.[R6 == "No deps", nDependants := 0L] %>%
.[R6 == "1-2 deps", nDependants := 1L] %>%
.[R6 == "> 2 deps", nDependants := 3L] %>%
.[, hasPartner := R5 %ein% "Married"] %>%
.[,
.(fy_year = date2fy(end_date),
R2,
hasPartner,
nDependants,
value)]
# assert constant
rent_assistance_table[R2 == "Proportion of rent paid by RA"] %>%
.[, .(fy_year, value)] %$%
stopifnot(all(value == 0.75))
rent_assistance_rates <-
rent_assistance_table[R2 != "Proportion of rent paid by RA"] %>%
.[, value := round(value, 2)] %>%
dcast.data.table(... ~ R2, value.var = "value") %>%
setnames(c("Maximum rent assistance payable",
"Minimum rent paid for rent allowance to be payable"),
c("max_rate",
"min_rent")) %>%
.[]
rent_assistance_table_by_date <-
capita[sheet_name == "RentA"] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %>%
drop_empty_cols %>%
drop_constant_cols %>%
.[R6 == "No deps", nDependants := 0L] %>%
.[R6 == "1-2 deps", nDependants := 1L] %>%
.[R6 == "> 2 deps", nDependants := 3L] %>%
.[, hasPartner := R5 %ein% "Married"] %>%
.[,
.(Date = as.Date(end_date),
R2,
hasPartner,
nDependants,
value)]
# assert constant
rent_assistance_table_by_date[R2 == "Proportion of rent paid by RA"] %>%
.[, .(Date, value)] %$%
stopifnot(all(value == 0.75))
rent_assistance_rates_by_date <-
rent_assistance_table_by_date[R2 != "Proportion of rent paid by RA"] %>%
.[, value := round(value, 2)] %>%
dcast.data.table(... ~ R2, value.var = "value") %>%
setnames(c("Maximum rent assistance payable",
"Minimum rent paid for rent allowance to be payable"),
c("max_rate",
"min_rent")) %>%
setkeyv(c("hasPartner",
"nDependants",
"Date")) %>%
.[]
youthStudent_annual_rates <-
capita[sheet_name == "YouthStudents_A"] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0L] %>%
.[COL <= "X"] %>%
.[and(col %% 3 != 2, COL %notin% c("U", "V", "W", "X"))] %>%
.[, .(fy_year = date2fy(end_date), col, COL, R6, value)] %>%
.[, hasDependant := COL > "N"] %>%
.[, hasPartner := COL %between% c("L", "Q")] %>%
.[, LivesAtHome := COL %between% c("C", "G")] %>%
.[, Age16or17 := COL %chin% c("C", "D")] %>%
.[R6 %ein% c("MBR", "ES")] %>%
.[, c("COL", "col") := NULL] %>%
.[] %>%
dcast.data.table(... ~ R6, value.var = "value") %>%
.[]
youthStudent_income_tests <-
capita[sheet_name == "YouthStudents_A"] %>%
.[COL %chin% c("U", "V", "W", "X"),
.(start_date, end_date, col, COL, value)] %>%
.[, variable_type := if_else(COL %chin% c("U", "V"),
"IncomeThreshold",
"taper")] %>%
.[, TaperNo := if_else(COL %ein% c("U", "W"), "1", "2")] %>%
.[, fy_year := date2fy(end_date)] %>%
.[, isStudent := TRUE] %>%
dcast.data.table(isStudent + fy_year ~ variable_type + TaperNo, value.var = "value") %>%
.[, IncomeThreshold_1 := as.integer(IncomeThreshold_1)] %>%
.[, IncomeThreshold_2 := as.integer(IncomeThreshold_2)] %>%
.[]
youth_annual_rates <-
capita[sheet_name == "YouthUnemployment_A"] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %>%
.[COL <= "X"] %>%
.[and(col %% 3 != 2, COL %notin% c("U", "V", "W", "X"))] %>%
.[, .(fy_year = date2fy(end_date), col, COL, R6, value)] %>%
.[, hasDependant := COL > "N"] %>%
.[, hasPartner := COL %between% c("L", "Q")] %>%
.[, LivesAtHome := COL %between% c("C", "G")] %>%
.[, Age16or17 := COL %chin% c("C", "D")] %>%
.[R6 %ein% c("MBR", "ES")] %>%
.[, c("COL", "col") := NULL] %>%
dcast.data.table(... ~ R6, value.var = "value")
youth_income_tests <-
capita[sheet_name == "YouthUnemployment_A"] %>%
.[COL %chin% c("U", "V", "W", "X"),
.(fy_year = date2fy(end_date), COL, value)] %>%
.[, variable_type := if_else(COL %chin% c("U", "V"),
"IncomeThreshold",
"taper")] %>%
.[, TaperNo := if_else(COL %ein% c("U", "W"), "1", "2")] %>%
.[, isStudent := FALSE] %>%
dcast.data.table(isStudent + fy_year ~ variable_type + TaperNo, value.var = "value") %>%
.[, IncomeThreshold_1 := as.integer(IncomeThreshold_1)] %>%
.[, IncomeThreshold_2 := as.integer(IncomeThreshold_2)] %>%
rbind(youthStudent_income_tests, use.names = TRUE, fill = TRUE) %>%
setkey(isStudent, fy_year) %>%
.[]
youth_unemployment_rates <-
capita[sheet_name == "YouthUnemployment_A"] %>%
.[capita_headers, on = c("sheet_name", "col"), nomatch = 0] %>%
.[col %between% match(c("C", "T"), LETTERS)] %>%
drop_constant_cols() %>%
.[, hasDependant := R3 %ein% c("Coupled With Children", "Sole parents")] %>%
.[, hasPartner := R3 %ein% c("Coupled Without Children", "Coupled With Children")] %>%
.[, LivesAtHome := R3 %ein% "Dependent At Home"] %>%
.[]
# Age pension
# age_pension_income_tests <-
capita[sheet_name %ein% "Pensions"] %>%
.[COL %chin% c("Q", "S", "U"),
.(start_date, end_date, col, COL, value)] %>%
.[, variable_type := if_else(COL %chin% c("U"),
"taper",
"IncomeThreshold")] %>%
.[, has_partner := if_else(COL %in% c("S", "T"), "Couple", "Single")] %>%
.[, .(fy_year = date2fy(end_date), end_date = as.Date(end_date), variable_type, has_partner, value)] %>%
dcast(fy_year + end_date ~ variable_type + has_partner, value.var = "value") %>%
melt(measure.vars = patterns("IncomeThreshold"),
variable.factor = FALSE) %>%
.[, hasPartner := endsWith(variable, "Couple")] %>%
.[, .(fy_year, Date = end_date, hasPartner, taper = taper_Single, value)]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.