#' Simple projections of the annual 2\% samples of Australian Taxation Office tax returns.
#'
#' @param sample_file A \code{data.table} matching a 2\% sample file from the ATO.
#' See package \code{taxstats} for an example.
#' @param h An integer. How many years should the sample file be projected?
#' @param fy.year.of.sample.file The financial year of \code{sample_file}. If \code{NULL}, the default, the number is inferred from the
#' number of rows of \code{sample_file} to be one of \code{2012-13}, \code{2013-14}, \code{2014-15}, \code{2015-16}, or \code{2016-17}.
#' @param WEIGHT The sample weight for the sample file. (So a 2\% file has \code{WEIGHT} = 50.)
#' @param excl_vars A character vector of column names in \code{sample_file} that should not be inflated. Columns not present in the 2013-14 sample file are not inflated and nor are the columns \code{Ind}, \code{Gender}, \code{age_range}, \code{Occ_code}, \code{Partner_status}, \code{Region}, \code{Lodgment_method}, and \code{PHI_Ind}.
#' @param forecast.dots A list containing parameters to be passed to \code{generic_inflator}.
#' @param wage.series See \code{\link{wage_inflator}}. Note that the \code{Sw_amt}
#' will uprated by \code{\link{differentially_uprate_wage}} (if requested).
#' @param lf.series See \code{\link{lf_inflator_fy}}.
#' @param use_age_pop_forecast Should the inflation of the number of taxpayers be
#' moderated by the number of resident persons born in a certain year? If \code{TRUE},
#' younger ages will grow at a slightly higher rate beyond 2018 than older ages.
#' @param .recalculate.inflators (logical, default: \code{NA}).
#' Should \code{generic_inflator()} or \code{CG_inflator} be called to project the other variables? Adds time.
#' Default \code{NA} means \code{TRUE} if the pre-calculated inflators are available,
#' \code{FALSE} otherwise.
#'
#' @param .copyDT (logical, default: \code{TRUE}) Should a \code{copy()} of \code{sample_file} be made?
#' If set to \code{FALSE}, will update \code{sample_file} in place, which may
#' be necessary when memory is constrained, but is dangerous as it modifies the
#' original data and its projection. (So if you run the same code twice you
#' may end up with a projection \code{2h} years ahead, not \code{h} years.)
#' @param check_fy_sample_file (logical, default: \code{TRUE})
#' Should \code{fy.year.of.sample.file} be checked against \code{sample_file}?
#' By default, \code{TRUE}, an error is raised if the base is not 2012-13, 2013-14, 2014-15, 2015-16, 2016-17,
#' or 2017-18,
#' and a warning is raised if the
#' number of rows in \code{sample_file} is different to the known number of rows in the sample files.
#' @param differentially_uprate_Sw (logical, default: \code{NA})
#' Should the salary and wage column (\code{Sw_amt}) be differentially uprated
#' using (\code{\link{differentially_uprate_wage}})? Default of \code{NA} means
#' use differential uprating is used when \code{fy.year.of.sample.file <= "2016-17"}.
#' It is known that the Treasury stopped using differential uprating by 2019.
#'
#' Selecting \code{TRUE} for \code{fy.year.of.sample.file > "2016-17"} is an
#' error as the precalculated values are not available.
#'
#' @param r_super_balance The factor to inflate super balances by (annualized).
#' Set to \code{1.05} for backwards compatibility. The annual superannuation
#' bulletin of June 2019 from APRA reported 7.3\% growth of funds with more than
#' fund members over the previous 5 years and 7.9\% growth over the
#' previous ten years.
#'
#' @param r_generic (Present from version 2024.1.0) The factor to inflate other
#' columns. Subject to change in future versions. If \code{NULL}, the default,
#' an internal factor is used.
#'
#'
#' @return A sample file with the same number of rows as \code{sample_file} but
#' with inflated values as a forecast for the sample file in \code{to_fy}.
#' If \code{WEIGHT} is not already a column of \code{sample_file}, it will be added and its sum
#' will be the predicted number of taxpayers in \code{to_fy}.
#'
#' @details
#'
#' Currently components of taxable income are individually inflated
#' based on their historical trends in the ATO sample files, with the
#' exception of:
#' \describe{
#' \item{inflated using \code{\link{differentially_uprate_wage}}.}{\code{Sw_amt}}
#' \item{inflated using \code{\link{wage_inflator}}}{\code{Alow_ben_amt}, \code{ETP_txbl_amt}, \code{Rptbl_Empr_spr_cont_amt}, \code{Non_emp_spr_amt}, \code{MCS_Emplr_Contr}, \code{MCS_Prsnl_Contr}, \code{MCS_Othr_Contr}}
#' \item{inflated using \code{\link{cpi_inflator}}}{\code{WRE_car_amt}, \code{WRE_trvl_amt}, \code{WRE_uniform_amt}, \code{WRE_self_amt}, \code{WRE_other_amt}}
#' \item{inflated by \code{\link{lf_inflator_fy}}}{\code{WEIGHT}}
#' \item{inflated by \code{\link{CG_inflator}}}{\code{Net_CG_amt}, \code{Tot_CY_CG_amt}}
#' }
#'
#' Superannuation balances are inflated by a fixed rate of 5\% p.a.
#'
#' We recommend you use \code{sample_file_1213} over \code{sample_file_1314},
#' unless you need the superannuation variables,
#' as the latter suggests lower-than-recorded tax collections.
#' However, more recent data is of course preferable.
#' @export
project <- function(sample_file,
h = 0L,
fy.year.of.sample.file = NULL,
WEIGHT = 50L,
excl_vars = NULL,
forecast.dots = list(estimator = "mean", pred_interval = 80),
wage.series = NULL,
lf.series = NULL,
use_age_pop_forecast = FALSE,
.recalculate.inflators = NA,
.copyDT = TRUE,
check_fy_sample_file = TRUE,
differentially_uprate_Sw = NA,
r_super_balance = 1.05,
r_generic = NULL) {
if (length(h) != 1L) {
stop("`h` had length-", length(h), ", ",
"but must be a length-1 positive integer.")
}
if (!is.integer(h)) {
htxt <- deparse(substitute(h))
if (is.double(h)) {
if (grepl("^[0-9]+$", htxt)) {
hadvised <- paste0(htxt, "L")
stop("`h = ", htxt, "` was type double, but must be type integer. ",
"(Did you mean `h = ", hadvised, "`?)\n\n",
"Change h to a length-one integer. ",
"For example, to project two years ahead, use `h = 2L`, not `h = 2`.")
} else {
stop("`h = ", htxt, "` was type double, but must be type integer. ",
"Change h to a length-one integer. ",
"For example, to project two years ahead, use `h = 2L`, not `h = 2`.")
}
} else {
stop("`h = ", htxt, "` was type ", typeof(h), ", but must be type integer. ",
"Change h to a length-one nonnegative integer.")
}
}
if (anyNA(h)) {
stop("`h = NA`. This is not permitted. ",
"Change h to a length-one nonnegative integer.")
}
if (h < 0L) {
stop("`h = ", h, "`, but must be a nonnegative integer. ",
"Change h to a nonnegative integer.")
}
if (!is.data.table(sample_file)) {
if (is.data.frame(sample_file)) {
sample_file <- as.data.table(sample_file)
} else {
stop("`sample_file` was of class ", class(sample_file)[1L], ", but must be a data.table. ",
"Ensure `sample_file`` is a data.frame with the same structure as the ATO's sample files.")
}
}
if (.copyDT) {
sample_file <- copy(sample_file)
}
if (check_fy_sample_file) {
# It's been a common error of mine to switch sample files
# without updating the fy.year.of.sample.file
if (is.null(fy.year.of.sample.file)) {
fy.year.of.sample.file <-
match(nrow(sample_file), c(254318L, 258774L, 263339L, 269639L, 277202L, 284925L, 301687L))
if (is.na(fy.year.of.sample.file)) {
stop("`fy.year.of.sample.file` was not provided, and its value could not be ",
"inferred from nrow(sample_file) = ", nrow(sample_file), ". Either use ",
"a 2% sample file of the years 2012-13, 2013-14, or 2014-15 or ",
"supply `fy.year.of.sample.file` manually.")
}
fy.year.of.sample.file <-
c("2012-13", "2013-14", "2014-15", "2015-16", "2016-17", "2017-18", "2020-21")[fy.year.of.sample.file]
}
switch(fy.year.of.sample.file,
"2012-13" = {
if (nrow(sample_file) != 254318) {
warning("nrow(sample_file) != 254318. Should you choose a different fy.year.of.sample.file?")
}
},
"2013-14" = {
if (nrow(sample_file) != 258774) {
warning("nrow(sample_file) != 254318. Should you choose a different fy.year.of.sample.file?")
}
},
"2014-15" = {
if (nrow(sample_file) != 263339) {
warning("nrow(sample_file) != 263339. Should you choose a different fy.year.of.sample.file?")
}
},
"2015-16" = {
if (nrow(sample_file) != 269639) {
warning("nrow(sample_file) != 269639. Should you choose a different fy.year.of.sample.file?")
}
},
"2016-17" = {
if (nrow(sample_file) != 277202) {
warning("nrow(sample_file) != 277202. Should you choose a different fy.year.of.sample.file?")
}
},
"2017-18" = {
if (nrow(sample_file) != 284925) {
warning("nrow(sample_file) != 284925. Should you choose a different fy.year.of.sample.file?")
}
},
"2020-21" = {
if (nrow(sample_file) != 301687) {
warning("nrow(sample_file) != 301687 Should you choose a different fy.year.of.sample.file?")
}
},
stop("`fy.year.of.sample.file` must be '2012-13', '2013-14', '2014-15', '2015-16', '2016-17',
or '2017-18'."))
}
if (h == 0) {
return(sample_file)
}
if ("WEIGHT" %notin% names(sample_file)) {
sample_file[, "WEIGHT" := list(WEIGHT)]
}
if (fy.year.of.sample.file == "2015-16") {
if ("Med_Exp_TO_amt" %chin% names(sample_file)) {
} else {
sample_file[, Med_Exp_TO_amt := 0]
}
}
# NSE e.g inflators[h == h]
H <- h
current.fy <- fy.year.of.sample.file
to.fy <- yr2fy(to.yr <- (fy2yr(current.fy) + h))
if (is.null(wage.series)){
wage.inflator <- wage_inflator(from = current.fy, to = to.fy,
series = grattanInflators::wpi_original(FORECAST = TRUE))
} else {
wage.inflator <- wage_inflator(from = current.fy, to = to.fy,
series = wage.series)
}
dont_inflate_WEIGHT <- FALSE
if (is.null(lf.series)) {
n_taxpayers_2022_2034 <-
c(16216182, 16543257, 16826686, 17076658, 17326098, 17573757,
18006023, 18438289, 18870556, 19144750, 19409635, 19669543, 19917470)
if (to.yr %in% 2022:2034) {
dont_inflate_WEIGHT <- TRUE
WEIGHT <- NULL
sample_file[, WEIGHT := as.double(WEIGHT)]
set(sample_file, j = "WEIGHT", value = as.double(n_taxpayers_2022_2034[to.yr - 2021] / nrow(sample_file)))
} else {
lf.inflator <- lf_inflator(from = current.fy, to = to.fy,
series = grattanInflators::lfi_original(FORECAST = TRUE))
}
} else {
if (is.data.table(lf.series)) {
stop("lf.series should be a series as defined by lf_inflator.")
}
lf.inflator <- lf_inflator(from = current.fy, to = to.fy,
series = lf.series)
}
cpi.inflator <- cpi_inflator(from = current.fy, to = to.fy,
series = grattanInflators::cpi_seasonal(FORECAST = TRUE))
if (is.null(r_generic)) {
r_generic <- cpi.inflator
} else if (!is.numeric(r_generic) || length(r_generic) != 1 || !is.finite(r_generic)) {
stop("r_generic must be NULL or a length-one numeric.") # nocov
}
if (!is.logical(.recalculate.inflators)) {
stop("`.recalculate.inflators` was type ", typeof(.recalculate.inflators), ", ",
"but must be TRUE, FALSE, or NA.")
}
if (length(.recalculate.inflators) != 1L) {
stop("`length(.recalculate.inflators) = ", length(.recalculate.inflators), ",
but must be length-one.")
}
if (is.na(.recalculate.inflators)) {
.recalculate.inflators <- current.fy >= "2017-18"
}
if (.recalculate.inflators) {
CG.inflator <- CG_inflator(1, from_fy = current.fy, to_fy = to.fy)
} else {
if (current.fy %notin% c("2012-13", "2013-14",
"2014-15", "2015-16", "2016-17")) {
stop("Precalculated inflators only available when projecting from ",
"2012-13, 2013-14, 2014-15, 2015-16, or 2016-17.")
} else {
cg_inflators <-
switch(current.fy,
"2012-13" = cg_inflators_1213,
"2013-14" = cg_inflators_1314,
"2014-15" = cg_inflators_1415,
"2015-16" = cg_inflators_1516,
"2016-17" = cg_inflators_1617)
stopifnot("forecast.series" %in% names(cg_inflators))
forecast.series <- NULL
CG.inflator <-
cg_inflators[.(to.fy, forecast.dots$estimator)] %>%
.subset2("cg_inflator")
}
}
col.names <- names(sample_file)
# Differential uprating not available for years outside:
if (is.na(differentially_uprate_Sw)) {
differentially_uprate_Sw <- fy.year.of.sample.file <= "2016-17"
}
diff.uprate.wagey.cols <-
if (differentially_uprate_Sw) {
"Sw_amt"
} else {
character(0L)
}
wagey.cols <- c(if (!differentially_uprate_Sw) "Sw_amt",
"Alow_ben_amt",
"ETP_txbl_amt",
"Rptbl_Empr_spr_cont_amt",
"Non_emp_spr_amt",
"MCS_Emplr_Contr",
"MCS_Prsnl_Contr",
"MCS_Othr_Contr")
super.bal.col <- c("MCS_Ttl_Acnt_Bal")
lfy.cols <- c("WEIGHT")
cpiy.cols <- c(grep("WRE", col.names, value = TRUE), # work-related expenses
"Cost_tax_affairs_amt",
"Other_Ded_amt")
derived.cols <- c("Net_rent_amt",
"Net_PP_BI_amt",
"Net_NPP_BI_amt",
"Tot_inc_amt",
"Tot_ded_amt",
"Taxable_Income")
CGTy.cols <- c("Net_CG_amt", "Tot_CY_CG_amt")
# names(taxstats::sample_file_1314)
alien.cols <-
col.names[!col.names %chin% c("Ind", "Gender", "age_range", "Occ_code", "Partner_status",
"Region", "Lodgment_method", "PHI_Ind", "Sw_amt", "Alow_ben_amt",
"ETP_txbl_amt", "Grs_int_amt", "Aust_govt_pnsn_allw_amt", "Unfranked_Div_amt",
"Frk_Div_amt", "Dividends_franking_cr_amt", "Net_rent_amt", "Gross_rent_amt",
"Other_rent_ded_amt", "Rent_int_ded_amt", "Rent_cap_wks_amt",
"Net_farm_management_amt", "Net_PP_BI_amt", "Net_NPP_BI_amt",
"Total_PP_BI_amt", "Total_NPP_BI_amt", "Total_PP_BE_amt", "Total_NPP_BE_amt",
"Net_CG_amt", "Tot_CY_CG_amt", "Net_PT_PP_dsn", "Net_PT_NPP_dsn",
"Taxed_othr_pnsn_amt", "Untaxed_othr_pnsn_amt", "Other_foreign_inc_amt",
"Other_inc_amt", "Tot_inc_amt", "WRE_car_amt", "WRE_trvl_amt",
"WRE_uniform_amt", "WRE_self_amt", "WRE_other_amt", "Div_Ded_amt",
"Intrst_Ded_amt", "Gift_amt", "Non_emp_spr_amt", "Cost_tax_affairs_amt",
"Other_Ded_amt", "Tot_ded_amt", "PP_loss_claimed", "NPP_loss_claimed",
"Rep_frng_ben_amt", "Med_Exp_TO_amt", "Asbl_forgn_source_incm_amt",
"Spouse_adjusted_taxable_inc", "Net_fincl_invstmt_lss_amt", "Rptbl_Empr_spr_cont_amt",
"Cr_PAYG_ITI_amt", "TFN_amts_wheld_gr_intst_amt", "TFN_amts_wheld_divs_amt",
"Hrs_to_prepare_BPI_cnt", "Taxable_Income", "Help_debt", "MCS_Emplr_Contr",
"MCS_Prsnl_Contr", "MCS_Othr_Contr", "MCS_Ttl_Acnt_Bal",
"WEIGHT")]
Not.Inflated <- c("Ind",
"Gender",
"age_range",
"Occ_code",
"Partner_status",
"Region",
"Lodgment_method",
"PHI_Ind",
derived.cols,
alien.cols)
Not.Inflated <- c(Not.Inflated, excl_vars)
generic.cols <-
col.names[col.names %notchin% c(diff.uprate.wagey.cols,
wagey.cols,
super.bal.col,
lfy.cols,
cpiy.cols,
derived.cols,
Not.Inflated)]
# if (.recalculate.inflators) {
# generic.inflators <-
# generic_inflator(vars = generic.cols,
# h = h,
# fy.year.of.sample.file = fy.year.of.sample.file,
# estimator = forecast.dots$estimator,
# pred_interval = forecast.dots$pred_interval)
# } else {
# generic.inflators <-
# switch(current.fy,
# "2012-13" = generic_inflators_1213,
# "2013-14" = generic_inflators_1314,
# "2014-15" = generic_inflators_1415,
# "2015-16" = generic_inflators_1516,
# "2016-17" = generic_inflators_1617,
# stop("Precalculated inflators only available when projecting from ",
# "2012-13, 2013-14, 2014-15, 2015-16, and 2016-17."))
# }
## Inflate:
# Function to determine which inflator to use, given the column name
inflator_switch <- function(nom) {
if (nom %chin% diff.uprate.wagey.cols) {
return("differential")
}
if (nom %chin% wagey.cols) {
return("wage")
}
if (nom %chin% cpiy.cols) {
return("cpi")
}
if (nom %chin% lfy.cols) {
return("labour-force")
}
if (nom %chin% CGTy.cols) {
return("cgt")
}
if (nom %chin% generic.cols) {
return("generic")
}
if (nom %chin% super.bal.col) {
return("super")
}
stop("Internal error: inflator-switch unmatched. ", # nocov
"`nom = ", nom, "`") # nocov
}
if (use_age_pop_forecast) {
Population <- NULL
pop_inflator_by_age <-
g_pop_forecasts_by_age_range[CJ(age_range = 0:11,
fy = c(current.fy, to.fy)),
on = c("age_range", "fy")] %>%
.[, .(pop_forecast_r = last(Population) / first(Population)),
keyby = "age_range"]
sample_file <- sample_file[pop_inflator_by_age, on = "age_range"]
# Will expand to nrow
lf.inflator <- {lf.inflator + .subset2(sample_file, "pop_forecast_r")} / 2
}
if (!is.numeric(r_super_balance)) {
stop("`r_super_balance` was type ", typeof(r_super_balance), " but must be
numeric.")
}
if (length(r_super_balance) != 1L && length(r_super_balance) != nrow(sample_file)) {
stop("`length(r_super_balance) = ", length(r_super_balance), "`, but must be
length-one or have the same length as the number of rows of `sample_file = ",
nrow(sample_file), "`.")
}
for (j in col.names) {
if (j %chin% Not.Inflated) {
next
}
if (j == "WEIGHT" && dont_inflate_WEIGHT) {
next
}
v <- .subset2(sample_file, j)
v_new <-
switch(inflator_switch(j),
"differential" = {
if (is.null(wage.series)) {
differentially_uprate_wage(v,
from = current.fy,
to = to.fy)
} else {
differentially_uprate_wage(v,
from = current.fy,
to = to.fy,
forecast.series = "custom",
wage.series = wage.series)
}
},
"wage" = {
wage.inflator * v
},
"cpi" = {
cpi.inflator * v
},
"labour-force" = {
lf.inflator * v
},
"cgt" = {
CG.inflator * v
},
"generic" = {
r_generic * v
# if (.recalculate.inflators) {
# if (nrow(generic.inflators)) {
# generic.inflators[variable == j]$inflator * v
# } else {
# v
# }
# } else {
# generic.inflators[.(H, j), inflator] * v
# }
},
"super" = {
{r_super_balance ^ h} * v
},
# nocov start
stop("Internal error: switch(inflator_switch()) not matched\n\t",
"inflator_switch(j) = ", "inflator_switch(", j, ") = ",
inflator_switch(j)))
# nocov end
set(sample_file,
j = j,
value = v_new)
}
sample_file %>%
.[, Net_rent_amt := Gross_rent_amt - Other_rent_ded_amt - Rent_int_ded_amt - Rent_cap_wks_amt] %>%
.[, Net_PP_BI_amt := Total_PP_BI_amt - Total_PP_BE_amt] %>%
.[, Net_NPP_BI_amt := Total_NPP_BI_amt - Total_NPP_BE_amt] %>%
.[, Tot_inc_amt := .add(Sw_amt,
Alow_ben_amt,
ETP_txbl_amt,
Grs_int_amt,
Aust_govt_pnsn_allw_amt,
Unfranked_Div_amt,
Frk_Div_amt,
Dividends_franking_cr_amt,
Net_rent_amt,
Net_farm_management_amt,
Net_PP_BI_amt, ## Need to check exactly how this maps.
Net_NPP_BI_amt,
Net_CG_amt, ## We cannot express this cleanly in terms of Tot_CG
Net_PT_PP_dsn,
Net_PT_NPP_dsn,
Taxed_othr_pnsn_amt,
Untaxed_othr_pnsn_amt,
Other_foreign_inc_amt,
Other_inc_amt)] %>%
.[, Tot_ded_amt := .add(WRE_car_amt,
WRE_trvl_amt,
WRE_uniform_amt,
WRE_self_amt,
WRE_other_amt,
Div_Ded_amt,
Intrst_Ded_amt,
Gift_amt,
Non_emp_spr_amt,
Cost_tax_affairs_amt,
Other_Ded_amt)] %>%
.[, Taxable_Income := pmaxC(Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed, 0)] %>%
.[]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.