# Aggregate categorical variable with varying interval id cols ------------
# Inputs:
# - males: 5-calendar-year interval and 5-year age groups
# - females: 1-calendar-year interval and 1-year age groups
# Output:
# - all sexes combined: 5 calendar-year interval and 5-year age groups
sex_mapping <- data.table(parent = "all", child = c("female", "male"))
id_cols <- c("year_start", "year_end", "sex", "age_start", "age_end")
value_cols <- c("value")
# set up test input data.table
input_dt_male <- CJ(
year_start = 2005, year_end = 2010,
sex = "male",
age_start = seq(0, 95, 5),
value = 25
)
input_dt_male[age_start == 95, value := 5]
input_dt_female <- CJ(
year_start = 2005:2009,
sex = "female",
age_start = seq(0, 95, 1),
value = 1
)
gen_end(
input_dt_female,
id_cols = setdiff(id_cols, c("year_end", "age_end")),
col_stem = "year",
right_most_endpoint = 2010
)
input_dt <- rbind(input_dt_male, input_dt_female)
gen_end(input_dt, id_cols = setdiff(id_cols, "age_end"), col_stem = "age")
setkeyv(input_dt, id_cols)
# set up expected output data.table
expected_dt <- CJ(
year_start = 2005, year_end = 2010,
sex = "all",
age_start = seq(0, 95, 5),
value = 50
)
expected_dt[age_start == 95, value := 10]
gen_end(expected_dt, id_cols = setdiff(id_cols, "age_end"), col_stem = "age")
setkeyv(expected_dt, id_cols)
description <- "aggregating a categorical variable with varying interval id cols
works"
test_that(description, {
# since interval id columns are not collapsed to most common intervals
expect_error(
agg(
dt = input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "sex", col_type = "categorical",
mapping = sex_mapping
),
regexp = "expected input data is missing"
)
output_dt <- agg(
dt = input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "sex", col_type = "categorical",
mapping = sex_mapping,
collapse_interval_cols = T
)
expect_identical(output_dt, expected_dt)
})
description <- "error is thrown when categorical aggregate is already included
in input or works when `present_agg_severity = 'none'`"
test_that(description, {
new_input_dt <- rbind(input_dt, expected_dt)
setkeyv(new_input_dt, key(input_dt))
expect_error(
agg(
dt = new_input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "sex", col_type = "categorical",
mapping = sex_mapping,
collapse_interval_cols = T
),
regexp = "aggregate data is already present"
)
output_dt <- agg(
dt = new_input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "sex", col_type = "categorical",
mapping = sex_mapping,
present_agg_severity = "none",
collapse_interval_cols = T
)
expect_identical(output_dt, expected_dt)
})
description <- "error is thrown when aggregating a categorical variable and
levels are missing"
test_that(description, {
new_input_dt <- input_dt[sex != "female" & year_start == 2008]
expect_error(
agg(
dt = new_input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "sex", col_type = "categorical",
mapping = sex_mapping
),
regexp = "expected input data is missing"
)
})
description <- "error is thrown when aggregating categorical variable with
missing interval id column values or makes possible aggregates with
`missing_dt_severity` = 'none'`"
test_that(description, {
new_input_dt <- input_dt[!(age_start == 24 & year_start == 2008) &
!(age_start == 46 & year_start == 2006)]
new_expected_dt <- expected_dt[!(age_start %in% c(20, 45))]
expect_error(
agg(dt = new_input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "sex", col_type = "categorical",
mapping = sex_mapping,
collapse_interval_cols = TRUE
),
regexp = "intervals in `dt` are missing making it impossible to collapse"
)
output_dt <- agg(
dt = new_input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "sex", col_type = "categorical",
mapping = sex_mapping,
missing_dt_severity = "none",
collapse_interval_cols = TRUE
)
expect_identical(output_dt, new_expected_dt)
})
# Aggregate interval variable with varying interval id cols ------------
# Inputs:
# - males: 5-calendar-year interval and 5-year age groups
# - females: 1-calendar-year interval and 1-year age groups
# Output:
# - aggregate age groups as defined in `age_mapping` for existing male and female
# calendar year and age intervals
# set up mapping for aggregation over age
age_mapping <- data.table(
age_start = c(0, 0, 15, 40, 85),
age_end = c(Inf, 5, 60, 70, Inf)
)
id_cols <- c("year_start", "year_end", "sex", "age_start", "age_end")
value_cols <- c("value")
# set up expected output data.table
# expected value is number of single year age groups in aggregate interval
expected_dt_female <- CJ(year_start = 2005:2009, sex = "female")
gen_end(
expected_dt_female,
id_cols = c("year_start", "sex"),
col_stem = "year",
right_most_endpoint = 2010
)
expected_dt_female <- expected_dt_female[
, data.table(age_mapping),
by = c("year_start", "year_end", "sex")
]
expected_dt_female[, value := rep(c(96, 5, 45, 30, 11), 5)]
expected_dt_male <- CJ(year_start = 2005, year_end = 2010, sex = "male")
expected_dt_male <- expected_dt_male[
, data.table(age_mapping),
by = c("year_start", "year_end", "sex")
]
expected_dt_male[, value := c(96, 5, 45, 30, 11) * 5]
expected_dt <- rbind(expected_dt_female, expected_dt_male, use.names = T)
setcolorder(expected_dt, c(id_cols, value_cols))
setkeyv(expected_dt, id_cols)
description <- "aggregating an interval variable with varying interval id cols
works"
testthat::test_that(description, {
output_dt <- agg(
dt = input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "age", col_type = "interval",
mapping = age_mapping,
present_agg_severity = "skip"
)
testthat::expect_identical(output_dt, expected_dt)
})
description <- "throws error when aggregating an interval variable and there
are missing intervals"
testthat::test_that(description, {
new_input_dt <- input_dt[!(sex == "female" & (age_start <= 2 | age_end >= 95))]
setkeyv(new_input_dt, id_cols)
new_expected_dt <- expected_dt[!(sex == "female" & (age_start <= 2 | age_end >= 95))]
setkeyv(new_expected_dt, id_cols)
# check severity
expect_error(
agg(dt = new_input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "age", col_type = "interval",
mapping = age_mapping,
missing_dt_severity = "stop",
present_agg_severity = "skip"
),
regexp = "expected input data is missing"
)
output_dt <- agg(
dt = new_input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "age", col_type = "interval",
mapping = age_mapping,
missing_dt_severity = "none",
present_agg_severity = "skip"
)
expect_identical(output_dt, new_expected_dt)
})
description <- "error is thrown when interval aggregate is already included
in input or works when `present_agg_severity = 'none'`"
testthat::test_that(description, {
new_input_dt <- unique(rbind(input_dt, expected_dt))
setkeyv(new_input_dt, id_cols)
setkeyv(expected_dt, id_cols)
expect_error(
agg(dt = new_input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "age", col_type = "interval",
mapping = age_mapping
),
regexp = "overlapping intervals were identified in `dt`"
)
})
# no possible aggregates to make ------------------------------------------
input_dt <- data.table(year = 2010, age_start = seq(15, 50, 5), age_end = seq(20, 55, 5), value = 1)
description <- "no error is thrown when there are no possible aggregates to make"
testthat::test_that(description, {
expect_error(
agg(
dt = input_dt,
id_cols = c("year", "age_start", "age_end"), value_col = "value",
col_stem = "age", col_type = "interval",
mapping = data.table(age_start = 10, age_end = 55)
),
regexp = "expected input data is missing"
)
output_dt <- agg(
dt = input_dt,
id_cols = c("year", "age_start", "age_end"), value_col = "value",
col_stem = "age", col_type = "interval",
mapping = data.table(age_start = 10, age_end = 55),
missing_dt_severity = "none"
)
expect_equal(nrow(output_dt), 0)
output_dt <- agg(
dt = input_dt,
id_cols = c("year", "age_start", "age_end"), value_col = "value",
col_stem = "age", col_type = "interval",
mapping = data.table(age_start = 10, age_end = 55),
missing_dt_severity = "skip"
)
expect_equal(output_dt, data.table(year = 2010, age_start = 10, age_end = 55, value = nrow(input_dt)))
})
# Aggregate categorical variable with multiple levels in mapping ----------
# Inputs:
# - present day provinces (only the most detailed nodes) in Iran as defined in `iran_mapping`
# see https://ihmeuw-demographics.github.io/hierarchyUtils/articles/hierarchyUtils.html#aggregate-locations-1
# for visualization of mapping
# Output:
# - all historical provinces and the national level
id_cols <- c("location", "year")
value_cols <- c("value")
# set up test input data.table with only the present day provinces
input_dt <- CJ(
location = iran_mapping[!grepl("[0-9]+", child), child],
year = 2011,
value = 1
)
setkeyv(input_dt, id_cols)
# set up expected output table with all unique locations
# the expected value is the number of leaf nodes under each aggregate
expected_dt <- CJ(location = unique(iran_mapping$parent),
year = 2011)
expected_dt[
location %in% c(
"Tehran 2006", "Zanjan 1976-1996", "Mazandaran 1956-1996",
"East Azarbayejan 1956-1986", "Khuzestan and Lorestan 1956",
"Isfahan and Yazd 1966"
),
value := 2
]
expected_dt[
location %in% c(
"Tehran 1986-1995", "Gilan 1956-1966", "Kermanshahan 1956",
"Khorasan 1956-1996", "Isfahan and Yazd 1956"
),
value := 3
]
expected_dt[
location %in% c("Markazi 1966-1976", "Fars and Ports 1956"),
value := 4
]
expected_dt[location %in% "Markazi 1956", value := 5]
expected_dt[location %in% "Iran (Islamic Republic of)", value := 31]
setkeyv(expected_dt, id_cols)
description <- "aggregation of categorical variable with multiple levels works
(only most-detailed levels of mapping included in input `dt`)"
test_that(description, {
output_dt <- agg(
dt = input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "location", col_type = "categorical",
mapping = iran_mapping
)
expect_identical(output_dt, expected_dt)
})
description <- "aggregation of categorical variable missing some leaf nodes in
mapping works when `missing_dt_severity = 'none'"
test_that(description, {
new_input_dt <- input_dt[!location %in% c("Tehran", "Alborz")]
setkeyv(new_input_dt, id_cols)
new_expected_dt <- expected_dt[
!location %in% c(
"Tehran 2006", "Tehran 1986-1995", "Markazi 1966-1976", "Markazi 1956",
"Iran (Islamic Republic of)"
)
]
setkeyv(new_expected_dt, id_cols)
# check severity
expect_error(
agg(
dt = new_input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "location", col_type = "categorical",
mapping = iran_mapping,
missing_dt_severity = "stop"
),
regexp = "expected input data is missing"
)
output_dt <- agg(
dt = new_input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "location", col_type = "categorical",
mapping = iran_mapping,
missing_dt_severity = "none"
)
expect_identical(output_dt, new_expected_dt)
})
# Test aggregating categorical variable with different interval id --------
# Inputs:
# - present day provinces (only the most detailed nodes) in Iran as defined in
# `iran_mapping`. 'Alborz' only has all-ages, all other provinces have 10-year
# age groups.
# see https://ihmeuw-demographics.github.io/hierarchyUtils/articles/hierarchyUtils.html#aggregate-locations-1
# for visualization of mapping
# Output:
# - all aggregate locations containing 'Alborz' should have all-ages row, all
# other aggregate locations should have 10-year age groups
id_cols <- c("location", "year", "age_start", "age_end")
value_cols <- c("value")
input_dt_iran2 <- rbind(
input_dt[
location == "Alborz"
, list(age_start = 0, age_end = Inf, value = 11),
by = c("location", "year")
],
input_dt[
location != "Alborz"
, list(age_start = seq(0, 100, 10), age_end = c(seq(10, 100, 10), Inf), value = 1),
by = c("location", "year")
]
)
setkeyv(input_dt_iran2, id_cols)
parent_alborz_locations <- c(
"Iran (Islamic Republic of)", "Markazi 1956", "Markazi 1966-1976", "Semnan",
"Markazi", "Tehran 1986-1995", "Tehran 2006", "Qom"
)
expected_dt_iran2 <- rbind(
expected_dt[
location %in% parent_alborz_locations
, list(age_start = 0, age_end = Inf, value = value * 11),
by = c("location", "year")
],
expected_dt[
!location %in% parent_alborz_locations
, list(age_start = seq(0, 100, 10), age_end = c(seq(10, 100, 10), Inf), value = value),
by = c("location", "year")
]
)
setkeyv(expected_dt_iran2, id_cols)
description <- "aggregating a categorical variable with varying interval id cols
works"
description <- "aggregation of categorical variable with multiple levels works
(only most-detailed levels of mapping included in input `dt`)"
test_that(description, {
output_dt <- agg(
dt = input_dt_iran2,
id_cols = id_cols, value_cols = value_cols,
col_stem = "location", col_type = "categorical",
mapping = iran_mapping,
collapse_interval_cols = TRUE
)
expect_identical(output_dt, expected_dt_iran2)
})
# Test `na_value_severity` argument ---------------------------------------
id_cols <- c("group", "age_start", "age_end")
value_cols <- "value"
input_dt <- data.table(
group = c(1, 1, 2, 2),
age_start = c(0, 1, 0, 1),
age_end = c(1, 2, 1, 2),
value = c(NA, 1, 2, 3)
)
setkeyv(input_dt, id_cols)
expected_dt <- data.table(
group = c(1, 2),
age_start = c(0, 0),
age_end = c(2, 2),
value = c(NA, 5)
)
setkeyv(expected_dt, id_cols)
description <- "aggregation correctly accounts for NA values with the
'na_value_severity' argument"
test_that(description, {
expect_error(
agg(
dt = input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "age", col_type = "interval",
mapping = data.table(age_start = 0, age_end = 2)
),
regexp = "input `value_cols` have 'NA' values"
)
expect_error(
agg(
dt = input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "age", col_type = "interval",
mapping = data.table(age_start = 0, age_end = 2),
na_value_severity = "none"
),
regexp = "expected input data is missing"
)
output_dt <- agg(
dt = input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "age", col_type = "interval",
mapping = data.table(age_start = 0, age_end = 2),
na_value_severity = "none",
missing_dt_severity = "skip"
)
new_expected_dt <- copy(expected_dt)
new_expected_dt[group == 1 & is.na(value), value := 1]
expect_equal(output_dt, new_expected_dt)
output_dt <- agg(
dt = input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "age", col_type = "interval",
mapping = data.table(age_start = 0, age_end = 2),
na_value_severity = "skip"
)
expect_equal(output_dt, expected_dt)
})
# Include NA in interval mapping ------------------------------------------
id_cols <- c("group", "age_start", "age_end")
value_cols <- "value"
input_dt <- data.table(
group = c(1, 1, 1, 2, 2, 2),
age_start = c(0, 1, NA, 0, 1, NA),
age_end = c(1, 2, NA, 1, 2, NA),
value = 1
)
setkeyv(input_dt, id_cols)
expected_dt <- data.table(
group = c(1, 2),
age_start = c(0, 0),
age_end = c(2, 2),
value = 2
)
setkeyv(expected_dt, id_cols)
description <- "aggregation correctly accounts for NA interval col_stem variables"
test_that(description, {
output_dt <- agg(
dt = input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "age", col_type = "interval",
mapping = data.table(age_start = 0, age_end = 2)
)
expect_equal(output_dt, expected_dt)
new_expected_dt <- copy(expected_dt)
new_expected_dt[, value := 3]
output_dt <- agg(
dt = input_dt,
id_cols = id_cols, value_cols = value_cols,
col_stem = "age", col_type = "interval",
mapping = data.table(age_start = 0, age_end = 2, include_NA = TRUE)
)
expect_equal(output_dt, new_expected_dt)
})
# Small special case tests ------------------------------------------------
# set up test input data.table
# 0-5, 5-10, 4-6 age groups
input_dt <- data.table(
year = 2010,
age_start = c(0, 5, 4), age_end = c(5, 10, 6),
value = 1
)
description <- "aggregating interval variables with weird overlapping intervals
errors out"
testthat::test_that(description, {
testthat::expect_error(
agg(
dt = input_dt,
id_cols = c("year", "age_start", "age_end"),
value_cols = "value",
col_stem = "age",
col_type = "interval",
mapping = data.table(age_start = 0, age_end = 10)
),
msg = "overlapping intervals were identified in `dt`"
)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.