tests/testthat/test_scale.R

# Scale categorical variable with varying interval id cols ----------------

# Inputs:
# - males: 5-calendar-year interval and 5-year age groups up to 95+
# - females: 1-calendar-year interval and 1-year age groups up to 95+
# - all sexes combined: 5-calendar year interval and 0-15, 15-60 and 60+ age
#     groups. Values are double the sum of male and female values.
# Output:
# - males: row values doubled
# - females: row values doubled
# - all sexes combined: unchanged

sex_mapping <- data.table(parent = "all", child = c("female", "male"))
id_cols <- c("year_start", "year_end", "sex", "age_start", "age_end")
value_cols <- c("value")

# set up test input data.table
input_dt_male <- CJ(
  year_start = 2005, year_end = 2010,
  sex = "male",
  age_start = seq(0, 95, 5),
  value = 25
)
input_dt_male[age_start == 95, value := 5]

input_dt_female <- CJ(
  year_start = 2005:2009,
  sex = "female",
  age_start = seq(0, 95, 1),
  value = 1
)
gen_end(
  input_dt_female,
  id_cols = setdiff(id_cols, c("year_end", "age_end")),
  col_stem = "year",
  right_most_endpoint = 2010
)

input_dt_both <- CJ(
  year_start = 2005, year_end = 2010,
  sex = "all",
  age_start = c(0, 15, 60)
)
input_dt_both[age_start == 0, value := 300]
input_dt_both[age_start == 15, value := 900]
input_dt_both[age_start == 60, value := 720]

input_dt <- rbind(input_dt_male, input_dt_female, input_dt_both)
gen_end(input_dt, id_cols = setdiff(id_cols, "age_end"), col_stem = "age")
setkeyv(input_dt, id_cols)

# set up expected output table
expected_dt <- copy(input_dt)
expected_dt[sex != "all", value := value * 2]
setkeyv(expected_dt, id_cols)

description <- "scaling a categorical variable with varying interval id cols
works"
test_that(description, {

  # since interval id columns are not collapsed to most common intervals
  expect_error(
    scale(
      dt = input_dt,
      id_cols = id_cols,
      value_cols = value_cols,
      col_stem = "sex",
      col_type = "categorical",
      mapping = sex_mapping
    ),
    regexp = "expected input data is missing"
  )

  output_dt <- scale(
    dt = input_dt,
    id_cols = id_cols,
    value_cols = value_cols,
    col_stem = "sex",
    col_type = "categorical",
    mapping = sex_mapping,
    collapse_interval_cols = T
  )
  expect_identical(output_dt, expected_dt)
})

description <- "error is thrown when scaling a categorical variable and
levels are missing"
test_that(description, {

  new_input_dt <- input_dt[!(sex == "female" & year_start == 2008)]
  expect_error(
    scale(
      dt = new_input_dt,
      id_cols = id_cols,
      value_cols = value_cols,
      col_stem = "sex",
      col_type = "categorical",
      mapping = sex_mapping,
      collapse_interval_cols = T
    ),
    regexp = "intervals in `dt` are missing making it impossible to collapse"
  )

  new_input_dt <- input_dt[sex != "all"]
  expect_error(
    scale(
      dt = new_input_dt,
      id_cols = id_cols,
      value_cols = value_cols,
      col_stem = "sex",
      col_type = "categorical",
      mapping = sex_mapping,
      collapse_interval_cols = T
    ),
    regexp = "expected input data is missing"
  )
})

description <- "error is thrown when scaling a categorical variable with missing
id interval columns or is silent and scales possible intervals if requested to
not error out"
test_that(description, {

  new_input_dt <- input_dt[!(age_start == 24 & year_start == 2008) &
                             !(age_start == 46 & year_start == 2006)]

  expect_error(
    scale(
      dt = new_input_dt,
      id_cols = id_cols,
      value_cols = value_cols,
      col_stem = "sex",
      col_type = "categorical",
      mapping = sex_mapping,
      collapse_interval_cols = T
    ),
    regexp = "intervals in `dt` are missing making it impossible to collapse"
  )

  new_expected_dt <- copy(expected_dt)
  new_expected_dt <- new_expected_dt[!(age_start == 24 & year_start == 2008) &
                                       !(age_start == 46 & year_start == 2006)]
  new_expected_dt[between(age_start, 15, 59) & sex != "all", value := value / 2]
  output_dt <- scale(
    dt = new_input_dt,
    id_cols = id_cols,
    value_cols = value_cols,
    col_stem = "sex",
    col_type = "categorical",
    mapping = sex_mapping,
    missing_dt_severity = "none",
    collapse_interval_cols = T
  )
  expect_identical(output_dt, new_expected_dt)
})

# Scale interval variable with varying interval id cols -------------------

# Inputs:
# - 1st level
#   - 5-calendar-year interval and all-ages (value = 4 * 480 = 1920)
# - 2nd level (males only)
#   - 5-calendar-year interval and 5-year age groups up to 90-95 (value = 2 * 25 = 50)
#   - 5-calendar year interval and terminal age group 95+ (value = 2 * 5 = 10)
# - 2nd level (females only)
#   - 1-calendar year interval and 5-year age groups up to 95+ (value = 2 * 5 = 10)
#   - 1-calendar year interval and terminal age group 95+ (value = 2 * 1 = 2)
# - 3rd level (females only)
#   - 1-calendar year interval and 1-year age groups up to 95+ (value = 1 * 1 = 1)
# Output:
# - 1st level
#   - 5-calendar-year interval and all-ages (value = 4 * 480 = 1920)
# - 2nd level (males only)
#   - 5-calendar-year interval and 5-year age groups up to 90-95 (value = 2 * original = 100)
#   - 5-calendar year interval and terminal age group 95+ (value = 2 * original = 20)
# - 2nd level (females only)
#   - 1-calendar year interval and 5-year age groups up to 95+ (value = 2 * original = 20)
#   - 1-calendar year interval and terminal age group 95+ (value = 2 * original = 4)
# - 3rd level (females only)
#   - 1-calendar year interval and 1-year age groups up to 95+ (value = 4 * original = 4)

id_cols <- c("year_start", "year_end", "sex", "age_start", "age_end")
value_cols <- c("value")

# total number of most detailed age and year groupings
total_age_years <- length(seq(2005, 2009, 1)) * length(seq(0, 95, 1))

# create 1st level for males
input_dt_male1 <- data.table::CJ(
  year_start = 2005,
  year_end = 2010,
  sex = "male",
  age_start = 0,
  age_end = Inf,
  value = total_age_years
)

# create 2nd level for males
input_dt_male2 <- data.table::CJ(
  year_start = 2005,
  year_end = 2010,
  sex = "male",
  age_start = seq(0, 95, 5)
)
gen_end(input_dt_male2, setdiff(id_cols, "age_end"), col_stem = "age")
gen_length(input_dt_male2, col_stem = "age")
input_dt_male2[is.infinite(age_length), age_length := 1]
gen_length(input_dt_male2, col_stem = "year")
input_dt_male2[, value := age_length * year_length]
input_dt_male2[, c("age_length", "year_length") := NULL]

# create 1st level for females
input_dt_female1 <- data.table::CJ(
  year_start = 2005,
  year_end = 2010,
  sex = "female",
  age_start = 0,
  age_end = Inf,
  value = total_age_years
)

# create 2nd level for females
input_dt_female2 <- data.table::CJ(
  year_start = 2005:2009,
  sex = "female",
  age_start = seq(0, 95, 5)
)
gen_end(
  input_dt_female2, setdiff(id_cols, c("age_end", "year_end")),
  col_stem = "year", right_most_endpoint = 2010
)
gen_end(input_dt_female2, setdiff(id_cols, "age_end"), col_stem = "age")
gen_length(input_dt_female2, col_stem = "age")
input_dt_female2[is.infinite(age_length), age_length := 1]
gen_length(input_dt_female2, col_stem = "year")
input_dt_female2[, value := age_length * year_length]
input_dt_female2[, c("age_length", "year_length") := NULL]

# create 3rd level for females
input_dt_female3 <- data.table::CJ(
  year_start = 2005:2009,
  sex = "female",
  age_start = 0:94
)
gen_end(
  input_dt_female3, setdiff(id_cols, c("age_end", "year_end")),
  col_stem = "year", right_most_endpoint = 2010
)
gen_end(
  input_dt_female3, setdiff(id_cols, "age_end"),
  col_stem = "age", right_most_endpoint = 95
)
gen_length(input_dt_female3, col_stem = "age")
gen_length(input_dt_female3, col_stem = "year")
input_dt_female3[, value := age_length * year_length]
input_dt_female3[, c("age_length", "year_length") := NULL]

# combine together
input_dt <- list(
  "male1" = input_dt_male1,
  "male2" = input_dt_male2,
  "female1" = input_dt_female1,
  "female2" = input_dt_female2,
  "female3" = input_dt_female3
)

# modify values so that scaling will correct for scalars applied
input_dt[["male1"]][, value := value * 4]
input_dt[["female1"]][, value := value * 4]
input_dt[["male2"]][, value := value * 2]
input_dt[["female2"]][, value := value * 2]

# modify values expected after scaling applied
expected_dt <- copy(input_dt)
expected_dt[["male2"]][, value := value * 2]
expected_dt[["female2"]][, value := value * 2]
expected_dt[["female3"]][, value := value * 4]

# final formatting
input_dt <- rbindlist(input_dt)
setkeyv(input_dt, id_cols)
expected_dt <- rbindlist(expected_dt)
setkeyv(expected_dt, id_cols)

description <- "scaling an interval variable with varying interval id cols
works"
testthat::test_that(description, {
  output_dt <- scale(
    dt = input_dt,
    id_cols = id_cols,
    value_cols = value_cols,
    col_stem = "age",
    col_type = "interval",
    collapse_interval_cols = T
  )
  testthat::expect_identical(output_dt, expected_dt)
})

description <- "throws error when scaling an interval variable and there are
missing intervals"
testthat::test_that(description, {

  # missing one single year age group in females only
  new_input_dt <- input_dt[!(age_start == 24 & year_start == 2008)]

  new_expected_dt <- expected_dt[!(age_start == 24 & year_start == 2008)]
  gen_length(new_expected_dt, "age")
  new_expected_dt[between(age_start, 20, 24) & age_length == 1,
                  value := value / 4]
  new_expected_dt[, age_length := NULL]

  expect_error(
    scale(
      dt = new_input_dt,
      id_cols = id_cols,
      value_cols = value_cols,
      col_stem = "age",
      col_type = "interval",
      collapse_interval_cols = TRUE
    ),
    regexp = "intervals in `dt` are missing making it impossible to collapse"
  )

  output_dt <- scale(
    dt = new_input_dt,
    id_cols = id_cols,
    value_cols = value_cols,
    col_stem = "age",
    col_type = "interval",
    missing_dt_severity = "none",
    collapse_interval_cols = TRUE
  )
  expect_identical(output_dt, new_expected_dt)
})

# Scale multiplicative values ---------------------------------------------

sex_mapping <- data.table(parent = "all", child = c("female", "male"))
id_cols <- c("year", "sex")
value_cols <- "value"

# set up test input data.table
input_dt <- CJ(year = 2010, sex = c("female", "male"), value = 0.9)
input_dt_both <- CJ(year = 2010, sex = "all", value = 0.95)
input_dt <- rbind(input_dt, input_dt_both, use.names = T)
setkeyv(input_dt, id_cols)

# set up expected output table
expected_dt <- CJ(year = 2010, sex = c("female", "male"), value = sqrt(0.95))
expected_dt <- rbind(expected_dt, input_dt_both, use.names = T)
setkeyv(expected_dt, id_cols)

description <- "scaling using 'prod' as the aggregation function works"
test_that(description, {
  output_dt <- scale(
    dt = input_dt,
    id_cols = id_cols,
    value_cols = "value",
    col_stem = "sex",
    col_type = "categorical",
    mapping = sex_mapping,
    agg_function = prod
  )
  expect_identical(output_dt, expected_dt)
})

# Scale categorical variable with multiple levels in mapping --------------

# Inputs:
# - all present day & historical provinces in Iran as defined in `iran_mapping`
#   includes all the nodes in the mapping https://ihmeuw-demographics.github.io/hierarchyUtils/articles/hierarchyUtils.html#aggregate-locations-1
# - each province's value is equal to the number of present day provinces that make up the province
# - Iran national is equal to 2 times the number of present day provinces
# Output:
# - Iran national is the same
# - All provinces are equal to 2 times the original value

id_cols <- c("location", "year")
value_cols <- c("value")

# set up test input data.table so that value is equal to number of child nodes
# for each location
input_dt <- CJ(
  location = iran_mapping[, unique(c(child, parent))],
  year = 2011,
  value = 1
)
input_dt[
  location %in% c(
    "Tehran 2006", "Zanjan 1976-1996", "Mazandaran 1956-1996",
    "East Azarbayejan 1956-1986", "Khuzestan and Lorestan 1956",
    "Isfahan and Yazd 1966"
  ),
  value := 2
]
input_dt[
  location %in% c(
    "Tehran 1986-1995", "Gilan 1956-1966", "Kermanshahan 1956",
    "Khorasan 1956-1996", "Isfahan and Yazd 1956"
  ),
  value := 3
]
input_dt[location %in% c("Markazi 1966-1976", "Fars and Ports 1956"), value := 4]
input_dt[location %in% "Markazi 1956", value := 5]
input_dt[location %in% "Iran (Islamic Republic of)", value := 31]

# multiply top node by 2 to force scaling
input_dt[location == "Iran (Islamic Republic of)", value := value * 2]
setkeyv(input_dt, id_cols)

# set up expected output table
expected_dt <- copy(input_dt)
expected_dt[location != "Iran (Islamic Republic of)", value := value * 2]
setkeyv(expected_dt, id_cols)

description <- "scaling of categorical variable with multiple levels works"
test_that(description, {
  output_dt <- scale(
    dt = input_dt,
    id_cols = id_cols,
    value_cols = value_cols,
    col_stem = "location",
    col_type = "categorical",
    mapping = iran_mapping
  )
  expect_identical(output_dt, expected_dt)
})

description <- "scaling of categorical variable with missing intermediate levels
works when `collapse_missing=TRUE` and throws an error otherwise"
test_that(description, {

  # only include the present day provinces and the national level
  new_input_dt <- input_dt[!grepl("[0-9]+", location)]
  new_expected_dt <- expected_dt[!grepl("[0-9]+", location)]

  expect_error(
    scale(
      dt = new_input_dt,
      id_cols = id_cols,
      value_cols = value_cols,
      col_stem = "location",
      col_type = "categorical",
      mapping = iran_mapping
    ),
    regexp = "expected input data is missing"
  )

  output_dt <- scale(
    dt = new_input_dt,
    id_cols = id_cols,
    value_cols = value_cols,
    col_stem = "location",
    col_type = "categorical",
    mapping = iran_mapping,
    collapse_missing = TRUE
  )
  expect_identical(output_dt, new_expected_dt)
})

description <- "scaling of categorical variable with top level missing throws an
error"
test_that(description, {

  new_input_dt <- input_dt[!location %in% c("Iran (Islamic Republic of)")]

  expect_error(
    scale(
      dt = new_input_dt,
      id_cols = id_cols,
      value_cols = value_cols,
      col_stem = "location",
      col_type = "categorical",
      mapping = iran_mapping
    ),
    regexp = "expected input data is missing"
  )
  expect_error(
    scale(
      dt = new_input_dt,
      id_cols = id_cols,
      value_cols = value_cols,
      col_stem = "location",
      col_type = "categorical",
      mapping = iran_mapping,
      collapse_missing = TRUE
    ),
    regexp = "expected input data is missing"
  )
})

# Test `na_value_severity` argument ---------------------------------------

id_cols <- c("group", "age_start", "age_end")
value_cols <- "value"

input_dt <- data.table(
  group = c(1, 1, 1, 2, 2, 2),
  age_start = c(0, 1, 0, 0, 1, 0),
  age_end = c(1, 2, 2, 1, 2, 2),
  value = c(NA, 1, 2, 2, 3, 10)
)
setkeyv(input_dt, id_cols)

expected_dt <- copy(input_dt)
expected_dt[!(age_start == 0 & age_end == 2), value := value * 2]
setkeyv(expected_dt, id_cols)

description <- "aggregation correctly accounts for NA values with the
'na_value_severity' argument"
test_that(description, {
  expect_error(
    scale(
      dt = input_dt,
      id_cols = id_cols, value_cols = value_cols,
      col_stem = "age", col_type = "interval"
    ),
    regexp = "input `value_cols` have 'NA' values"
  )

  expect_error(
    scale(
      dt = input_dt,
      id_cols = id_cols, value_cols = value_cols,
      col_stem = "age", col_type = "interval",
      na_value_severity = "none"
    ),
    regexp = "expected input data is missing"
  )
  output_dt <- scale(
    dt = input_dt,
    id_cols = id_cols, value_cols = value_cols,
    col_stem = "age", col_type = "interval",
    na_value_severity = "none",
    missing_dt_severity = "skip"
  )
  new_expected_dt <- copy(expected_dt)
  new_expected_dt <- new_expected_dt[!is.na(value)]
  expect_equal(output_dt, new_expected_dt)

  output_dt <- scale(
    dt = input_dt,
    id_cols = id_cols, value_cols = value_cols,
    col_stem = "age", col_type = "interval",
    na_value_severity = "skip"
  )
  new_expected_dt <- copy(expected_dt)
  new_expected_dt[group == 1 & !(age_start == 0 & age_end == 2), value := NA]
  expect_equal(output_dt, new_expected_dt)
})

# Small special case tests ------------------------------------------------

mapping <- data.table(child = 2:3, parent = 1)
input_dt <- data.table(location_id = 1:3, sex = "all")
input_dt[, value := rnorm(.N)]
test_that("scaling a numeric 'categorical' variable works", {
  expect_error(
    scale(
      dt = input_dt,
      id_cols = c("location_id", "sex"),
      value_cols = "value",
      col_stem = "location_id",
      col_type = "categorical",
      mapping = mapping
    ),
    NA
  )
})

# set up test input data.table
# 0-4, 5-10, 0-10 age groups
input_dt <- data.table(
  year = 2010,
  age_start = c(0, 5, 0), age_end = c(4, 10, 10),
  value = 1
)
description <- "scaling interval variables with weird overlapping intervals
errors out"
testthat::test_that(description, {
  testthat::expect_error(
    scale(
      dt = input_dt,
      id_cols = c("year", "age_start", "age_end"),
      value_cols = "value",
      col_stem = "age",
      col_type = "interval"
    ),
    regexp = "expected input data is missing"
  )
})

# set up test input data.table
# 0-5, 5-10, 4-6, 0-10 age groups
input_dt <- data.table(
  year = 2010,
  age_start = c(0, 5, 4, 0),
  age_end = c(5, 10, 6, 10),
  value = 1
)
description <- "scaling age intervals errors out when given
weird overlapping intervals"
testthat::test_that(description, {
  testthat::expect_error(
    scale(
      dt = input_dt,
      id_cols = c("year", "age_start", "age_end"),
      value_cols = "value",
      col_stem = "age",
      col_type = "interval"
    ),
    regexp = "Some overlapping intervals are in `dt`"
  )
})
ihmeuw-demographics/hierarchyUtils documentation built on June 20, 2024, 7:18 a.m.