tests/testthat/test-sample_strata.R

context("test-sample_strata")

library(dplyr)
library(optimall)

set.seed(2343)
data <- data.frame(
  "strata" = c(
    rep("a", times = 15),
    rep("b", times = 15),
    rep("c", times = 12)
  ),
  "y" = c(rnorm(30, sd = 1), rnorm(12, sd = 2)),
  "key" = rbinom(42, 1, 0.2),
  "id" = seq(1:42)
)
data$key[c(1, 16, 31)] <- 1 # To make sure no group gets zero in already_sampled

df <- allocate_wave(
  data = data, strata = "strata",
  already_sampled = "key", y = "y", nsample = 15
)

set.seed(384)
sampled_data <- sample_strata(
  data = data, strata = "strata",
  id = "id", design_data = df,
  already_sampled = "key", design_strata = "strata",
  n_allocated = "n_to_sample"
)

test_that("samples are properly allocated", {
  expect_equal(sum(sampled_data$sample_indicator == 1), 15)
  expect_equal(
    sort(as.vector(
      table(sampled_data[sampled_data$sample_indicator == 1, ]$strata)
    )),
    sort(df$n_to_sample)
  )
})

test_that("no samples in already_sampled are again sampled", {
  expect_equal(any(sampled_data[sampled_data$sample_indicator == 1, ]$id %in%
    data[data$key == 1, ]$id), FALSE)
})

test_that("same thing with new seed yields different sample", {
  set.seed(6589)
  sampled_design_data <- sample_strata(
    data = data, strata = "strata",
    id = "id", design_data = df,
    already_sampled = "key", design_strata = "strata",
    n_allocated = "n_to_sample"
  )
  expect_equal(
    length(setdiff(
      sampled_data[sampled_data$sample_indicator == 1, ]$id,
      sampled_design_data[sampled_design_data$sample_indicator == 1, ]$id
    )) > 0,
    TRUE
  )
})

test_that("works if already_sampled is NULL", {
  design_data <- data
  design_data$key <- rep(0, times = 42)
  df2 <- optimum_allocation(
    data = design_data, strata = "strata",
    y = "y", nsample = 15
  )
  set.seed(384)
  sampled_design_data <- sample_strata(
    data = design_data, strata = "strata",
    id = "id", design_data = df2,
    already_sampled = NULL, design_strata = "strata",
    n_allocated = "stratum_size"
  )
  expect_equal(sum(sampled_design_data$sample_indicator == 1), 15)
  expect_equal(
    sort(as.vector(table(
      sampled_design_data[sampled_design_data$sample_indicator == 1, ]$strata
    ))),
    sort(df2$stratum_size)
  )
})

test_that("Error messages displayed as necessary", {
  x <- c(1, 2, 3, 4, 5)
  expect_error(
    sampled_data <- sample_strata(
      data = x,
      strata = "strata",
      id = "id", design_data = df,
      already_sampled = "key",
      design_strata = "strata",
      n_allocated = "n_to_sample"
    ),
    "'data' and 'design_data' must be a dataframe"
  )
  expect_error(
    sampled_data <- sample_strata(
      data = data,
      strata = "strata_wrong",
      id = "id", design_data = df,
      already_sampled = "key",
      design_strata = "strata",
      n_allocated = "n_to_sample"
    ),
    "'strata' and 'id' must be strings matching a column"
  )
  expect_error(
    sampled_data <- sample_strata(
      data = data,
      strata = "strata",
      id = "id", design_data = df,
      already_sampled = "key",
      design_strata = "strata_wrong",
      n_allocated = "n_to_sample"
    ),
    "'design_strata' and 'n_allocated' must be strings matching"
  )
  df_extra_row <- rbind(df, df[3, ])
  expect_error(
    sampled_data <- sample_strata(
      data = data,
      strata = "strata",
      id = "id",
      design_data = df_extra_row,
      already_sampled = "key",
      design_strata = "strata",
      n_allocated = "n_to_sample"
    ),
    "'design_data' may only contain one row per stratum"
  )
  expect_error(
    sampled_data <- sample_strata(
      data = data,
      strata = "strata",
      id = "id",
      design_data = df,
      already_sampled = "key_wrong",
      design_strata = "strata",
      n_allocated = "n_to_sample"
    ),
    "If not NULL, 'already_sampled' must be a character string"
  )
  data$three_key <- rep(c(1, 2, 3), times = dim(data)[1] / 3)
  expect_error(
    sampled_data <- sample_strata(
      data = data,
      strata = "strata",
      id = "id",
      design_data = df,
      already_sampled = "three_key",
      design_strata = "strata",
      n_allocated = "n_to_sample"
    ),
    "has a binary indicator for whether"
  )
  data$wrong_two_key <- rep(c(2, 3), times = dim(data)[1] / 2)
  expect_error(
    sampled_data <- sample_strata(
      data = data,
      strata = "strata",
      id = "id",
      design_data = df,
      already_sampled = "wrong_two_key",
      design_strata = "strata",
      n_allocated = "n_to_sample"
    ),
    "'already_sampled' column must contain '1'"
  )
  data$wrong_two_key <- c(
    rep(1, times = dim(data)[1] - 6), 0, 0, 0,
    0, 0, 0
  )
  expect_error(
    sampled_data <- sample_strata(
      data = data,
      strata = "strata",
      id = "id",
      design_data = df,
      already_sampled = "wrong_two_key",
      design_strata = "strata",
      n_allocated = "n_to_sample"
    ),
    "Total sample size across waves, taken as"
  )



  df_wrong_strata_name <- df %>%
    dplyr::mutate(strata_new = case_when(
      strata == "a" ~ "a",
      strata == "b" ~ "b",
      strata == "c" ~ "c_2"
    ))
  expect_error(
    sampled_data <- sample_strata(
      data = data,
      strata = "strata",
      id = "id",
      design_data = df_wrong_strata_name,
      already_sampled = "key",
      design_strata = "strata_new",
      n_allocated = "n_to_sample"
    ),
    "strata names in 'design_data' must all match strata"
  )
})

test_that("works if input is a matrix", {
  sampled_data_mat <- sample_strata(
    data = data.matrix(data),
    strata = "strata",
    id = "id",
    design_data = data.matrix(df),
    already_sampled = "key",
    design_strata = "strata",
    n_allocated = "n_to_sample"
  )
  expect_equal(sum(sampled_data_mat$sample_indicator), 15)
})

test_that("returns error if n_allocated is not a whole number", {
  df$n_to_sample <- c(1.5, 2, 4)
  expect_error(
    sample_strata(
      data = data, strata = "strata",
      id = "id", design_data = df,
      already_sampled = "key", design_strata = "strata",
      n_allocated = "n_to_sample"
    ),
    "must specify a numeric column"
  )
  df$n_to_sample <- c("a", "b", "c")
  expect_error(
    sample_strata(
      data = data, strata = "strata",
      id = "id", design_data = df,
      already_sampled = "key", design_strata = "strata",
      n_allocated = "n_to_sample"
    ),
    "must specify a numeric column"
  )
})

test_that("wave argument is correctly accepted, and it rejects new name
  if column  with that name already exists", {
    df <- allocate_wave(
      data = data, strata = "strata",
      already_sampled = "key", y = "y", nsample = 15
    )
    sampled_data <- sample_strata(
      data = data, strata = "strata",
      id = "id", design_data = df,
      already_sampled = "key", design_strata = "strata", wave = "Wave2",
      n_allocated = "n_to_sample")
    expect_equal(names(sampled_data)[5], "sample_indicatorWave2")

    sampled_data <- sample_strata(
      data = sampled_data, strata = "strata",
      id = "id", design_data = df,
      already_sampled = "key", design_strata = "strata", wave = 2,
      n_allocated = "n_to_sample")
    expect_equal(names(sampled_data)[6], "sample_indicator2")

    sampled_data <- sample_strata(
      data = sampled_data, strata = "strata",
      id = "id", design_data = df,
      already_sampled = "key", design_strata = "strata", wave = "Wave2",
      n_allocated = "n_to_sample")
    expect_equal(names(sampled_data)[7], "sample_indicator")


})

test_that("probs argument is correctly accepted",{
  df <- allocate_wave(
    data = data, strata = "strata",
    already_sampled = "key", y = "y", nsample = 15,
    detailed = TRUE
  )
  df$sampprob <- df$n_to_sample/(df$npop - df$nsample_prior)
  sampled_data <- sample_strata(
    data = data, strata = "strata",
    id = "id", design_data = df,
    already_sampled = "key", design_strata = "strata",
    wave = "Wave2", probs = "sampprob",
    n_allocated = "n_to_sample")
  expect_equal(names(sampled_data)[6], "sampling_prob")

  expect_equal(names(table(df$sampprob)),
               names(table(sampled_data$sampling_prob)))
  expect_equal((sum(df$n_to_sample)),
               sum(table(sampled_data$sampling_prob)))

  expect_warning(sample_strata(
    data = sampled_data, strata = "strata",
    id = "id", design_data = df,
    already_sampled = "key", design_strata = "strata",
    wave = "Wave2", probs = "sampprob",
    n_allocated = "n_to_sample"), "Overwriting prior")

  # and still works if probs is given as a formula
  sampled_data <- sample_strata(
    data = data, strata = "strata",
    id = "id", design_data = df,
    already_sampled = "key", design_strata = "strata",
    wave = "Wave2", probs = ~n_to_sample/(npop-nsample_prior),
    n_allocated = "n_to_sample")
  expect_equal(names(sampled_data)[6], "sampling_prob")

  expect_equal(names(table(df$sampprob)),
               names(table(sampled_data$sampling_prob)))
  expect_equal((sum(df$n_to_sample)),
               sum(table(sampled_data$sampling_prob)))

  expect_error(sample_strata(
    data = data, strata = "strata",
    id = "id", design_data = df,
    already_sampled = "key", design_strata = "strata",
    wave = "Wave2", probs = ~n_to_sample/(npopppp-nsample_prior),
    n_allocated = "n_to_sample"), "Variables in probs formula must")
})

Try the optimall package in your browser

Any scripts or data that you put into this service are public.

optimall documentation built on June 22, 2024, 9:34 a.m.