# infer: Tidy Statistical Inference

test_that("t_test works", {
  # Reshape a base-R `htest` object into the column layout that the
  # `t_test()` results are compared against below.
  as_t_test_row <- function(htest) {
    htest |>
      broom::glance() |>
      dplyr::select(
        statistic,
        t_df = parameter,
        p_value = p.value,
        alternative,
        estimate,
        lower_ci = conf.low,
        upper_ci = conf.high
      )
  }

  # ---- Two sample ----
  # The code inside expect_snapshot() is recorded in the snapshot, so it is
  # kept exactly as written.
  expect_snapshot(res_ <- gss_tbl |> t_test(hours ~ sex))

  # Quoted column names are expected to raise an error.
  expect_snapshot(
    error = TRUE,
    gss_tbl |> t_test(response = "hours", explanatory = "sex")
  )

  # The formula interface, the response/explanatory interface and
  # stats::t.test() must all agree.
  via_formula <- gss_tbl |>
    t_test(hours ~ sex, order = c("male", "female"))
  via_args <- gss_tbl |>
    t_test(response = hours, explanatory = sex, order = c("male", "female"))
  via_base <- as_t_test_row(t.test(hours ~ sex, data = gss_tbl))

  expect_equal(via_formula, via_args, tolerance = 1e-5)
  expect_equal(via_formula, via_base, tolerance = 1e-5)

  # Reversing `order` must mirror the interval and negate the statistic.
  reversed <- gss_tbl |>
    t_test(hours ~ sex, order = c("female", "male"))
  expect_equal(via_formula[["lower_ci"]], -reversed[["upper_ci"]])
  expect_equal(via_formula[["statistic"]], -reversed[["statistic"]])

  # ---- One sample ----
  one_formula <- t_test(gss_tbl, hours ~ NULL, mu = 0)
  one_args <- t_test(gss_tbl, response = hours, mu = 0)
  one_base <- as_t_test_row(t.test(x = gss_tbl$hours, mu = 0))

  expect_equal(one_formula, one_args, tolerance = 1e-5)
  expect_equal(one_formula, one_base, tolerance = 1e-5)
})

test_that("chisq_test works", {
  # Reshape a base-R chi-squared `htest` into the three columns that the
  # `chisq_test()` results are compared against below.
  as_chisq_row <- function(htest) {
    htest |>
      broom::glance() |>
      dplyr::select(statistic, chisq_df = parameter, p_value = p.value)
  }

  # ---- Independence ----
  expect_silent(
    chisq_test(gss_tbl, college ~ partyid)
  )
  indep_formula <- chisq_test(gss_tbl, college ~ partyid)
  indep_args <- chisq_test(gss_tbl, response = college, explanatory = partyid)
  indep_base <- as_chisq_row(
    chisq.test(x = table(gss_tbl$partyid, gss_tbl$college))
  )

  # `eps` is not defined in this block; it is expected to be supplied by the
  # surrounding test setup.
  expect_equal(indep_formula, indep_args, tolerance = eps)
  expect_equal(indep_formula, indep_base, tolerance = eps)

  # ---- Goodness of fit ----
  expect_silent(
    chisq_test(gss_tbl, response = partyid, p = c(.3, .4, .3))
  )
  gof_formula <- chisq_test(gss_tbl, partyid ~ NULL, p = c(.3, .4, .3))
  gof_args <- chisq_test(gss_tbl, response = partyid, p = c(.3, .4, .3))
  gof_base <- as_chisq_row(
    chisq.test(x = table(gss_tbl$partyid), p = c(.3, .4, .3))
  )

  expect_equal(gof_formula, gof_args, tolerance = 1e-5)
  expect_equal(gof_formula, gof_base, tolerance = 1e-5)

  # A numeric response must be rejected.
  expect_snapshot(
    error = TRUE,
    chisq_test(x = gss_tbl, response = age, explanatory = partyid)
  )

  # A numeric explanatory variable must be rejected.
  expect_snapshot(
    error = TRUE,
    chisq_test(x = gss_tbl, response = partyid, explanatory = age)
  )
})

test_that("_stat functions work", {
  # The code inside each expect_snapshot() call is recorded in the snapshot,
  # so those expressions (including the names they assign) are left exactly
  # as they were. Only the surrounding code is touched.

  # Test of independence
  expect_snapshot(
    res_ <- gss_tbl |> chisq_stat(college ~ partyid)
  )

  another_way <- gss_tbl |>
    chisq_test(college ~ partyid) |>
    dplyr::select(statistic)

  expect_snapshot(
    obs_stat_way <- gss_tbl |> chisq_stat(college ~ partyid)
  )
  one_more <- chisq.test(
    table(gss_tbl$partyid, gss_tbl$college)
  )$statistic

  # chisq_stat() must agree with both chisq_test() and stats::chisq.test()
  expect_equal(dplyr::pull(another_way), obs_stat_way, ignore_attr = TRUE)
  expect_equal(one_more, obs_stat_way, ignore_attr = TRUE)

  # Goodness of Fit
  new_way <- gss_tbl |>
    chisq_test(partyid ~ NULL) |>
    dplyr::select(statistic)

  expect_snapshot(
    obs_stat_way <- gss_tbl |>
      chisq_stat(partyid ~ NULL)
  )
  expect_snapshot(
    obs_stat_way_alt <- gss_tbl |>
      chisq_stat(response = partyid)
  )

  expect_equal(dplyr::pull(new_way), obs_stat_way, ignore_attr = TRUE)
  expect_equal(dplyr::pull(new_way), obs_stat_way_alt, ignore_attr = TRUE)

  # robust to the named vector: naming the entries of `p` must give the same
  # result as passing the same values unnamed
  unordered_p <- gss_tbl |>
    chisq_test(response = partyid, p = c(.2, .3, .5))
  ordered_p <- gss_tbl |>
    chisq_test(response = partyid, p = c(ind = .2, rep = .3, dem = .5))

  expect_equal(unordered_p, ordered_p, ignore_attr = TRUE)

  # Two sample t
  expect_snapshot(
    res_ <- gss_tbl |>
      t_stat(
        hours ~ sex,
        order = c("male", "female")
      )
  )
  # `pull()` is namespace-qualified to match the other dplyr calls in this
  # file and to avoid depending on dplyr being attached.
  another_way <- gss_tbl |>
    t_test(hours ~ sex, order = c("male", "female")) |>
    dplyr::select(statistic) |>
    dplyr::pull()

  expect_snapshot(
    obs_stat_way <- gss_tbl |>
      t_stat(hours ~ sex, order = c("male", "female"))
  )

  expect_snapshot(
    obs_stat_way_alt <- gss_tbl |>
      t_stat(response = hours, explanatory = sex, order = c("male", "female"))
  )

  expect_equal(another_way, obs_stat_way, ignore_attr = TRUE)
  expect_equal(another_way, obs_stat_way_alt, ignore_attr = TRUE)

  # One sample t
  expect_snapshot(
    res_ <- gss_tbl |> t_stat(hours ~ NULL)
  )

  another_way <- gss_tbl |>
    t_test(hours ~ NULL) |>
    dplyr::select(statistic) |>
    dplyr::pull()

  expect_snapshot(
    obs_stat_way <- gss_tbl |>
      t_stat(hours ~ NULL)
  )
  expect_snapshot(
    obs_stat_way_alt <- gss_tbl |>
      t_stat(response = hours)
  )

  expect_equal(another_way, obs_stat_way, ignore_attr = TRUE)
  expect_equal(another_way, obs_stat_way_alt, ignore_attr = TRUE)

  # chisq_stat() must error when the response is numeric
  expect_snapshot(
    error = TRUE,
    res_ <- chisq_stat(x = gss_tbl, response = age, explanatory = sex)
  )

  # chisq_stat() must error when the explanatory variable is numeric
  expect_snapshot(
    error = TRUE,
    res_ <- chisq_stat(x = gss_tbl, response = sex, explanatory = age)
  )
})

test_that("conf_int argument works", {
  # Column names are compared with expect_named(): the previous
  # expect_equal(names(.), ., tolerance = 1e-5) passed a numeric tolerance to
  # a character comparison, where it has no effect.

  # conf_int = FALSE drops the interval columns
  expect_named(
    gss_tbl |>
      t_test(hours ~ sex, order = c("male", "female"), conf_int = FALSE),
    c("statistic", "t_df", "p_value", "alternative", "estimate")
  )

  # conf_int = TRUE appends lower_ci / upper_ci
  expect_named(
    gss_tbl |>
      t_test(
        hours ~ sex,
        order = c("male", "female"),
        conf_int = TRUE
      ),
    c(
      "statistic",
      "t_df",
      "p_value",
      "alternative",
      "estimate",
      "lower_ci",
      "upper_ci"
    )
  )

  # conf_level must be forwarded: compare against stats::t.test() at 90%
  ci_test <- gss_tbl |>
    t_test(
      hours ~ sex,
      order = c("male", "female"),
      conf_int = TRUE,
      conf_level = 0.9
    )
  old_way <- t.test(
    formula = hours ~ sex,
    data = gss_tbl,
    conf.level = 0.9
  )[["conf.int"]]
  expect_equal(ci_test$lower_ci[1], old_way[1], tolerance = 1e-5)
  expect_equal(ci_test$upper_ci[1], old_way[2], tolerance = 1e-5)

  # a conf_level above 1 must raise an error
  expect_snapshot(
    error = TRUE,
    res_ <- gss_tbl |>
      t_test(
        hours ~ sex,
        order = c("female", "male"),
        conf_int = TRUE,
        conf_level = 1.1
      )
  )

  # Check that var.equal produces different results
  # Thanks for finding this @EllaKaye!
  gss_tbl_small <- gss_tbl |> dplyr::slice(1:6, 90:100)

  expect_snapshot(
    no_var_equal <- gss_tbl_small |>
      t_stat(hours ~ sex, order = c("female", "male"))
  )

  expect_snapshot(
    var_equal <- gss_tbl_small |>
      t_stat(
        hours ~ sex,
        order = c("female", "male"),
        var.equal = TRUE
      )
  )

  expect_false(no_var_equal == var_equal)

  # same check through the specify() |> calculate() route
  shortcut_no_var_equal <- gss_tbl_small |>
    specify(hours ~ sex) |>
    calculate(stat = "t", order = c("female", "male"))

  shortcut_var_equal <- gss_tbl_small |>
    specify(hours ~ sex) |>
    calculate(
      stat = "t",
      order = c("female", "male"),
      var.equal = TRUE
    )
  expect_false(shortcut_no_var_equal == shortcut_var_equal)
})

# Shared fixtures for the prop.test wrapper tests.
# `df`: two groups ("a", "b") of 500 rows each; group "a" has 450 "c" and
# 50 "d" responses, group "b" has 400 "c" and 100 "d".
df <- data.frame(
  exp = rep(c("a", "b"), times = c(500, 500)),
  resp = rep(c("c", "d", "c", "d"), times = c(450, 50, 400, 100)),
  stringsAsFactors = FALSE
)

# Contingency table of `df`, used as input to stats::prop.test().
sum_df <- table(df)

# Invalid input: numeric response.
bad_df <- data.frame(resp = seq_len(5), exp = head(letters, 5))

# Invalid input: numeric explanatory variable.
bad_df2 <- data.frame(resp = head(letters, 5), exp = seq_len(5))

# Same data as `df`, with the response stored as a logical ("c" -> TRUE).
df_l <- dplyr::mutate(df, resp = resp == "c")

test_that("two sample prop_test works", {
  # ---- default arguments: prop_test() must agree with stats::prop.test() ----
  stats_default <- prop.test(sum_df)
  infer_default <- prop_test(df, resp ~ exp, order = c("a", "b"))

  expect_equal(
    stats_default[["statistic"]],
    infer_default[["statistic"]],
    tolerance = .001
  )
  expect_equal(stats_default[["parameter"]], infer_default[["chisq_df"]])
  expect_equal(
    stats_default[["p.value"]],
    infer_default[["p_value"]],
    tolerance = .001
  )

  # Leaving `order` unspecified: the resulting condition output is captured
  # in the snapshot.
  expect_snapshot(res_ <- prop_test(df, resp ~ exp))

  # ---- `p` must be handled the same way by both functions ----
  stats_p <- prop.test(sum_df, p = c(.1, .1))
  infer_p <- prop_test(df, resp ~ exp, order = c("a", "b"), p = c(.1, .1))

  expect_equal(
    stats_p[["statistic"]],
    infer_p[["statistic"]],
    tolerance = .001
  )
  expect_equal(stats_p[["parameter"]], infer_p[["chisq_df"]])
  expect_equal(stats_p[["p.value"]], infer_p[["p_value"]], tolerance = .001)

  # ---- confidence interval argument ----
  ci_a_b <- prop_test(df, resp ~ exp, order = c("a", "b"), conf_int = TRUE)
  expect_length(ci_a_b, 6)
  expect_length(infer_p, 4)

  # ---- reversing `order` mirrors the interval ----
  ci_b_a <- prop_test(df, resp ~ exp, order = c("b", "a"), conf_int = TRUE)
  expect_equal(ci_b_a[["lower_ci"]], -ci_a_b[["upper_ci"]], tolerance = .001)

  # ---- invalid column types must error ----
  expect_snapshot(error = TRUE, res_ <- prop_test(bad_df, resp ~ exp))
  expect_snapshot(error = TRUE, res_ <- prop_test(bad_df2, resp ~ exp))

  # ---- switching `success` to the other level mirrors the interval ----
  ci_success_d <- prop_test(
    df,
    resp ~ exp,
    order = c("a", "b"),
    success = "d",
    conf_int = TRUE
  )
  expect_equal(
    ci_a_b[["upper_ci"]],
    -ci_success_d[["lower_ci"]],
    tolerance = .001
  )

  # ---- logical responses: the default success level behaves like "TRUE" ----
  lgl_default <- prop_test(df_l, resp ~ exp, order = c("b", "a"))
  lgl_true <- prop_test(
    df_l,
    resp ~ exp,
    order = c("b", "a"),
    success = "TRUE"
  )
  lgl_false <- prop_test(
    df_l,
    resp ~ exp,
    order = c("b", "a"),
    success = "FALSE"
  )

  expect_equal(lgl_default$lower_ci, lgl_true$lower_ci)
  expect_equal(lgl_default$lower_ci, -lgl_false$upper_ci)
})

# ...and some data for the one sample wrapper: the response column of `df`
# on its own. `select()` is namespace-qualified like the other dplyr calls in
# this file, so the fixture does not depend on dplyr being attached.
df_1 <- df |>
  dplyr::select(resp)

# One-way table of the response, used as input to stats::prop.test().
sum_df_1 <- table(df_1)

test_that("one sample prop_test works", {
  # check that results with default args are the same
  base <- prop.test(sum_df_1)
  infer <- prop_test(df_1, resp ~ NULL, p = .5)
  expect_equal(base[["statistic"]], infer[["statistic"]], tolerance = .001)
  expect_equal(base[["parameter"]], infer[["chisq_df"]])
  expect_equal(base[["p.value"]], infer[["p_value"]], tolerance = .001)

  # check that the functions respond to "p" in the same way
  base2 <- prop.test(sum_df_1, p = .86)
  infer2 <- prop_test(df_1, resp ~ NULL, p = .86)
  expect_equal(base2[["statistic"]], infer2[["statistic"]], tolerance = .001)
  expect_equal(base2[["parameter"]], infer2[["chisq_df"]])
  expect_equal(base2[["p.value"]], infer2[["p_value"]], tolerance = .001)

  # expect message for unspecified p
  expect_snapshot(res_ <- prop_test(df_1, resp ~ NULL))

  # check that the success argument changes output.
  # Testing level "c" against p = .2 and level "d" against p = .8 are
  # complementary hypotheses on a two-level response, so the two calls must
  # agree on the statistic and the p-value. Comparing only `chisq_df` cannot
  # detect a `success` argument that is ignored, because the degrees of
  # freedom do not depend on it; the statistic does.
  infer3 <- prop_test(df_1, resp ~ NULL, p = .2, success = "c")
  infer4 <- prop_test(df_1, resp ~ NULL, p = .8, success = "d")
  expect_equal(infer3[["chisq_df"]], infer4[["chisq_df"]], tolerance = .001)
  expect_equal(infer3[["statistic"]], infer4[["statistic"]], tolerance = .001)
  expect_equal(infer3[["p_value"]], infer4[["p_value"]], tolerance = .001)

  # a `success` value that is not a level of the response must error
  expect_snapshot(
    error = TRUE,
    res_ <- prop_test(df_1, resp ~ NULL, p = .2, success = "b")
  )
})

test_that("prop_test output dimensionality is correct", {
  # One-sample results, chi-squared and z flavours.
  one_chisq <- prop_test(df, resp ~ NULL, p = .5)
  one_z <- prop_test(df, resp ~ NULL, p = .5, z = TRUE)

  # Two-sample results: default, without an interval, and z flavour.
  two_chisq <- prop_test(df, resp ~ exp, order = c("a", "b"))
  two_no_ci <- prop_test(
    df,
    resp ~ exp,
    order = c("a", "b"),
    conf_int = FALSE
  )
  two_z <- prop_test(df, resp ~ exp, order = c("a", "b"), z = TRUE)

  # In both settings the z result is one column narrower than the
  # chi-squared result.
  expect_length(one_chisq, 4)
  expect_length(one_chisq, length(one_z) + 1)
  expect_length(two_chisq, 6)
  expect_length(two_no_ci, 4)
  expect_length(two_z, length(two_chisq) - 1)
})

test_that("prop_test handles >2 explanatory levels gracefully", {
  # Seeded random data: a three-level explanatory variable and a two-level
  # response. The seed and the order of the two sample() calls fix the data.
  set.seed(1)
  dfr <-
    tibble::tibble(
      exp = sample(c("a", "b", "c"), 100, replace = TRUE),
      resp = sample(c("d", "e"), 100, replace = TRUE)
    )

  # reference result from stats::prop.test() on the contingency table
  res_old <- prop.test(table(dfr))

  # don't pass order: must run without any output, message or warning
  expect_silent(
    res_1 <- prop_test(dfr, resp ~ exp)
  )

  # pass 2-length order (condition output is recorded in the snapshot)
  expect_snapshot(
    res_2 <- prop_test(dfr, resp ~ exp, order = c("a", "b"))
  )

  # pass 3-length order (condition output is recorded in the snapshot)
  expect_snapshot(
    res_3 <- prop_test(dfr, resp ~ exp, order = c("a", "b", "c"))
  )

  # the `order` argument must not change the result here
  expect_equal(res_1, res_2)
  expect_equal(res_2, res_3)

  # output has exactly these three columns and matches stats::prop.test()
  expect_named(res_1, c("statistic", "chisq_df", "p_value"))
  expect_equal(res_1$statistic, res_old$statistic)
  expect_equal(res_1$chisq_df, res_old$parameter)
  expect_equal(res_1$p_value, res_old$p.value)
})

test_that("prop_test errors with >2 response levels", {
  # Seeded random data: a two-level explanatory variable and a three-level
  # response.
  set.seed(1)
  dfr <-
    tibble::tibble(
      exp = sample(c("a", "b"), 100, replace = TRUE),
      resp = sample(c("c", "d", "e"), 100, replace = TRUE)
    )

  # prop_test() must raise an error; the message is recorded in the snapshot
  expect_snapshot(
    error = TRUE,
    res_1 <- prop_test(dfr, resp ~ exp)
  )
})

test_that("prop_test z argument works as expected", {
  # The uncorrected chi-squared statistic must equal the square of the z
  # statistic for the same one-sample test.
  chisq_value <- prop_test(df, resp ~ NULL, p = .5, correct = FALSE)$statistic
  z_value <- prop_test(df, resp ~ NULL, p = .5, z = TRUE)$statistic

  # `eps` is not defined in this block; it is expected to be supplied by the
  # surrounding test setup.
  expect_equal(unname(chisq_value), z_value^2, tolerance = eps)
})

# Checks that each wrapper returns the same result whether a factor column is
# ordered or unordered.
# NOTE(review): the description reads "can handled" (likely meant "can
# handle"). It is left unchanged because testthat files snapshots under the
# test description, so renaming would require regenerating them.
test_that("wrappers can handled ordered factors", {
  # t_test(): ordered vs. unordered explanatory factor
  expect_equal(
    gss_tbl |>
      dplyr::mutate(sex = factor(sex, ordered = FALSE)) |>
      t_test(hours ~ sex, order = c("male", "female")),
    gss_tbl |>
      dplyr::mutate(sex = factor(sex, ordered = TRUE)) |>
      t_test(hours ~ sex, order = c("male", "female"))
  )

  # chisq_test(): ordered vs. unordered factor as the response
  expect_snapshot(
    ordered_t_1 <- gss_tbl |>
      dplyr::mutate(income = factor(income, ordered = TRUE)) |>
      chisq_test(income ~ partyid)
  )

  expect_snapshot(
    ordered_f_1 <- gss_tbl |>
      dplyr::mutate(income = factor(income, ordered = FALSE)) |>
      chisq_test(income ~ partyid)
  )

  expect_equal(ordered_t_1, ordered_f_1)

  # chisq_test(): ordered vs. unordered factor as the explanatory variable
  expect_snapshot(
    ordered_t_2 <- gss_tbl |>
      dplyr::mutate(income = factor(income, ordered = TRUE)) |>
      chisq_test(partyid ~ income)
  )

  expect_snapshot(
    ordered_f_2 <- gss_tbl |>
      dplyr::mutate(income = factor(income, ordered = FALSE)) |>
      chisq_test(partyid ~ income)
  )

  expect_equal(ordered_t_2, ordered_f_2)

  # prop_test(): ordered vs. unordered response factor (one-sample test)
  expect_equal(
    df |>
      dplyr::mutate(resp = factor(resp, ordered = TRUE)) |>
      prop_test(resp ~ NULL, p = .5),
    df |>
      dplyr::mutate(resp = factor(resp, ordered = FALSE)) |>
      prop_test(resp ~ NULL, p = .5)
  )
})

test_that("handles spaces in variable names (t_test)", {
  # Copy `hours` into a column whose name contains spaces, so the same test
  # can be run against both spellings.
  gss_ <- dplyr::mutate(tidyr::drop_na(gss, college), `h o u r s` = hours)

  # ---- formula interface ----
  formula_plain <- t_test(
    gss_,
    formula = hours ~ college,
    order = c("degree", "no degree"),
    alternative = "two-sided"
  )
  formula_spaced <- t_test(
    gss_,
    formula = `h o u r s` ~ college,
    order = c("degree", "no degree"),
    alternative = "two-sided"
  )
  expect_equal(formula_plain, formula_spaced)

  # ---- response / explanatory interface ----
  args_plain <- t_test(
    gss_,
    response = hours,
    explanatory = college,
    order = c("degree", "no degree"),
    alternative = "two-sided"
  )
  args_spaced <- t_test(
    gss_,
    response = `h o u r s`,
    explanatory = college,
    order = c("degree", "no degree"),
    alternative = "two-sided"
  )
  expect_equal(args_plain, args_spaced)
})

test_that("handles spaces in variable names (prop_test)", {
  # Work on a test-local copy of `df` with the response duplicated under a
  # name that contains spaces.
  df[["r e s p"]] <- df[["resp"]]

  # ---- formula interface ----
  formula_spaced <- prop_test(df, `r e s p` ~ exp, order = c("a", "b"))
  formula_plain <- prop_test(df, resp ~ exp, order = c("a", "b"))
  expect_equal(formula_spaced, formula_plain)

  # ---- response / explanatory interface ----
  args_spaced <- prop_test(
    df,
    response = `r e s p`,
    explanatory = exp,
    order = c("a", "b")
  )
  args_plain <- prop_test(
    df,
    response = resp,
    explanatory = exp,
    order = c("a", "b")
  )
  expect_equal(args_spaced, args_plain)
})