tests/testthat/test-dfl_decompose.R

# test_that("Replicate FFL 2011, table 5, p. 69", {
#
#   # Full data set
#   load("data-raw/men8305_full.rda")
#   men8305_full$weights <- men8305_full$weights/sum(men8305_full$weights) * length(men8305_full$weights)
#
#   flf_model <- log(wage) ~ union*(education + experience) + education*experience
#
#   # Reweighting sample from 1983-85
#   flf_male_inequality  <- dfl_decompose(flf_model,
#                                    data = men8305_full,
#                                    weights = weights,
#                                    group = year)
#
#   FFL_results <- data.frame(effect=c("Observed difference", "Composition effect", "Structure effect"),
#                             p90p10=c(0.1091, 0.0756, 0.0336),
#                             p90p50=c(0.1827, 0.0191, 0.1637),
#                             p50p10=c(-0.0736, 0.0565, -0.1301),
#                             variance = c(0.0617, 0.0208, 0.0408))[,-1]
#   FFL_results <- t(as.matrix(FFL_results))
#   colnames(FFL_results) <- c("Observed difference", "Composition effect", "Structure effect")
#   estimated_statistics <- flf_male_inequality$decomposition_other_statistics[c(4,5,6,2), c(2, 4, 3)]
#   estimated_statistics <- as.matrix(estimated_statistics)
#   rownames(estimated_statistics) <- rownames(FFL_results)
#
#   expect_equal(FFL_results, estimated_statistics, tolerance = 0.01)
# })






test_that("dfl_decompose() does not throw an error", {
  set.seed(89342395)

  model_sequential <- log(wage) ~ union + experience + education | experience + education | education

  decompose_results <- dfl_decompose(model_sequential,
    data = men8305[1:1000, ],
    weights = weights,
    group = year
  )

  testthat::expect_error(decompose_results, NA)
})


test_that("dfl_decompose() does not return fitted models if required", {
  set.seed(89342395)

  model_sequential <- log(wage) ~ union + experience + education | experience + education | education

  decompose_results <- dfl_decompose(model_sequential,
    data = men8305[1:1000, ],
    weights = weights,
    group = year,
    return_model = FALSE
  )

  testthat::expect_error(decompose_results, NA)
})

test_that("bootstrapping dfl_decompose() does not throw an error", {
  flf_model <- log(wage) ~ union + experience + education

  set.seed(123)
  decompose_results_boot_single_core <- dfl_decompose(flf_model,
    data = men8305[1:1000, ],
    weights = weights,
    group = year,
    bootstrap = TRUE,
    bootstrap_iterations = 10,
    cores = 1
  )
  # # Does not work in check()
  # set.seed(123)
  # decompose_results_boot_parallel  <- dfl_decompose(flf_model,
  #                                         data = men8305[1:1000, ],
  #                                         weights = weights,
  #                                         group = year,
  #                                         bootstrap = TRUE,
  #                                         bootstrap_iterations = 10,
  #                                         cores = 2)

  testthat::expect_error(decompose_results_boot_single_core, NA)
})

test_that("dfl_decompose_estimate() does not throw an error", {
  set.seed(89342395)

  data_sample <- men8305[sample(1:nrow(men8305), size = 10000), ]
  formula <- Formula::as.Formula(log(wage) ~ union * (education + experience) + education * experience)
  data_used <- model.frame(formula, data_sample, weights = weights, group = year)
  dep_var <- model.response(data_used, "numeric")

  weights <- model.weights(data_used)
  group_variable_name <- "year"
  group_variable <- data_used[, ncol(data_used)]
  names(data_used)[ncol(data_used)] <- "group_variable"

  reference_0 <- TRUE
  reference_group <- ifelse(reference_0, 0, 1)

  statistics <- c("quantiles", "mean", "variance", "gini", "iq_range_p90_p10", "iq_range_p90_p50", "iq_range_p50_p10")
  probs <- c(1:9) / 10
  trimming <- TRUE
  trimming_threshold <- 0.00005
  right_to_left <- TRUE
  method <- "logit"

  decompose_results <- dfl_decompose_estimate(
    formula = formula,
    dep_var = dep_var,
    data_used = data_used,
    weights = weights,
    group_variable = group_variable,
    reference_group = reference_group,
    method = method,
    estimate_statistics = TRUE,
    statistics = statistics,
    custom_statistic_function = NULL,
    estimate_normalized_difference = TRUE,
    probs = probs,
    right_to_left = right_to_left,
    trimming = trimming,
    trimming_threshold = trimming_threshold,
    return_model = TRUE
  )

  testthat::expect_error(decompose_results, NA)
})


test_that("fit_and_predict_probabilities works properly with fastglm estimation", {
  set.seed(89342395)

  data_sample <- men8305[sample(1:nrow(men8305), size = 10000), ]
  formula <- Formula::as.Formula(log(wage) ~ union + education + experience)
  data_used <- model.frame(formula, data_sample, weights = weights, group = year)
  dep_var <- model.response(data_used, "numeric")

  weights <- model.weights(data_used)
  group_variable_name <- "year"
  group_variable <- data_used[, ncol(data_used)]
  names(data_used)[ncol(data_used)] <- "group_variable"


  formula_new <- group_variable ~ union + education + experience

  method <- "fastglm"
  fits_fastglm <- fit_and_predict_probabilities(formula_new,
    data_used,
    weights,
    method = method,
    return_model = TRUE,
    newdata = NULL
  )

  method <- "logit"
  fits_logit <- fit_and_predict_probabilities(formula_new,
    data_used,
    weights,
    method = method,
    return_model = TRUE,
    newdata = NULL
  )

  testthat::expect_equal(fits_fastglm[[1]][1:30], fits_logit[[1]][1:30], tolerance = 0.0001)
})



test_that("fit_and_predict_probabilities works properly with ranger random forests estimation", {
  set.seed(89342395)

  data_sample <- men8305[sample(1:nrow(men8305), size = 10000), ]
  formula <- Formula::as.Formula(log(wage) ~ union + education + experience)
  data_used <- model.frame(formula, data_sample, weights = weights, group = year)
  dep_var <- model.response(data_used, "numeric")

  weights <- model.weights(data_used)
  group_variable_name <- "year"
  group_variable <- data_used[, ncol(data_used)]
  names(data_used)[ncol(data_used)] <- "group_variable"

  method <- "random_forest"
  formula_new <- group_variable ~ union + education + experience

  fits <- fit_and_predict_probabilities(formula_new,
    data_used,
    weights,
    method = method,
    return_model = TRUE,
    newdata = NULL
  )

  testthat::expect_error(fits, NA)
})


test_that("dfl_decompose_estimate() works properly with ranger random forests estimation", {
  set.seed(89342395)

  data_sample <- men8305[sample(1:nrow(men8305), size = 10000), ]
  formula <- Formula::as.Formula(log(wage) ~ union + education + experience)
  data_used <- model.frame(formula, data_sample, weights = weights, group = year)
  dep_var <- model.response(data_used, "numeric")

  weights <- model.weights(data_used)
  group_variable_name <- "year"
  group_variable <- data_used[, ncol(data_used)]
  names(data_used)[ncol(data_used)] <- "group_variable"

  reference_0 <- TRUE
  reference_group <- ifelse(reference_0, 0, 1)

  statistics <- c("quantiles", "mean", "variance", "gini", "iq_range_p90_p10", "iq_range_p90_p50", "iq_range_p50_p10")
  probs <- c(1:9) / 10
  trimming <- FALSE
  trimming_threshold <- NULL
  right_to_left <- TRUE
  method <- "random_forest"

  decompose_results <- dfl_decompose_estimate(
    formula = formula,
    dep_var = dep_var,
    data_used = data_used,
    weights = weights,
    group_variable = group_variable,
    reference_group = reference_group,
    method = method,
    estimate_statistics = TRUE,
    statistics = statistics,
    custom_statistic_function = NULL,
    estimate_normalized_difference = TRUE,
    probs = probs,
    right_to_left = right_to_left,
    trimming = trimming,
    trimming_threshold = trimming_threshold,
    return_model = TRUE
  )

  testthat::expect_error(decompose_results, NA)
})



test_that("dfl_decompose() does not throw an error without estimating statistics", {
  set.seed(89342395)
  formula <- log(wage) ~ age + central_city + msa + region + black +
    hispanic + education + afqt + family_responsibility + years_worked_civilian +
    years_worked_military + part_time + industry

  decompose_results <- dfl_decompose(
    formula = formula,
    data = nlys00[1:300, ],
    weights = runif(nrow(nlys00[1:300, ]), 0.5, 1.5),
    group = female,
    estimate_statistics = FALSE
  )

  testthat::expect_error(decompose_results, NA)
})



test_that("select_observations_to_be_trimmed() trimms observations as intended", {
  set.seed(123)
  expected_trimmed_observations <- c(1:5, 101:102)
  reweighting_factor <- rep(1 / 100, 200)
  group_variable <- as.factor(rep(c("no", "yes"), each = 100))
  group <- 0
  reweighting_factor[expected_trimmed_observations] <- 20
  trimming_threshold <- NULL

  trimmed_observations <- select_observations_to_be_trimmed(
    reweighting_factor,
    group_variable,
    group,
    trimming_threshold
  )

  testthat::expect_equal(
    trimmed_observations,
    expected_trimmed_observations
  )
})

test_that("dfl_decompose() trimms estimated factors as expected", {
  set.seed(123)
  data_used <- data.frame(
    x = c(
      c(1, 1, rep(2, 98)),
      c(rep(1, 90), rep(2, 10))
    ),
    group = rep(c(0, 1), each = 100)
  )
  data_used$y <- data_used$x + data_used$group + rnorm(200, 0, 0.5)
  data_used$x <- as.factor(data_used$x)
  levels(data_used$x) <- c("A", "B")
  data_used$group <- as.factor(data_used$group)
  expected_trimmed_observations <- which(data_used$x == "A")

  # p_g_given_X <- data_used %>%
  #   dplyr::group_by(x) %>%
  #   dplyr::summarise(p = mean(group == "1")) %>%
  #   dplyr::pull(p)
  #
  # data_used$psi <- p_g_given_X[1]/(1-p_g_given_X[1])
  # data_used[which(data_used$x == "B"), "psi"] <-   p_g_given_X[2]/(1-p_g_given_X[2])
  #

  # reweighting_factor <-   data_used$psi
  # group_variable <- data_used$group
  # group <- 0
  # trimming_threshold = NULL
  #
  # trimmed_observations <- select_observations_to_be_trimmed(reweighting_factor,
  #                                                           group_variable,
  #                                                           group,
  #                                                           trimming_threshold)
  # decompose_results_untrimmed <- dfl_decompose(y ~ x,
  #                                   data_used,
  #                                   group = group)

  decompose_results_trimmed <- dfl_decompose(y ~ x,
    data_used,
    group = group,
    trimming = TRUE
  )

  testthat::expect_equal(
    decompose_results_trimmed$trimmed_observations,
    expected_trimmed_observations
  )
})


test_that("get_distributional_statistics() does not throw an error with custom_statistic_function", {
  data_sample <- men8305
  formula <- Formula::as.Formula(log(wage) ~ union * (education + experience) + education * experience)
  data_used <- model.frame(formula, data_sample, weights = weights, group = year)
  dep_var <- model.response(data_used, "numeric")

  weights <- model.weights(data_used)
  group_variable_name <- "year"
  group_variable <- data_used[, ncol(data_used)]
  names(data_used)[ncol(data_used)] <- "group_variable"

  reference_0 <- TRUE
  reference_group <- ifelse(reference_0, 0, 1)

  statistics <- c("quantiles", "mean", "variance", "gini", "iq_range_p90_p10", "iq_range_p90_p50", "iq_range_p50_p10")
  probs <- c(1:9) / 10
  log_transformed <- TRUE

  top_share <- function(dep_var,
                        weights,
                        top_percent = 0.1) {
    threshold <- Hmisc::wtd.quantile(dep_var, weights = weights, probs = 1 - top_percent)
    share <- sum(weights[which(dep_var > threshold)] * dep_var[which(dep_var > threshold)]) / sum(weights * dep_var)
    return(share)
  }

  direct_estimation <- top_share(dep_var[which(group_variable == "2003-2005")], weights[which(group_variable == "2003-2005")], 0.1)

  nu1 <- get_distributional_statistics(
    dep_var = dep_var,
    weights = weights,
    group_variable = group_variable,
    group = 1,
    statistics = statistics,
    custom_statistic_function = top_share,
    probs = probs,
    log_transformed = log_transformed
  )

  testthat::expect_equal(direct_estimation, as.numeric(nu1[length(nu1)]), tolerance = 0.000001)
})




# test_that("glm and surveyglm return the same coefficients", {
#   set.seed(123)
#   data("nlys00")
#   formula <- female ~ education + afqt + industry + family_responsibility + part_time
#   data_used <- get_all_vars(formula, nlys00)
#   data_used$weights <- runif(nrow(data_used), 0.5, 1.5)
#
#   design <- survey::svydesign(~0,
#                               data=data_used,
#                               weights=~weights)
#   model_fit_surveyglm <- survey::svyglm(formula,
#                                         data=data_used,
#                                         design=design,family=quasibinomial(link="logit"))
#
#   model_fit_glm <- glm(formula, data = data_used, weights = weights, family = binomial(link = "logit"))
#   model_fit_quasiglm <- glm(formula, data = data_used, weights = weights, family = quasibinomial(link = "logit"))
#
#   sglm <- summary(model_fit_glm)
#   squasiglm <- summary(model_fit_quasiglm)
#   ssurveryglm <- summary(model_fit_surveyglm)
#
#   cbind(sglm$coefficients[,2],
#         squasiglm$coefficients[,2],
#         ssurveryglm$coefficients[,2])
#
#   cbind(coef(model_fit_glm), coef(model_fit_surveyglm))
#   testthat::expect_equal(coef(model_fit_glm),
#                          coef(model_fit_surveyglm),
#                          tolerance = 0.000001)
#
#   cbind(coef(model_fit_quasiglm), coef(model_fit_surveyglm))
#   testthat::expect_equal(coef(model_fit_quasiglm),
#                          coef(model_fit_surveyglm),
#                          tolerance = 0.000001)
#
# })


# test_that("dfl_decompose() replicates Table 5, p. 67, in FLF 2011 Handbook Chapter", {
#
#   load("data-raw/men8305_full.rda")
#   men8305_full$weights <- men8305_full$weights/sum(men8305_full$weights) * length(men8305_full$weights)
#   flf_model <- log(wage) ~ union*(education + experience) + education*experience
#
#   # Replicate statistics in table 5, p.67, in in FLF (2011)
#   flf_male_inequality_table_5  <- dfl_decompose(flf_model,
#                                            data = men8305_full,
#                                            weights = weights,
#                                            group = year,
#                                            reference_0 = TRUE,
#                                            statistics = c("iq_range_p90_p10",
#                                                            "iq_range_p90_p50",
#                                                            "iq_range_p50_p10",
#                                                             "variance"))
#   results_dfl_decompose <- flf_male_inequality_table_5$decomposition_other_statistics
#   published_results_FLF_table_5 <- data.frame(statistic = results_dfl_decompose$statistic,
#                                               `Observed difference` = c(0.0617, 0.1091, 0.1827, -0.0736),
#                                               `Structure effect` = c(0.0408, 0.0336, 0.1637, -0.1301),
#                                               `Composition effect` = c(0.0208, 0.0756, 0.0191, 0.0565))
#   rownames(results_dfl_decompose) <- rownames(published_results_FLF_table_5) <- 1:4
#   names(published_results_FLF_table_5) <- names(results_dfl_decompose)
#   testthat::expect_equal(results_dfl_decompose,
#                          published_results_FLF_table_5,
#                          tolerance = 0.0075)
# })

Try the ddecompose package in your browser

Any scripts or data that you put into this service are public.

ddecompose documentation built on May 29, 2024, 5:36 a.m.