data("trans", package = "cat2cat")
data("occup", package = "cat2cat")
# Split the panel dataset to separate years
occup_2006 <- occup[occup$year == 2006, ]
occup_2008 <- occup_old <- occup[occup$year == 2008, ]
occup_2010 <- occup_new <- occup[occup$year == 2010, ]
occup_2012 <- occup[occup$year == 2012, ]
occup_simple_backward <- cat2cat::cat2cat(
data = list(
old = occup_old,
new = occup_new,
cat_var = "code",
time_var = "year"
),
mappings = list(
trans = trans,
direction = "backward"
)
)
testthat::test_that("Simple cat2cat example - two periods - backward", {
testthat::expect_true(all(names(occup_simple_backward) %in% c("old", "new")))
testthat::expect_true(is.data.frame(occup_simple_backward$old))
testthat::expect_true(is.data.frame(occup_simple_backward$new))
})
# Adding the dummy level to the mapping table for levels without the candidate
# The best to fill them manually with proper candidates, if possible
trans2 <- rbind(trans, data.frame(
old = "no_cat",
new = setdiff(occup_new$code, trans$new)
))
# Forward mapping for the case with two periods
occup_simple_forward <- cat2cat::cat2cat(
data = list(
old = occup_old,
new = occup_new,
cat_var = "code",
time_var = "year"
),
mappings = list(
trans = trans2,
direction = "forward"
)
)
testthat::test_that("Simple cat2cat example - two periods - forward", {
testthat::expect_true(all(names(occup_simple_forward) %in% c("old", "new")))
testthat::expect_true(is.data.frame(occup_simple_forward$old))
testthat::expect_true(is.data.frame(occup_simple_forward$new))
})
testthat::test_that("Table with number of replications for both mapping directions", {
# Build number of observations before and after unification table
res <- data.frame(
`before_mapping` = c(nrow(occup_old), nrow(occup_new)),
`after_mapping` = c(
paste0(
nrow(occup_simple_backward$old),
" (nonzero ",
sum(occup_simple_backward$old$wei_freq_c2c > 0),
")"
),
paste0(
nrow(occup_simple_forward$new),
" (nonzero ",
sum(occup_simple_forward$new$wei_freq_c2c > 0),
")"
)
)
)
rownames(res) <- c("old (backward)", "new (forward)")
res_tab <- knitr::kable(
res,
"latex",
caption = "Number of observations before and after unification."
)
testthat::expect_identical(
res_tab,
structure(
"\\begin{table}\n\n\\caption{Number of observations before and after unification.}\n\\centering\n\\begin{tabular}[t]{l|r|l}\n\\hline\n & before\\_mapping & after\\_mapping\\\\\n\\hline\nold (backward) & 17223 & 227662 (nonzero 163262)\\\\\n\\hline\nnew (forward) & 17323 & 18680 (nonzero 18517)\\\\\n\\hline\n\\end{tabular}\n\\end{table}",
format = "latex",
class = "knitr_kable"
)
)
})
# Set the seed as e.g., randomForest is used
set.seed(1234)
# Statistical models setup
# It could be shared for different iterations for this scenario
ml_setup <- list(
data = rbind(occup_2010, occup_2012),
cat_var = "code",
method = c("knn", "rf"),
features = c("age", "sex", "edu", "exp", "parttime", "salary"),
args = list(k = 10)
)
# Use the cat2cat procedure to map 2010 to 2008
occup_back_2008_2010 <- cat2cat::cat2cat(
data = list(
old = occup_2008,
new = occup_2010,
cat_var = "code",
time_var = "year"
),
mappings = list(trans = trans, direction = "backward"),
ml = ml_setup
)
# Use the cat2cat procedure to map 2008 to 2006
occup_back_2006_2008 <- cat2cat::cat2cat(
data = list(
old = occup_2006,
new = occup_back_2008_2010$old,
cat_var_new = "g_new_c2c",
cat_var_old = "code",
time_var = "year"
),
mappings = list(trans = trans, direction = "backward"),
ml = ml_setup
)
# Select a proper dataset for each year
o_2006 <- occup_back_2006_2008$old
o_2008 <- occup_back_2008_2010$old # or occup_back_2006_2008$new
o_2010 <- occup_back_2008_2010$new
# Add default cat2cat procedure columns to not processed dataset
o_2012 <- cat2cat::dummy_c2c(occup_2012, cat_var = "code", ml = c("knn", "rf"))
# Combine datasets
final_data_back_ml <- do.call(rbind, list(o_2006, o_2008, o_2010, o_2012))
testthat::test_that("Backward mapping, with four periods, one mapping table, and ml models", {
testthat::expect_true(is.data.frame(final_data_back_ml))
testthat::expect_identical(
dim(final_data_back_ml),
c(475543L, 19L)
)
testthat::expect_identical(
colnames(final_data_back_ml),
c(
"id", "age", "sex", "edu", "exp", "district", "parttime", "salary",
"code", "multiplier", "year", "code4", "index_c2c", "g_new_c2c",
"wei_freq_c2c", "rep_c2c", "wei_naive_c2c", "wei_knn_c2c", "wei_rf_c2c"
)
)
})
testthat::test_that("Correlations between ml methods", {
corr_separate <-
dplyr::filter(
dplyr::do(
dplyr::select(
dplyr::select(
dplyr::group_by(
dplyr::filter(final_data_back_ml, rep_c2c >= 10),
index_c2c
),
matches("wei.*c2c")
),
-wei_naive_c2c
),
corr = tryCatch(
cor(.[, -1]),
error = function(e) NA, warning = function(w) NA
)
), !any(is.na(corr))
)
corr_table <- Reduce("+", corr_separate$corr) / length(corr_separate$corr)
testthat::expect_true(inherits(corr_table, "matrix"))
testthat::expect_equal(dim(corr_table), c(3L, 3L))
})
testthat::test_that("Counts for a few random levels in the unified variable over time", {
data_count_plot <- cat2cat::prune_c2c(df = final_data_back_ml)
data_count_plot2 <-
dplyr::summarise_all(
dplyr::group_by(
dplyr::select(
dplyr::mutate(
dplyr::filter(
data_count_plot,
g_new_c2c %in% c("261102", "325502", "352111")
),
g_new_c2c_nams = forcats::fct_recode(
g_new_c2c,
`OHS Inspector` = "325502",
`Sound Engineer` = "352111",
`Prosecutor` = "261102"
)
),
"wei_freq_c2c", "wei_rf_c2c", "year", "g_new_c2c_nams", -"g_new_c2c"
),
g_new_c2c_nams, year
),
sum
)
# Build counts across a 3 random categories table
counts_base <-
tidyr::pivot_wider(
data_count_plot2,
names_from = "year",
values_from = c("wei_freq_c2c", "wei_rf_c2c"),
names_sep = " "
)
res_tab <- dplyr::select(
counts_base,
g_new_c2c_nams,
`rf 2006` = `wei_rf_c2c 2006`,
`freq 2006` = `wei_freq_c2c 2006`,
`rf 2008` = `wei_rf_c2c 2008`,
`freq 2008` = `wei_freq_c2c 2008`,
`2010` = `wei_rf_c2c 2010`,
`2012` = `wei_rf_c2c 2012`
)
testthat::expect_true(inherits(res_tab, "data.frame"))
testthat::expect_equal(dim(res_tab), c(3L, 7L))
})
# Mincerian-like regression formula
formula_micer <- I(log(salary)) ~ sex + parttime + edu + exp + I(exp**2)
testthat::test_that("Regression - neutral impact of the unified variable", {
# Fit a weighted linear regression on replicated
lms_replicated <- lm(
formula = formula_micer,
data = final_data_back_ml,
weights = multiplier * wei_freq_c2c
)
# Adjust size of stds as the replication process enlarges degrees of freedom
lms_replicated$df.residual <- nrow(occup) - length(lms_replicated$assign)
# Fit a linear regression on original dataset
lms_original <- lm(
formula = formula_micer,
data = occup,
weights = multiplier
)
summary_replicated <- suppressWarnings(summary(lms_replicated))
summary_original <- summary(lms_original)
testthat::expect_equal(summary_replicated$coefficients, summary_original$coefficients)
testthat::expect_equal(summary_replicated$r.squared, summary_original$r.squared)
})
testthat::test_that("Regression for each level in the unified variable", {
# Separate regression for each occupational group under newest classification
regression_sep <-
dplyr::filter(
dplyr::do(
dplyr::filter(
dplyr::mutate(
dplyr::group_by(
cat2cat::prune_c2c(df = final_data_back_ml, method = "nonzero"),
g_new_c2c
),
n = dplyr::n()
),
n >= 30
),
lm = tryCatch(
lm(formula_micer, .data, weights = multiplier * wei_freq_c2c),
error = function(e) NULL
)
),
!is.null(lm)
)
# Regression results for the first group
summary_group <- summary(regression_sep$lm[[1]])
testthat::expect_equal(
round(summary_group$coefficients, 2),
structure(
c(
9.5, 0.52, 1.87, -0.25, 0, 0, 0.46, 0.1, 0.5, 0.07,
0.02, 0, 20.63, 5.17, 3.77, -3.55, 0.08, 0.29, 0, 0, 0, 0, 0.94,
0.77
),
dim = c(6L, 4L),
dimnames = list(
c("(Intercept)", "sexTRUE", "parttime", "edu", "exp", "I(exp^2)"),
c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
)
)
)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.