catTOcat: Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

data("trans", package = "cat2cat")
data("occup", package = "cat2cat")
# Split the panel dataset to separate years
occup_2006 <- occup[occup$year == 2006, ]
occup_2008 <- occup_old <- occup[occup$year == 2008, ]
occup_2010 <- occup_new <- occup[occup$year == 2010, ]
occup_2012 <- occup[occup$year == 2012, ]

occup_simple_backward <- cat2cat::cat2cat(
  data = list(
    old = occup_old,
    new = occup_new,
    cat_var = "code",
    time_var = "year"
  ),
  mappings = list(
    trans = trans,
    direction = "backward"
  )
)

testthat::test_that("Simple cat2cat example - two periods - backward", {
  testthat::expect_true(all(names(occup_simple_backward) %in% c("old", "new")))
  testthat::expect_true(is.data.frame(occup_simple_backward$old))
  testthat::expect_true(is.data.frame(occup_simple_backward$new))
})

# Adding the dummy level to the mapping table for levels without the candidate
# The best to fill them manually with proper candidates, if possible
trans2 <- rbind(trans, data.frame(
  old = "no_cat",
  new = setdiff(occup_new$code, trans$new)
))
# Forward mapping for the case with two periods
occup_simple_forward <- cat2cat::cat2cat(
  data = list(
    old = occup_old,
    new = occup_new,
    cat_var = "code",
    time_var = "year"
  ),
  mappings = list(
    trans = trans2,
    direction = "forward"
  )
)

testthat::test_that("Simple cat2cat example - two periods - forward", {
  testthat::expect_true(all(names(occup_simple_forward) %in% c("old", "new")))
  testthat::expect_true(is.data.frame(occup_simple_forward$old))
  testthat::expect_true(is.data.frame(occup_simple_forward$new))
})

testthat::test_that("Table with number of replications for both mapping  directions", {
  # Build number of observations before and after unification table
  res <- data.frame(
    `before_mapping` = c(nrow(occup_old), nrow(occup_new)),
    `after_mapping` = c(
      paste0(
        nrow(occup_simple_backward$old),
        " (nonzero ",
        sum(occup_simple_backward$old$wei_freq_c2c > 0),
        ")"
      ),
      paste0(
        nrow(occup_simple_forward$new),
        " (nonzero ",
        sum(occup_simple_forward$new$wei_freq_c2c > 0),
        ")"
      )
    )
  )
  rownames(res) <- c("old (backward)", "new (forward)")
  res_tab <- knitr::kable(
    res,
    "latex",
    caption = "Number of observations before and after unification."
  )
  testthat::expect_identical(
    res_tab,
    structure(
      "\\begin{table}\n\n\\caption{Number of observations before and after unification.}\n\\centering\n\\begin{tabular}[t]{l|r|l}\n\\hline\n  & before\\_mapping & after\\_mapping\\\\\n\\hline\nold (backward) & 17223 & 227662 (nonzero 163262)\\\\\n\\hline\nnew (forward) & 17323 & 18680 (nonzero 18517)\\\\\n\\hline\n\\end{tabular}\n\\end{table}",
      format = "latex",
      class = "knitr_kable"
    )
  )
})

# Set the seed as e.g., randomForest is used
set.seed(1234)
# Statistical models setup
# It could be shared for different iterations for this scenario
ml_setup <- list(
  data = rbind(occup_2010, occup_2012),
  cat_var = "code",
  method = c("knn", "rf"),
  features = c("age", "sex", "edu", "exp", "parttime", "salary"),
  args = list(k = 10)
)
# Use the cat2cat procedure to map 2010 to 2008
occup_back_2008_2010 <- cat2cat::cat2cat(
  data = list(
    old = occup_2008,
    new = occup_2010,
    cat_var = "code",
    time_var = "year"
  ),
  mappings = list(trans = trans, direction = "backward"),
  ml = ml_setup
)
# Use the cat2cat procedure to map 2008 to 2006
occup_back_2006_2008 <- cat2cat::cat2cat(
  data = list(
    old = occup_2006,
    new = occup_back_2008_2010$old,
    cat_var_new = "g_new_c2c",
    cat_var_old = "code",
    time_var = "year"
  ),
  mappings = list(trans = trans, direction = "backward"),
  ml = ml_setup
)
# Select a proper dataset for each year
o_2006 <- occup_back_2006_2008$old
o_2008 <- occup_back_2008_2010$old # or occup_back_2006_2008$new
o_2010 <- occup_back_2008_2010$new
# Add default cat2cat procedure columns to not processed dataset
o_2012 <- cat2cat::dummy_c2c(occup_2012, cat_var = "code", ml = c("knn", "rf"))
# Combine datasets
final_data_back_ml <- do.call(rbind, list(o_2006, o_2008, o_2010, o_2012))

testthat::test_that("Backward mapping, with four periods, one mapping table, and ml models", {
  testthat::expect_true(is.data.frame(final_data_back_ml))
  testthat::expect_identical(
    dim(final_data_back_ml),
    c(475543L, 19L)
  )
  testthat::expect_identical(
    colnames(final_data_back_ml),
    c(
      "id", "age", "sex", "edu", "exp", "district", "parttime", "salary",
      "code", "multiplier", "year", "code4", "index_c2c", "g_new_c2c",
      "wei_freq_c2c", "rep_c2c", "wei_naive_c2c", "wei_knn_c2c", "wei_rf_c2c"
    )
  )
})

testthat::test_that("Correlations between ml methods", {
  corr_separate <-
    dplyr::filter(
      dplyr::do(
        dplyr::select(
          dplyr::select(
            dplyr::group_by(
              dplyr::filter(final_data_back_ml, rep_c2c >= 10),
              index_c2c
            ),
            matches("wei.*c2c")
          ),
          -wei_naive_c2c
        ),
        corr = tryCatch(
          cor(.[, -1]),
          error = function(e) NA, warning = function(w) NA
        )
      ), !any(is.na(corr))
    )
  corr_table <- Reduce("+", corr_separate$corr) / length(corr_separate$corr)

  testthat::expect_true(inherits(corr_table, "matrix"))
  testthat::expect_equal(dim(corr_table), c(3L, 3L))
})

testthat::test_that("Counts for a few random levels in the unified variable over time", {
  data_count_plot <- cat2cat::prune_c2c(df = final_data_back_ml)
  data_count_plot2 <-
    dplyr::summarise_all(
      dplyr::group_by(
        dplyr::select(
          dplyr::mutate(
            dplyr::filter(
              data_count_plot,
              g_new_c2c %in% c("261102", "325502", "352111")
            ),
            g_new_c2c_nams = forcats::fct_recode(
              g_new_c2c,
              `OHS Inspector` = "325502",
              `Sound Engineer` = "352111",
              `Prosecutor` = "261102"
            )
          ),
          "wei_freq_c2c", "wei_rf_c2c", "year", "g_new_c2c_nams", -"g_new_c2c"
        ),
        g_new_c2c_nams, year
      ),
      sum
    )
  # Build counts across a 3 random categories table
  counts_base <-
    tidyr::pivot_wider(
      data_count_plot2,
      names_from = "year",
      values_from = c("wei_freq_c2c", "wei_rf_c2c"),
      names_sep = " "
    )
  res_tab <- dplyr::select(
    counts_base,
    g_new_c2c_nams,
    `rf 2006` = `wei_rf_c2c 2006`,
    `freq 2006` = `wei_freq_c2c 2006`,
    `rf 2008` = `wei_rf_c2c 2008`,
    `freq 2008` = `wei_freq_c2c 2008`,
    `2010` = `wei_rf_c2c 2010`,
    `2012` = `wei_rf_c2c 2012`
  )

  testthat::expect_true(inherits(res_tab, "data.frame"))
  testthat::expect_equal(dim(res_tab), c(3L, 7L))
})

# Mincerian-like regression formula
formula_micer <- I(log(salary)) ~ sex + parttime + edu + exp + I(exp**2)

testthat::test_that("Regression - neutral impact of the unified variable", {
  # Fit a weighted linear regression on replicated
  lms_replicated <- lm(
    formula = formula_micer,
    data = final_data_back_ml,
    weights = multiplier * wei_freq_c2c
  )
  # Adjust size of stds as the replication process enlarges degrees of freedom
  lms_replicated$df.residual <- nrow(occup) - length(lms_replicated$assign)
  # Fit a linear regression on original dataset
  lms_original <- lm(
    formula = formula_micer,
    data = occup,
    weights = multiplier
  )

  summary_replicated <- suppressWarnings(summary(lms_replicated))
  summary_original <- summary(lms_original)

  testthat::expect_equal(summary_replicated$coefficients, summary_original$coefficients)
  testthat::expect_equal(summary_replicated$r.squared, summary_original$r.squared)
})

testthat::test_that("Regression for each level in the unified variable", {
  # Separate regression for each occupational group under newest classification
  regression_sep <-
    dplyr::filter(
      dplyr::do(
        dplyr::filter(
          dplyr::mutate(
            dplyr::group_by(
              cat2cat::prune_c2c(df = final_data_back_ml, method = "nonzero"),
              g_new_c2c
            ),
            n = dplyr::n()
          ),
          n >= 30
        ),
        lm = tryCatch(
          lm(formula_micer, .data, weights = multiplier * wei_freq_c2c),
          error = function(e) NULL
        )
      ),
      !is.null(lm)
    )
  # Regression results for the first group
  summary_group <- summary(regression_sep$lm[[1]])

  testthat::expect_equal(
    round(summary_group$coefficients, 2),
    structure(
      c(
        9.5, 0.52, 1.87, -0.25, 0, 0, 0.46, 0.1, 0.5, 0.07,
        0.02, 0, 20.63, 5.17, 3.77, -3.55, 0.08, 0.29, 0, 0, 0, 0, 0.94,
        0.77
      ),
      dim = c(6L, 4L),
      dimnames = list(
        c("(Intercept)", "sexTRUE", "parttime", "edu", "exp", "I(exp^2)"),
        c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
      )
    )
  )
})

Polkas/catTOcat documentation built on Jan. 26, 2024, 7:10 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Polkas/catTOcat
Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

tests/testthat/test-softwarex_paper.R
In Polkas/catTOcat: Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

R Package Documentation

Browse R Packages

We want your feedback!

Polkas/catTOcat Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

tests/testthat/test-softwarex_paper.R In Polkas/catTOcat: Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

R Package Documentation

Browse R Packages

We want your feedback!

Polkas/catTOcat
Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

tests/testthat/test-softwarex_paper.R
In Polkas/catTOcat: Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset