cat2cat.R
In cat2cat: Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(size = "tiny")
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)

## ---- message=FALSE, warning=FALSE--------------------------------------------
library("cat2cat")
library("dplyr")

data("occup", package = "cat2cat")
data("trans", package = "cat2cat")

occup_2006 <- occup[occup$year == 2006, ]
occup_2008 <- occup_old <- occup[occup$year == 2008, ]
occup_2010 <- occup_new <- occup[occup$year == 2010, ]
occup_2012 <- occup[occup$year == 2012, ]

## -----------------------------------------------------------------------------
## cat2cat
occup_simple <- cat2cat(
  data = list(
    old = occup_old, new = occup_new, cat_var = "code", time_var = "year"
  ),
  mappings = list(trans = trans, direction = "backward")
)

## with informative features it might be usefull to run ml algorithm
## currently knn, lda and rf (randomForest), could be a few at once
## where probability will be assessed as fraction of closest points.
occup_2 <- cat2cat(
  data = list(
    old = occup_old, new = occup_new,
    cat_var = "code", time_var = "year"
  ),
  mappings = list(trans = trans, direction = "backward"),
  ml = list(
    data = occup_new,
    cat_var = "code",
    method = "knn",
    features = c("age", "sex", "edu", "exp", "parttime", "salary"),
    args = list(k = 10)
  )
)

## -----------------------------------------------------------------------------
# summary_plot
plot_c2c(occup_2$old, type = c("both"))

## -----------------------------------------------------------------------------
# mix of methods
occup_2_mix <- cat2cat(
  data = list(
    old = occup_old, new = occup_new,
    cat_var = "code", time_var = "year"
  ),
  mappings = list(trans = trans, direction = "backward"),
  ml = list(
    data = occup_new,
    cat_var = "code",
    method = c("knn", "rf", "lda"),
    features = c("age", "sex", "edu", "exp", "parttime", "salary"),
    args = list(k = 10, ntree = 50)
  )
)
# cross all methods and subset one highest probability category for each subject
occup_old_mix_highest1 <- occup_2_mix$old %>%
  cross_c2c(.) %>%
  prune_c2c(., column = "wei_cross_c2c", method = "highest1")

## -----------------------------------------------------------------------------
# correlation between ml models and simple fequencies
occup_2_mix$old %>%
  select(wei_knn_c2c, wei_rf_c2c, wei_lda_c2c, wei_freq_c2c) %>%
  cor()

## -----------------------------------------------------------------------------
# from 2010 to 2008
occup_back_2008_2010 <- cat2cat(
  data = list(
    old = occup_2008, new = occup_2010,
    cat_var = "code", time_var = "year"
  ),
  mappings = list(trans = trans, direction = "backward")
)

# optional, give more control
# the counts could be any of wei_* or their combination
freqs_df <-
  occup_back_2008_2010$old[, c("g_new_c2c", "wei_freq_c2c")] %>%
  group_by(g_new_c2c) %>%
  summarise(counts = round(sum(wei_freq_c2c)))

# from 2008 to 2006
occup_back_2006_2008 <- cat2cat(
  data = list(
    old = occup_2006,
    new = occup_back_2008_2010$old,
    cat_var_new = "g_new_c2c",
    cat_var_old = "code",
    time_var = "year"
  ),
  mappings = list(
    trans = trans, direction = "backward",
    freqs_df = freqs_df
  )
)

o_2006_new <- occup_back_2006_2008$old
# or occup_back_2006_2008$new
o_2008_new <- occup_back_2008_2010$old
o_2010_new <- occup_back_2008_2010$new
# use ml argument when applied ml models
o_2012_new <- dummy_c2c(occup_2012, "code")

final_data_back <- do.call(
  rbind,
  list(o_2006_new, o_2008_new, o_2010_new, o_2012_new)
)

## -----------------------------------------------------------------------------
# We persist the number of observations
counts_new <- final_data_back %>%
  cross_c2c() %>%
  group_by(year) %>%
  summarise(
    n = as.integer(round(sum(wei_freq_c2c))),
    n2 = as.integer(round(sum(wei_cross_c2c)))
  )

counts_old <- occup %>%
  group_by(year) %>%
  summarise(n = n(), n2 = n(), .groups = "drop")

identical(counts_new, counts_old)

# counts per each level
counts_per_level <- final_data_back %>%
  group_by(year, g_new_c2c) %>%
  summarise(n = sum(wei_freq_c2c), .groups = "drop") %>%
  arrange(g_new_c2c, year)

## -----------------------------------------------------------------------------
trans2 <- rbind(
  trans,
  data.frame(
    old = "no_cat",
    new = setdiff(
      c(occup_2010$code, occup_2012$code),
      trans$new
    )
  )
)

## -----------------------------------------------------------------------------
# from 2008 to 2010
occup_for_2008_2010 <- cat2cat(
  data = list(
    old = occup_2008, new = occup_2010,
    cat_var = "code", time_var = "year"
  ),
  mappings = list(trans = trans2, direction = "forward")
)

# optional, give more control
# the counts could be any of wei_* or their combination
freqs_df <-
  occup_for_2008_2010$new[, c("g_new_c2c", "wei_freq_c2c")] %>%
  group_by(g_new_c2c) %>%
  summarise(counts = round(sum(wei_freq_c2c)))

# from2010 to 2012
occup_for_2010_2012 <- cat2cat(
  data = list(
    old = occup_for_2008_2010$new,
    new = occup_2012,
    cat_var_old = "g_new_c2c",
    cat_var_new = "code",
    time_var = "year"
  ),
  mappings = list(
    trans = trans2, direction = "forward",
    freqs_df = freqs_df
  )
)

# use ml argument when applied ml models
o_2006_new <- dummy_c2c(occup_2006, "code")
o_2008_new <- occup_for_2008_2010$old
o_2010_new <- occup_for_2008_2010$new # or occup_for_2010_2012$old
o_2012_new <- occup_for_2010_2012$new

final_data_for <- do.call(
  rbind,
  list(o_2006_new, o_2008_new, o_2010_new, o_2012_new)
)

## -----------------------------------------------------------------------------
# We persist the number of observations
counts_new <- final_data_for %>%
  cross_c2c() %>%
  group_by(year) %>%
  summarise(
    n = as.integer(round(sum(wei_freq_c2c))),
    n2 = as.integer(round(sum(wei_cross_c2c)))
  )

counts_old <- occup %>%
  group_by(year) %>%
  summarise(n = n(), n2 = n(), .groups = "drop")

identical(counts_new, counts_old)

# counts per each level
counts_per_level <- final_data_for %>%
  group_by(year, g_new_c2c) %>%
  summarise(n = sum(wei_freq_c2c), .groups = "drop") %>%
  arrange(g_new_c2c, year)

## -----------------------------------------------------------------------------
# from 2010 to 2008
occup_back_2008_2010 <- cat2cat(
  data = list(
    old = occup_2008, new = occup_2010,
    cat_var = "code", time_var = "year"
  ),
  mappings = list(trans = trans, direction = "backward"),
  ml = list(
    data = dplyr::bind_rows(occup_2010, occup_2012),
    cat_var = "code",
    method = c("knn"),
    features = c("age", "sex", "edu", "exp", "parttime", "salary"),
    args = list(k = 10)
  )
)

# from 2008 to 2006
occup_back_2006_2008 <- cat2cat(
  data = list(
    old = occup_2006,
    new = occup_back_2008_2010$old,
    cat_var_new = "g_new_c2c",
    cat_var_old = "code",
    time_var = "year"
  ),
  mappings = list(trans = trans, direction = "backward"),
  ml = list(
    data = dplyr::bind_rows(occup_2010, occup_2012),
    cat_var = "code",
    method = c("knn"),
    features = c("age", "sex", "edu", "exp", "parttime", "salary"),
    args = list(k = 10)
  )
)

o_2006_new <- occup_back_2006_2008$old
# or occup_back_2006_2008$new
o_2008_new <- occup_back_2008_2010$old
o_2010_new <- occup_back_2008_2010$new
o_2012_new <- dummy_c2c(occup_2012, cat_var = "code", ml = c("knn"))

final_data_back_ml <- do.call(
  rbind,
  list(o_2006_new, o_2008_new, o_2010_new, o_2012_new)
)

## -----------------------------------------------------------------------------
counts_new <- final_data_back_ml %>%
  cross_c2c() %>%
  group_by(year) %>%
  summarise(
    n = as.integer(round(sum(wei_freq_c2c))),
    n2 = as.integer(round(sum(wei_cross_c2c))),
    .groups = "drop"
  )

counts_old <- occup %>%
  group_by(year) %>%
  summarise(n = n(), n2 = n(), .groups = "drop")

identical(counts_new, counts_old)

# counts per each level
counts_per_level <- final_data_back_ml %>%
  group_by(year, g_new_c2c) %>%
  summarise(n = sum(wei_freq_c2c), .groups = "drop") %>%
  arrange(g_new_c2c, year)

## -----------------------------------------------------------------------------
ff <- final_data_back_ml %>%
  split(.$year) %>%
  lapply(function(x) {
    x %>%
      cross_c2c() %>%
      prune_c2c(column = "wei_cross_c2c", method = "highest1")
  }) %>%
  bind_rows()
all.equal(nrow(ff), sum(final_data_back_ml$wei_freq_c2c))

## -----------------------------------------------------------------------------
## orginal dataset
lms2 <- lm(
  I(log(salary)) ~ age + sex + factor(edu) + parttime + exp,
  data = occup_old,
  weights = multiplier
)
summary(lms2)

## using one highest cross weights
## cross_c2c to cross differen methods weights
## prune_c2c
## highest1 leave only one the highest probability obs for each subject
occup_old_2 <- occup_2$old %>%
  cross_c2c(., c("wei_freq_c2c", "wei_knn_c2c"), c(1, 1) / 2) %>%
  prune_c2c(., column = "wei_cross_c2c", method = "highest1")
lms <- lm(
  I(log(salary)) ~ age + sex + factor(edu) + parttime + exp,
  data = occup_old_2,
  weights = multiplier
)
summary(lms)

## we have to adjust size of stds
## as we artificialy enlarge degrees of freedom
occup_old_3 <- occup_2$old %>%
  prune_c2c(method = "nonzero") # many prune methods like highest
lms_replicated <- lm(
  I(log(salary)) ~ age + sex + factor(edu) + parttime + exp,
  data = occup_old_3,
  weights = multiplier * wei_freq_c2c
)
# Adjusted R2 is meaningless here
lms_replicated$df.residual <-
  nrow(occup_old) - length(lms_replicated$assign)
suppressWarnings(summary(lms_replicated))

## -----------------------------------------------------------------------------
formula_oo <- formula(
  I(log(salary)) ~ age + sex + factor(edu) + parttime + exp + factor(year)
)
oo <- final_data_back %>%
  prune_c2c(method = "nonzero") %>% # many prune methods like highest
  group_by(g_new_c2c) %>%
  filter(n() >= 15) %>%
  do(
    lm = tryCatch(
      summary(lm(formula_oo, ., weights = multiplier * wei_freq_c2c)),
      error = function(e) NULL
    )
  ) %>%
  filter(!is.null(lm))

head(oo)

oo$lm[[2]]

## -----------------------------------------------------------------------------
library("cat2cat")
data("verticals", package = "cat2cat")

agg_old <- verticals[verticals$v_date == "2020-04-01", ]
agg_new <- verticals[verticals$v_date == "2020-05-01", ]

## cat2cat_agg - could map in both directions at once although
## usually we want to have old or new representation
agg <- cat2cat_agg(
  data = list(
    old = agg_old,
    new = agg_new,
    cat_var = "vertical",
    time_var = "v_date",
    freq_var = "counts"
  ),
  Automotive %<% c(Automotive1, Automotive2),
  c(Kids1, Kids2) %>% c(Kids),
  Home %>% c(Home, Supermarket)
)

## possible processing
library("dplyr")
agg %>%
  bind_rows() %>%
  group_by(v_date, vertical) %>%
  summarise(
    sales = sum(sales * prop_c2c),
    counts = sum(counts * prop_c2c),
    v_date = first(v_date),
    .groups = "drop"
  )

## -----------------------------------------------------------------------------
library(cat2cat)
## the ean variable is a unique identifier
data("verticals2", package = "cat2cat")

vert_old <- verticals2[verticals2$v_date == "2020-04-01", ]
vert_new <- verticals2[verticals2$v_date == "2020-05-01", ]

## get mapping (transition) table
trans_v <- vert_old %>%
  inner_join(vert_new, by = "ean") %>%
  select(vertical.x, vertical.y) %>%
  distinct()

## -----------------------------------------------------------------------------
## cat2cat
## it is important to set id_var as then we merging categories 1 to 1
## for this identifier which exists in both periods.
verts <- cat2cat(
  data = list(
    old = vert_old, new = vert_new, id_var = "ean",
    cat_var = "vertical", time_var = "v_date"
  ),
  mappings = list(trans = trans_v, direction = "backward")
)

Any scripts or data that you put into this service are public.

cat2cat documentation built on Feb. 16, 2023, 7:11 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

cat2cat
Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

inst/doc/cat2cat.R
In cat2cat: Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

Try the cat2cat package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cat2cat Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

inst/doc/cat2cat.R In cat2cat: Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

Try the cat2cat package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cat2cat
Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset

inst/doc/cat2cat.R
In cat2cat: Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset