In josiahpjking/DasGuptR: Das Gupta Standardisation and Decomposition

#| echo: false
knitr::opts_chunk$set(
  message=FALSE,
  warning=FALSE
)

library(DasGuptR)
library(tidyverse)
library(gt)

For cross-classified data, Chevan & Sutherland 2009{target="_blank"} provide a method for decomposing differences in crude rates into differences in each specific category of the sub-population variables in terms of change in what proportion of the population they make up, and changes in sub-population rates.

Below we demonstrate this using eg5.3 from Das Gupta 1993 {target="_blank"}, and then Scottish Reconvictions data.

Example 5.3, Das Gupta 1993

eg5.3 <- data.frame(
  race = rep(rep(1:2, e=11),2),
  age = rep(rep(1:11,2),2),
  pop = rep(c(1985,1970), e=22),
  size = c(3041,11577,27450,32711,35480,27411,19555,19795,15254,8022,2472,
           707,2692,6473,6841,6547,4352,3034,2540,1749,804,236,
           2968,11484,34614,30992,21983,20314,20928,16897,11339,5720,1315,
           535,2162,6120,4781,3096,2718,2363,1767,1149,448,117),
  rate = c(9.163,0.462,0.248,0.929,1.084,1.810,4.715,12.187,27.728,64.068,157.570,
           17.208,0.738,0.328,1.103,2.045,3.724,8.052,17.812,34.128,68.276,125.161,
           18.469,0.751,0.391,1.146,1.287,2.672,6.636,15.691,34.723,79.763,176.837,
           36.993,1.352,0.541,2.040,3.523,6.746,12.967,24.471,45.091,74.902,123.205)
)

Here is the population-level decomposition:

dgnpop(eg5.3, pop = "pop", factors = "rate", 
       id_vars = c("race", "age"), crossclassified = "size") |>
  dg_table()

In order to calculate the influence of changes in specific categories, we need to first get out the sub-population level standardised rates from the decomposition. When we specify crossclassified argument in dgnpop(), this will aggregate up to population level unless we also specify agg = FALSE.

As we are working with vector-factors, this will return rates at the sub-population levels:

res <- dgnpop(eg5.3, pop = "pop", factors="rate",
              id_vars = c("race","age"), crossclassified = "size", agg = FALSE)
head(res)

We can take those individual rates, and then calculate the category effects for each group of the sub-population variables --- e.g., for each $i$ of race and $j$ of age.

In the scenario that we have two cross-classified variables $I$ and $J$ and the relevant sub-population rates $T$, dgnpop() will return three standardised rates: $P-I$ standardised, $P-J$ standardised, and $P-T$ standardised.

We can see these for each sub-population (reshaped to wide):

res |>
  pivot_wider(names_from = factor, values_from = rate)

To calculate category effects, we aggregate these sub-population rates up to the categories in for each cross-classified variable independently for each population (see Chevan & Sutherland 2009{target="_blank"}), and the differences between populations provide the category effects.

Race categories:

race_ce <- res |>
  pivot_wider(names_from = factor, values_from = rate) |>
  group_by(pop,race) |>
  summarise(
    # sum of P-I std rates
    IAi = sum(race_struct),
    # sum of P-T std rates / number of crossclass vars
    RTi = sum(rate)*(1/2),
    CEi = IAi+RTi
  ) |>
  group_by(race) |>
  summarise(
    # category composition effect
    comp_ce = IAi[pop==1985] - IAi[pop==1970],
    # category rate effect
    rate_ce = RTi[pop==1985] - RTi[pop==1970],
    # category total effect
    tot_ce = CEi[pop==1985] - CEi[pop==1970]
  )

Age categories:

age_ce <- res |>
  pivot_wider(names_from = factor, values_from = rate) |>
  group_by(pop, age) |>
  summarise(
    # sum of P-J std rates
    JBj = sum(age_struct),
    # sum of P-T std rates / number of crossclass vars
    RTj = sum(rate)*(1/2),
    CEj = JBj+RTj
  ) |>
  group_by(age) |>
  summarise(
    # category composition effect
    comp_ce = JBj[pop==1985] - JBj[pop==1970],
    # category rate effect
    rate_ce = RTj[pop==1985] - RTj[pop==1970],
    # category total effect
    tot_ce = CEj[pop==1985] - CEj[pop==1970]
  )

The category-total effects will sum back up to the difference in the crude rates (see top of page)

sum( c(race_ce$tot_ce, age_ce$tot_ce) )

And the category-rate effects will sum back up to the differences in rate-adjusted rates:

sum( c(race_ce$rate_ce, age_ce$rate_ce) )

We can simply take these as a percentage of the crude rate differences.

The category-composition effect (comp_ce below) represents the percentage of the crude rate difference that is due to change in $\frac{\text{category size}}{\text{total population size}}$.
The category-rate effect (rate_ce below) represents the percentage of the crude rate difference that is due to change in category-specific rates.
The category-total effect (tot_ce below) represents the percentage of the crude rate difference that is due to change in both category relative size and category specific rates.

bind_rows(
  race_ce |> mutate(var = "race") |> rename(category = race),
  age_ce |> mutate(var = "age") |> rename(category = age)
) |> 
  mutate(
    comp_ce = comp_ce/sum(tot_ce)*100,
    rate_ce = rate_ce/sum(tot_ce)*100,
    tot_ce = tot_ce/sum(tot_ce)*100
  ) |>
  group_by(var) |> 
  mutate(across(2:4, ~round(.,2))) |>
  gt()

Note that these will map to how each category changes between the two populations with respect to a) the proportion of the population that the category constitutes, and b) the average rate. If a category makes up a larger proportion of the population, its category-composition effect will be positive (if the crude rate difference is positive). If the category specific rate changes in the same direction as the crude rate changes, the category-rate effect is positive. The benefit of Chevan & Sutherland{target="_blank"}'s approach is that these are now scaled proportional to the total crude rate difference.

Changes in category proportions:

eg5.3 |> group_by(pop, age) |> 
  summarise(size = sum(size)) |> 
  group_by(pop) |> 
  mutate(prop = size/sum(size)) |> 
  group_by(age) |> 
  summarise(delta = prop[pop==1985] - prop[pop==1970])

Scottish Reconvictions Data

Data:

data(reconv)
srec <- reconv |> filter(year %in% c(2004,2016)) |>
  select(year, Sex, Age, offenders, prev_rate)

Population level decomposition:

 dgnpop(srec, pop = "year", factors = c("prev_rate"),
              id_vars = c("Age","Sex"),
              crossclassified = "offenders", agg = TRUE) |>
  dg_table()

Sub-population standardised rates:

res <- dgnpop(srec, pop = "year", factors = c("prev_rate"),
              id_vars = c("Age","Sex"),
              crossclassified = "offenders", agg = FALSE)

Category effects for Sex and Age:

sex_ce <- res |>
  pivot_wider(names_from = factor, values_from = rate) |>
  group_by(pop, Sex) |>
  summarise(
    IAi = sum(Sex_struct),
    RTi = sum(prev_rate)*(1/2),
    CEi = IAi+RTi
  ) |>
  group_by(Sex) |>
  summarise(
    comp_ce = diff(IAi),
    rate_ce = diff(RTi),
    tot_ce = diff(CEi)
  )

age_ce <- res |>
  pivot_wider(names_from = factor, values_from = rate) |>
  group_by(pop, Age) |>
  summarise(
    JBj = sum(Age_struct),
    RTj = sum(prev_rate)*(1/2),
    CEj = JBj+RTj
  ) |>
  group_by(Age) |>
  summarise(
    comp_ce = diff(JBj),
    rate_ce = diff(RTj),
    tot_ce = diff(CEj)
  )

bind_rows(
  sex_ce |> mutate(var = "sex") |> rename(category = Sex),
  age_ce |> mutate(var = "age") |> rename(category = Age)
) |> 
  mutate(
    comp_ce = comp_ce/sum(tot_ce)*100,
    rate_ce = rate_ce/sum(tot_ce)*100,
    tot_ce = tot_ce/sum(tot_ce)*100
  ) |>
  group_by(var) |> 
  mutate(across(2:4, ~round(.,2))) |>
  gt()