knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "tools/figs/README-",
  message = FALSE,
  warning = FALSE
)

OpenSDP Data

A project to generate realistic synthetic unit-level longitudinal education data.

Design Goals

  1. Generate synthetic education data that is realistic for use by analysts across the education sector. Realistic here means messy and reflective of the general pattern of relationships found in the U.S. education sector.
  2. Synthetic data should be generated on demand and be responsive to user inputs. These inputs should allow users to configure the process to produce data that resembles the patterns found in their own agency.
  3. The package should be modular and extensible, allowing new data topics to be generated as needed so that synthetic data coverage can grow.

Make some data

Using the wakefield package, we can generate a simple set of demographic data.

library(OpenSDPsynthR)
library(magrittr)
library(wakefield)
library(lubridate)
library(purrr)
set.seed(612)

demog_master <- r_data_frame(n = 2000, 
                             id(random = TRUE), 
                             sex, 
                             # dob, set range of years available for birth
                             dob(start = Sys.Date() - 365 * 25, 
                                 k = 365 * 8, by = "1 days"), 
                             race(x = c("White", "Hispanic or Latino Ethnicity", 
                                        "Black or African American", 
                               "Asian", "American Indian or Alaska Native", 
                              "Native Hawaiian or Other Pacific Islander", 
                                        "Demographic Race Two or More Races"), 
                        prob = c(0.637, 0.163, 0.122, 0.047, .007, .0015, .021)))

head(demog_master)

Next, let's break the "Race" variable into a series of indicator variables.

demog_master %<>% make_inds("Race")
demog_master %<>% mutate_at(5:11, 
                        funs(recode(., `0` = "No", `1` = "Yes")))
head(demog_master[, 4:9])

Now, let's generate some variables conditional on race. To do this we build a list that defines the distribution of this new variable for each category of the factor level.

# List of conditional probabilities
ses_list <- list("White" = list(f = rnorm, 
                                pars = list(mean = 0.3, sd = 1.1)), 
                 "Hispanic or Latino Ethnicity" = list(f = rnorm, 
                                pars = list(mean = -0.1, sd = 0.9)),
                 "Black or African American" = list(f = rnorm, 
                                pars = list(mean = -0.2, sd = 1.2)), 
                    "Asian" = list(f = rnorm, 
                                pars = list(mean = 0.23, sd = 1.2)), 
                 "Demographic Race Two or More Races" = list(f = rnorm, 
                                pars = list(mean = 0.0, sd = 1)), 
                 "American Indian or Alaska Native" = list(f = rnorm, 
                                pars = list(mean = -0.2, sd = 1)), 
                    "Other" = list(f = rnorm, 
                                pars = list(mean = 0, sd = 1)),
                 "Native Hawaiian or Other Pacific Islander" = list(f = rnorm, 
                                pars = list(mean = 0, sd = 1))
                    )

ses_list_b <- list("White" = list(f = rbinom, 
                                pars = list(size = 1, prob = 0.4)), 
                 "Hispanic or Latino Ethnicity" = list(f = rbinom, 
                              pars = list(size = 1, prob = 0.6)),
                 "Black or African American" = list(f = rbinom, 
                              pars = list(size = 1, prob = 0.65)), 
                 "Asian" = list(f = rbinom, 
                                pars = list(size = 1, prob = 0.375)), 
                 "Demographic Race Two or More Races" = list(f = rbinom, 
                                pars = list(size = 1, prob = 0.4)), 
                 "American Indian or Alaska Native" = list(f = rbinom, 
                              pars = list(size = 1, prob = 0.4)), 
                 "Other" = list(f = rbinom, 
                                pars = list(size = 1, prob = 0.4)),
                 "Native Hawaiian or Other Pacific Islander" = list(f = rbinom, 
                                  pars = list(size = 1, prob = 0.4))
)

# Note that cond_prob returns the whole data object
demog_master <- as.data.frame(demog_master)
demog_master <- cond_prob(demog_master, factor = "Race", 
                 newvar = "ses", prob_list = ses_list_b)

head(demog_master)

Now that we have basic individual demographics, let's add annual attributes.

## Generate student-year data
minyear <- 1997
maxyear <- 2016
stu_year <- vector(mode = "list", nrow(demog_master))

# Make a list of dataframes, one for each student, for each year
for(i in 1:nrow(demog_master)){
  tmp <- expand_grid_df(demog_master[i, c(1, 3)], 
                        data.frame(year = 1:12))

  tmp$year <- lubridate::year(tmp$DOB + (tmp$year + 4) * 365)
  stu_year[[i]] <- tmp; rm(tmp)
}

stu_year <- bind_rows(stu_year) %>% as.data.frame()
stu_year$age <- age_calc(dob = stu_year$DOB, 
                         enddate = as.Date(paste0(stu_year$year, "-09-21")),
                         units = "years", precise = TRUE)

head(stu_year)

ELL status is a good example. A student's initial ELL status strongly determines their future ELL status. To generate a student's ELL status over time, the initial ELL status of each student first needs to be set. Below, a dataframe of the first observation for each student in the dataset is created; it contains the ID, year, age in years, and Race of the student.

# Create ELL
### Initial
## Identify first enrollment period for a student
## Look up probability based on age/race of being ELL
## Assign student to ELL status or not in first year

stu_first <- stu_year %>% group_by(ID) %>% 
  mutate(flag = if_else(age == min(age), 1, 0)) %>% 
  filter(flag == 1) %>% select(-flag) %>% as.data.frame() %>% 
  select(ID, year, age)
stu_first <- inner_join(stu_first, demog_master[, c("ID", "Race")])
stu_first$age <- round(stu_first$age, 0)
head(stu_first)

To assign students to initial ELL status, we need three things:

  1. A function that generates a random status ("ELL", "Not ELL")
  2. Parameters that define the probability of being in those two statuses
  3. A baseline of observed probabilities, keyed to student characteristics, that supplies those parameters

Age and race are strong determinants of a student's initial ELL status: some racial groups are much more likely to be ELL, and younger students are more likely to be ELL than older students.

The OpenSDPsynthR package bakes in some baseline values using baseline objects. A baseline object is a simple list with three elements (a schematic sketch follows the list below):

  1. keys - the variable names that are required to match probabilities to cases (e.g. age, race, etc.)
  2. fun - the function used to generate the student status
  3. data - the data that sets the parameters of the function
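
For orientation, here is a schematic of what such a list looks like. The field names follow the description above, but the contents are invented for illustration and are not the values shipped with the package.

# Schematic only -- field names follow the description above; the values are
# made up and are not the baselines shipped with OpenSDPsynthR
bl_sketch <- list(
  keys = c("race", "age"),             # variables used to match cases
  fun  = rbinom,                       # function that generates the status
  data = data.frame(race = "white",    # one row of parameters per key
                    age  = 6,          # combination (values invented here)
                    prob = 0.25)
)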

Let's look at the baseline for ELL, which can be accessed using the get_baseline function.

bl_data <- get_baseline("ell")
bl_data$keys

The keys are race and age -- to use this baseline we need data that includes the student age and race.

The function that will be used is rbinom and it will be passed one parameter, x.

bl_data$fun

The bl_data$data object tells us what the value of x will be:

head(bl_data$data)

For each combination of age and race, rbinom will be assigned a different probability, reflecting the empirically observed probability of being an ELL student given the age and race provided.
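
As a rough illustration of that lookup-and-draw step (this is not the package's internal code, and the probability is made up), a single assignment might look like:

# Hypothetical illustration: 0.35 stands in for a looked-up baseline probability
p_lookup <- 0.35
rbinom(n = 1, size = 1, prob = p_lookup) # 1 = ELL, 0 = not ELL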

Before we can use this baseline data, however, we need to ensure that the values of age and race in our data match those in the baseline. Comparing the two shows that they do not yet match:

unique(bl_data$data$race)
levels(stu_first$Race)

Our stu_first object is mapped to the CEDS specification. To convert it from CEDS to a more analyst-friendly scheme, the OpenSDPsynthR package provides the map_CEDS() function.

# map_CEDS assigns a new vector, so put it in a new object
stu_first$race <- map_CEDS(stu_first$Race)
table(stu_first$race, stu_first$Race)[, 1:4]

With our data matching, we can now use the assign_baseline() function.

# Assign baseline creates a new vector, so assign it
stu_first$ell_first <- assign_baseline(baseline = "ell", data = stu_first)
# Recode it
stu_first$ell_first <- ifelse(stu_first$ell_first == 1, "Yes", "No")
head(stu_first)

Using the initial ELL status of students it is now possible to simulate the transition from ELL to non-ELL student.

To simulate this process, we can use a Markov chain defined by a transition matrix: https://en.wikipedia.org/wiki/Examples_of_Markov_chains

A transition matrix simply tabulates the number of times a vector transitions from one value to another. Given a student whose ELL status is defined as 0 = not ELL and 1 = ELL, with annual statuses given by:

Student A:
1 1 1 1 1 0 1 0 0 0

The transition matrix for this student is then:

from/to |  0  |  1
------- | --- | ---
0       |  2  |  1
1       |  2  |  4

To construct a proper Markov transition matrix, this matrix needs to be converted to probabilities that sum to 1 within each row.

from/to |  0   |  1
------- | ---- | ----
0       | 0.66 | 0.33
1       | 0.33 | 0.66

This can be read as: a student who is not ELL (0) has a 0.66 probability of remaining not ELL and a 0.33 probability of becoming ELL the next year, while a student who is ELL (1) has a 0.33 probability of exiting ELL status and a 0.66 probability of remaining ELL.
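
For reference, the counts and row normalization above can be reproduced with a few lines of base R. This is only a sketch of the arithmetic, not package code.

# Tabulate Student A's year-to-year transitions and normalize by row
s <- c(1, 1, 1, 1, 1, 0, 1, 0, 0, 0)
counts <- table(from = head(s, -1), to = tail(s, -1))
counts
round(counts / rowSums(counts), 2) # rows now sum to 1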

Then, using this transition matrix, we can generate a sequence of enrollment patterns that fits this process. This approach has two advantages: the transition matrix can be calibrated to observed (or user-supplied) counts, and the generated sequences preserve the year-to-year dependence in a student's status.
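
To make the mechanics concrete, here is a minimal hand-rolled sketch of that generation step; OpenSDPsynthR's make_markov_series() wraps the same idea with additional options (such as t0 handling), so treat this purely as an illustration.

# Illustration only: draw each year's status from the row of the transition
# matrix that matches the previous year's status
tm_toy <- matrix(c(0.67, 0.33,
                   0.33, 0.67), nrow = 2, byrow = TRUE,
                 dimnames = list(c("0", "1"), c("0", "1")))
simulate_chain <- function(n, tm, t0) {
  out <- character(n)
  out[1] <- t0
  for (i in 2:n) {
    out[i] <- sample(colnames(tm), 1, prob = tm[out[i - 1], ])
  }
  out
}
simulate_chain(10, tm = tm_toy, t0 = "1")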

Let's look at an example. First, we combine the first observation for each student with the annual data.

stu_year <- left_join(stu_year, stu_first[, c(1, 6)])
head(stu_year)

Now we define the transition matrix. Conveniently, we can input the observed counts and then normalize them into a transition matrix by dividing each row by its sum with rowSums().

# Define the transition matrix
statesNames <- c("No", "Yes")
tm <- matrix(c(800, 10, 200, 200), nrow = 2, byrow = TRUE,
             dimnames = list(statesNames, statesNames))
tm <- tm / rowSums(tm)
tm

Now, for each student we need to apply the transition matrix. Using the OpenSDPsynthR function make_markov_series(), this is simple.

make_markov_series(10, tm = tm)

And applying it to each student:

stu_year <- stu_year %>% 
  group_by(ID) %>% 
  arrange(ID, year) %>% 
  mutate(ell = make_markov_series(n() - 1, 
          tm = tm, # define the transition matrix
          t0 = ell_first[1], # start the series at the first observed status
          include.t0 = TRUE) # include the first observation in the sequence
         )

table(initialELL = stu_year$ell_first, byyear = stu_year$ell)

Diagnostics

How do we know it worked? We can look at the patterns of ELL enrollment that are observed and see what patterns are the most common. To do this, let's compute the frequency of transition states observed per student.

library(ggplot2)
library(tidyr)
plotdf <- stu_year %>% arrange(ID, year) %>% group_by(ID) %>% 
  do(tidy_sequence(.$ell, states = c("Yes", "No")))

plotdf$total <- rowSums(plotdf[, -1])
plotdf <- plotdf %>% gather(-ID, key = "Transition", value = "Count")

plotdf %>% group_by(Transition) %>% filter(Transition != "total") %>%
  summarize(sum(Count))


plotdf <- plotdf %>% filter(Transition != "total")  %>% 
  group_by(ID) %>% 
  mutate(total = sum(Count)) %>% 
  mutate(per = Count / total) %>% 
  separate(Transition, into = c("From", "To"), sep = "-") %>% 
  filter(Count < 13)

ggplot(plotdf, aes(Count)) + geom_histogram() + 
  scale_x_continuous(breaks = c(0:11)) + 
  facet_grid(From~To, labeller = label_both, switch = "y") + 
  theme_bw() + 
  labs(title = "Frequency of Transition States by Student", 
       y = "Count", x = "Times per Student State Observed")

Looking at this chart we can see that most students went from the No state to a No state -- as would be expected when there are few ELLs.

# Initial
table(stu_first$ell_first)

# Ever ELL after Markov
plotdf %>% group_by(ID, To) %>% 
  summarize(Count = sum(Count)) %>% filter(To == "Yes") %>% 
  filter(Count > 0) %>% with(., table(ELL = To))

Through this process we've gained students in ELL status who were not initially ELL. Depending on the application this may not be desirable, and we may want to modify the transition matrix to avoid it. Otherwise, this becomes an exercise in data cleaning later on.
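
For illustration only, one way to rule out late entry into ELL status would be a transition matrix whose "No" row places zero probability on moving to "Yes"; such a matrix could be passed as tm to make_markov_series() in place of the one used above.

# Illustrative only: non-ELL students never become ELL after their first year,
# while ELL students exit with probability 0.33 each year
tm_no_entry <- matrix(c(1.00, 0.00,
                        0.33, 0.67), nrow = 2, byrow = TRUE,
                      dimnames = list(c("No", "Yes"), c("No", "Yes")))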

Two other visual diagnostics are below.

# Other plots

ggplot(plotdf, aes(per)) + geom_density() + 
  facet_grid(From ~ To, labeller = label_both, switch = "y") + 
  theme_bw() + labs(title = "By Student Densities of Transitions")

# Heatmap
plotdf %>% group_by(From, To) %>% 
  summarise(Count = sum(Count)) %>% 
  ungroup %>% 
  mutate(total = sum(Count)) %>%
  mutate(per = Count/total) %>%
ggplot(aes(x = From, y = To, fill = per)) + 
  geom_tile(color= I("black")) + 
  geom_text(aes(label = round(per, digits = 2))) + 
  theme_minimal() +
  coord_cartesian()

We can also do a comparative diagnostic. Given the relatively short sequence per student, it will be hard to estimate fit reliably at the individual level.

# series <- stu_year$ell[stu_year$ID == "1705"]
# series <- stu_year$ell[stu_year$ID == "0001"]

test_fit <- function(series, expected){
  if(dim(table(series)) == 1){
    return(TRUE)
  } else {
  out <- fit_series(series, return = "fit", confidencelevel = 0.99, 
                    possibleStates = rownames(expected))
  low <- out$lowerEndpointMatrix < expected
  hi <- out$upperEndpointMatrix > expected
  return(all(low, hi))
  }
}

test_res <- stu_year %>% group_by(ID) %>% 
  summarize(fit_ok = test_fit(ell, expected = tm))

table(test_res$fit_ok)

A better test might be to look at the summed aggregate pattern across students. This involves creating a transition matrix (TM) per student; these will differ depending on whether a student was initially ELL or not. Summing them by group provides a more stable estimate of how the algorithm is working.

results <- stu_year %>% group_by(ell_first, ID) %>% 
  do(.out = markovchain::createSequenceMatrix(.$ell, 
                                          possibleStates = c("Yes", "No"))) %>% 
  ungroup %>%
  nest(-ell_first) %>%
  mutate(summed = map(data, ~ reduce(.$.out, `+`)))

lst <- results$summed
names(lst) <- results$ell_first
lst 

test_fit_m <- function(obs, expected, tol){
  obs <- obs / rowSums(obs)
  test_m <- abs(obs - expected)
  test_m < tol
}

test_fit_m(obs = lst$No, expected = tm, tol = 0.05)
test_fit_m(obs = lst$Yes, expected = tm, tol = 0.1)

Scaling

The advantage of this approach is that, by combining it with the cond_prob() function, we can further simulate discrete processes for students based on other characteristics. This can be done by group.

rm(tm, test_res, results, plotdf, bl_data, i, lst, maxyear, minyear)

# Need Race for this process
stu_year <- left_join(stu_year, stu_first[, c("ID", "Race")])

# Make a list of random transition matrices
tm_list <- replicate(8, matrix(c(sample(750:900, 1),
                    sample(400:500, 1),
                    sample(125:175, 1),
                    sample(1500:2200, 1)),
                    2, 2, dimnames = list(c("Yes", "No"), 
                    c("Yes", "No"))), simplify = FALSE) %>% 
  lapply(function(x) x / rowSums(x))

# Put them in a list in the format expected by cond_prob()
ses_list_MC <- list("White" = list(f = make_markov_series, 
                                pars = list(tm = tm_list[[1]])), 
                 "Hispanic or Latino Ethnicity" = list(f = make_markov_series, 
                                pars = list(tm = tm_list[[2]])),
                 "Black or African American" = list(f = make_markov_series, 
                                pars = list(tm = tm_list[[3]])),
                 "Asian" = list(f = make_markov_series, 
                                pars = list(tm = tm_list[[4]])), 
                 "Demographic Race Two or More Races" = list(f = make_markov_series, 
                                pars = list(tm = tm_list[[5]])), 
                 "American Indian or Alaska Native" = list(f = make_markov_series, 
                                pars = list(tm = tm_list[[6]])), 
                 "Other" = list(f = make_markov_series, 
                                pars = list(tm = tm_list[[7]])),
                 "Native Hawaiian or Other Pacific Islander" = list(f = make_markov_series, 
                                pars = list(tm = tm_list[[8]])))

stu_year <- cond_prob(stu_year, factor = "Race", 
                 newvar = "frpl", prob_list = ses_list_MC)

## Create the matrix by student by race, 
## Sum the student matrices within each race so that there is 1 matrix 
## per category
results <- stu_year %>% group_by(Race, ID) %>% 
  do(.out = markovchain::createSequenceMatrix(.$frpl, 
                            possibleStates = c("Yes", "No"))) %>% 
  ungroup %>%
  nest(-Race) %>%
  mutate(summed = map(data, ~ reduce(.$.out, `+`)))

## Turn this into a list for easier manipulation
lst <- results$summed
names(lst) <- results$Race
# lst 

# create a list that you can use to compare the observed matrix to
names(tm_list) <- names(ses_list_MC)
tm_list <- tm_list[names(lst)]
# Run the test_fit_m function over both lists simultaneously
map2(lst, tm_list, ~ test_fit_m(obs = .x, expected = .y, tol = 0.15)) %>% 
  map_lgl(all)

Now, let's take this to scale. We can make a by-variable list that specifies how status sequences should be generated for each group. The list is structured as in the gifted and IEP examples below.

stu_year <- left_join(stu_year, demog_master[, c("ID", "Sex")])
# gifted
tm_gifted_f <- matrix(c(500, 1, 2, 500), nrow = 2, byrow = TRUE, 
                      dimnames = list(c("Yes", "No"), c("Yes", "No")))
tm_gifted_m <- tm_gifted_f
# Make the TM different for boys: boys are less likely to transition out of gifted status
tm_gifted_m[1, 1] <- tm_gifted_m[1, 1] + 25

# Set initial condition as 10% gifted / 90% not for boys, 8% / 92% for girls
gifted_list <- list("Male" = list(f = make_markov_series, 
                                   pars = list(tm = tm_gifted_m/ rowSums(tm_gifted_m), 
                                    # Use quote so for each call in the loop sample is redrawn
                                         t0 = quote(sample(c("Yes", "No"), 1, prob = c(10, 90))))),
                       "Female" = list(f = make_markov_series, 
                                     pars = list(tm = tm_gifted_f / rowSums(tm_gifted_f), 
                                        t0 = quote(sample(c("Yes", "No"), 1, prob = c(8, 92))))))
# IEP
tm_iep_f <- matrix(c(250, 50, 150, 900), nrow = 2, byrow = TRUE, 
                      dimnames = list(c("Yes", "No"), c("Yes", "No")))
tm_iep_m <- tm_iep_f
tm_iep_m[, 1] <- tm_iep_m[, 1] + 50

iep_list <- list("Male" = list(f = make_markov_series, 
                                     pars = list(tm = tm_iep_m / rowSums(tm_iep_m), 
                                       t0 = quote(sample(c("Yes", "No"), 1, prob = c(20, 80))))),
                       "Female" = list(f = make_markov_series, 
                                       pars = list(tm_iep_f / rowSums(tm_iep_f)), 
                                       t0 = quote(sample(c("Yes", "No"), 1, prob = c(16, 84)))))


stu_year <- stu_year %>% group_by(ID) %>% arrange(year) %>% 
  mutate(iep = markov_cond_list(Sex[1], n = n(), iep_list),
          gifted = markov_cond_list(Sex[1], n = n(), gifted_list))

results <- stu_year %>% group_by(Sex) %>% 
  do(.out = markovchain::createSequenceMatrix(.$gifted, 
                                              possibleStates = c("Yes", "No"))) %>% 
  ungroup %>%
  nest(-Sex) %>%
  mutate(summed = map(data, ~ reduce(.$.out, `+`)))

## Turn this into a list for easier manipulation
lst <- results$summed
names(lst) <- results$Sex

Now let's add grades:

# grade_mat <- read.csv("data-raw/grade_matrix.csv")
# zzz <- as.matrix(grade_mat[, 2:15])
# dimnames(zzz) <- list(grade_mat[, 1], grade_mat[, 1])
# 
# make_markov_series(10, tm = zzz, t0 = "2")

gradeNames <- c("2", "1", "0", "-1")
tm_grade <- matrix(c(0, 800, 20, 2, 
                     10, 1200, 20, 2, 
                     12, 1200, 16, 0, 
                     20, 1200, 2, 1), nrow = 4, byrow=TRUE, 
               dimnames = list(gradeNames, gradeNames))
testMC <- as(tm_grade/rowSums(tm_grade), "markovchain")

tm_grade_f <- tm_grade/rowSums(tm_grade)
tm_grade_m <- tm_grade
tm_grade_m[, 2] <- tm_grade_m[, 2] + 30
tm_grade_m[, 3] <- tm_grade_m[, 3] + 5
tm_grade_m <- tm_grade_m/rowSums(tm_grade_m)

# make_markov_series(100, tm = tm_grade_m, t0 = "-1")


grade_list <- list("Male" = list(f = make_markov_series, 
                                     pars = list(tm = tm_grade_m)),
                       "Female" = list(f = make_markov_series, 
                                       pars = list(tm = tm_grade_f)))


stu_year <- stu_year %>% group_by(ID) %>% arrange(year) %>% 
  mutate(grade_adv = markov_cond_list(Sex[1], n = n(), grade_list))


results <- stu_year %>% group_by(Sex) %>% 
  do(.out = markovchain::createSequenceMatrix(.$grade_adv, 
                                  possibleStates = c("-1", "0", "1", "2"))) %>% 
  ungroup %>%
  nest(-Sex) %>%
  mutate(summed = map(data, ~ reduce(.$.out, `+`)))

## Turn this into a list for easier manipulation
lst <- results$summed
names(lst) <- results$Sex
lst

Clean up workspace and data

rm(tm_gifted_f, tm_gifted_m, tm_grade, tm_grade_f, tm_grade_m, results, 
   tm_iep_f, tm_iep_m, gifted_list, grade_list, gradeNames, ses_list, 
   ses_list_b, ses_list_MC, statesNames, testMC, tm_list, lst, iep_list)

stu_year$ell_first <- NULL

# Look at by-year patterns of relationships between statuses, by student-year
table(FRL = stu_year$frpl, GIFTED = stu_year$gifted)
table(FRL = stu_year$frpl, IEP = stu_year$iep)
table(GIFTED = stu_year$gifted, IEP = stu_year$iep)

gamma_GK(stu_year$gifted, stu_year$iep)
gamma_GK(stu_year$frpl, stu_year$iep)
gamma_GK(stu_year$frpl, stu_year$ell)

gamma_GK(stu_year$Race, stu_year$ell)
gamma_GK(stu_year$Sex, stu_year$iep)
gamma_GK(stu_year$Sex, stu_year$gifted)

Collapse down

test_df <- stu_year %>% group_by(ID) %>% 
  summarize(iep_ever = if_else(any(iep == "Yes"), "Yes", "No"), 
            ell_ever = if_else(any(ell == "Yes"), "Yes", "No"), 
            frpl_ever = if_else(any(frpl == "Yes"), "Yes", "No"), 
            gifted_ever = if_else(any(gifted == "Yes"), "Yes", "No"))

table(IEP_EVER = test_df$iep_ever)
table(ELL_EVER = test_df$ell_ever)
table(FRPL_EVER = test_df$frpl_ever)
table(GIFTED_EVER = test_df$gifted_ever)

Package Dependencies

In addition to OpenSDPsynthR itself, the code in this README uses the wakefield, magrittr, lubridate, purrr, dplyr, tidyr, ggplot2, and markovchain packages.

OpenSDP

OpenSDPsynthR is part of the OpenSDP project.


