knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(babynames)

Let's write a function that simulates 3 data files.

1) Demographics has all of students in it with 3 years. Columns are student id, year, name, sex, learning disability.

2) Math test has 80% of student-years in it. Columns are student id, year, and math score. Math score is missing in 20% of cases and is N(105 - 15 * learning disability, 15).

3) ELL test is taken by the 40% flagged as potentially ELL. Columns are student id, year, and score. Ell score is missing in 30% of cases. Scores are N(90 - 5 * learning disability, 15). Students no longer take the test once they score 100.

Part 1: Write the function

student_data_sim <- function(n_students = 1000){
  # ---------------- Demographics ----------------
  years <- c(2015, 2016, 2017)

  # need a special function to simulate learning disability in a semi-realistic way
  sim_learning_disability <- function(){
    x <- rep(NA, 3)
    x[1] <- runif(n = 1) < 0.2
    x[2] <- runif(n = 1) < 0.05 + 0.65 * x[1]
    x[3] <- runif(n = 1) < 0.05 + 0.20 * x[1] + 0.45 * x[2]
    x
  }

  # create demographics
  demographics <- babynames %>% 
    select(name, sex) %>% 
    distinct() %>% 
    split(.$sex) %>% 
    map(sample_n, size = n_students %/% 2, replace = TRUE) %>% # balance gender
    bind_rows() %>% 
    sample_frac() %>% # randomly shuffle dataframe
    mutate(student_id = paste0("S", sample(x = 0:(1000 * n_students), size = n_students))) %>% 
    crossing(year = years) %>% 
    group_by(student_id) %>% 
    mutate(learning_disability = sim_learning_disability()) %>% 
    ungroup() %>% 
    select(student_id, year, name, sex, learning_disability)

  # ---------------- Math test ----------------
  # function randomly input na values in a vector x with probability p
  random_na <- function(x, p){
   x[runif(length(x)) < p] <- NA
   x
  }

  # create math_test
  math_test <- demographics %>% 
    select(student_id, year, learning_disability) %>% 
    sample_frac(0.8) %>% # 80% of students took the math test
    mutate(math_score = learning_disability %>% map_dbl(~ rnorm(1, 105 - 15 * ., 15) %>% round()),
           math_score = random_na(math_score, 0.2)) # N(105 - 15 * LD, 15) and 20% of values go missing

  # ---------------- ELL test ----------------
  # Function to have student stop taking test once they score over 100
  first_100 <- function(ell_score){
    rows <- length(ell_score)
    ell_score <- ell_score[!is.na(ell_score)]
    ifelse(suppressWarnings(max(ell_score) >= 100), which(ell_score >= 100)[1], rows)
  }

  # create ell test
  ell_test <- data_frame(student_id = sample(unique(demographics$student_id), round(n_students * 0.4))) %>% 
    left_join(demographics, by = "student_id") %>% 
    mutate(ell_score = learning_disability %>% map_dbl(~ rnorm(1, 90 - . * 5, 15) %>% round()),
           ell_score = random_na(ell_score, 0.3)) %>% # N(90 - 5 * LD, 15) and 20% of values go missing
    group_by(student_id) %>% 
    slice(1:first_100(ell_score)) %>% # stop taking test if get over score of 100
    ungroup()

  # ---------------- Return ----------------
  data_frame(name = c("demographics", "math_test", "ell_test"),
             data = list(demographics, math_test, ell_test)) %>% 
    mutate(data = data %>% map(~ arrange(., student_id, year)))
}

Part 2: Use the function

sample_data <- student_data_sim(n_students = 10000)

sample_data %>% write_rds("sample_data.rds")


stenhaug/combineR documentation built on May 31, 2019, 5:13 p.m.