knitr::opts_chunk$set(echo = TRUE) library(tidyverse) library(babynames)
Let's write a function that simulates 3 data files.
1) Demographics has all of students in it with 3 years. Columns are student id, year, name, sex, learning disability.
2) Math test has 80% of student-years in it. Columns are student id, year, and math score. Math score is missing in 20% of cases and is N(105 - 15 * learning disability, 15).
3) ELL test is taken by the 40% flagged as potentially ELL. Columns are student id, year, and score. Ell score is missing in 30% of cases. Scores are N(90 - 5 * learning disability, 15). Students no longer take the test once they score 100.
student_data_sim <- function(n_students = 1000){ # ---------------- Demographics ---------------- years <- c(2015, 2016, 2017) # need a special function to simulate learning disability in a semi-realistic way sim_learning_disability <- function(){ x <- rep(NA, 3) x[1] <- runif(n = 1) < 0.2 x[2] <- runif(n = 1) < 0.05 + 0.65 * x[1] x[3] <- runif(n = 1) < 0.05 + 0.20 * x[1] + 0.45 * x[2] x } # create demographics demographics <- babynames %>% select(name, sex) %>% distinct() %>% split(.$sex) %>% map(sample_n, size = n_students %/% 2, replace = TRUE) %>% # balance gender bind_rows() %>% sample_frac() %>% # randomly shuffle dataframe mutate(student_id = paste0("S", sample(x = 0:(1000 * n_students), size = n_students))) %>% crossing(year = years) %>% group_by(student_id) %>% mutate(learning_disability = sim_learning_disability()) %>% ungroup() %>% select(student_id, year, name, sex, learning_disability) # ---------------- Math test ---------------- # function randomly input na values in a vector x with probability p random_na <- function(x, p){ x[runif(length(x)) < p] <- NA x } # create math_test math_test <- demographics %>% select(student_id, year, learning_disability) %>% sample_frac(0.8) %>% # 80% of students took the math test mutate(math_score = learning_disability %>% map_dbl(~ rnorm(1, 105 - 15 * ., 15) %>% round()), math_score = random_na(math_score, 0.2)) # N(105 - 15 * LD, 15) and 20% of values go missing # ---------------- ELL test ---------------- # Function to have student stop taking test once they score over 100 first_100 <- function(ell_score){ rows <- length(ell_score) ell_score <- ell_score[!is.na(ell_score)] ifelse(suppressWarnings(max(ell_score) >= 100), which(ell_score >= 100)[1], rows) } # create ell test ell_test <- data_frame(student_id = sample(unique(demographics$student_id), round(n_students * 0.4))) %>% left_join(demographics, by = "student_id") %>% mutate(ell_score = learning_disability %>% map_dbl(~ rnorm(1, 90 - . * 5, 15) %>% round()), ell_score = random_na(ell_score, 0.3)) %>% # N(90 - 5 * LD, 15) and 20% of values go missing group_by(student_id) %>% slice(1:first_100(ell_score)) %>% # stop taking test if get over score of 100 ungroup() # ---------------- Return ---------------- data_frame(name = c("demographics", "math_test", "ell_test"), data = list(demographics, math_test, ell_test)) %>% mutate(data = data %>% map(~ arrange(., student_id, year))) }
sample_data <- student_data_sim(n_students = 10000) sample_data %>% write_rds("sample_data.rds")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.