In daranzolin/rCAEDDATA: California Department of Education Datasets

# Chunk defaults used when knitting this README: source and output are
# collapsed into a single block, output lines are prefixed with "#>", and
# generated figure files get a "README-" prefix.
knitr::opts_chunk$set(fig.path = "README-", comment = "#>", collapse = TRUE)

The California Department of Education in R

The California Department of Education provides ample data. Now, that data is available in R.

Installation

devtools::install_github("daranzolin/rCAEDDATA")
library(rCAEDDATA)

Available Datasets

Cohort Outcome Data ("cohorts") -- California Longitudinal Pupil Achievement Data System (CALPADS) cohort outcome data reported by race/ethnicity, program participation, and gender.
Dropouts by Race and Gender ("dropouts") -- Data for grade seven through twelve dropouts and enrollment by race/ethnic designation and gender by school.
English Learners by Grade and Language ("english_learners") -- Data for English learners (ELs) by grade, language, and school.
Enrollment by School ("enrollments") -- Data for school-level enrollment by racial/ethnic designation, gender, and grade.
Student Poverty FRPM ("frpm") -- Data for students eligible for Free or Reduced Price Meals (FRPM).
Graduates by Race and Gender ("graduates") -- Data for graduates and graduates meeting University of California (UC)/California State University (CSU) entrance requirements by race/ethnic designation and gender by school.
Primary and Short-Term Enrollment ("primary_and_short_term") -- Data for primary and short-term school-level enrollment by racial/ethnic designation, gender, and grade.
Expulsion and Suspension Data ("suspensions") -- Student discipline data by ethnicity, covering expulsions, in-school suspensions, and out-of-school suspensions.
Truancy -- Aggregate truancy data at the state, county, district, and school levels, including Census Day enrollment, cumulative enrollment, and rates.

Examples

Graduates

library(rCAEDDATA)
library(tidyverse)
data("graduates")

# Stacked bar chart of graduates per year, split by whether they met UC/CSU
# entrance requirements ("Yes") or not ("No").
graduates %>%
  group_by(YEAR) %>%
  summarize(
    Yes = sum(UC_GRADS),
    # Everyone counted in GRADS who is not counted in UC_GRADS.
    No = sum(GRADS) - Yes
  ) %>%
  # pivot_longer() supersedes gather(): one row per YEAR / Eligibility pair.
  pivot_longer(
    c(Yes, No),
    names_to = "Eligibility",
    values_to = "Graduates"
  ) %>%
  ggplot(aes(YEAR, Graduates, fill = Eligibility)) +
  # geom_col() is the dedicated form of geom_bar(stat = "identity").
  geom_col(color = "black") +
  labs(
    x = "Year",
    y = "Graduates",
    title = "California High School Graduates, 1992-2016",
    fill = "UC Eligible?"
  ) +
  scale_y_continuous(labels = scales::comma) +
  # Fill levels sort alphabetically, so "No" is yellow and "Yes" is light blue.
  scale_fill_manual(values = c("yellow", "lightblue")) +
  theme_minimal()

Dropouts

data("dropouts")

# Share of dropouts by gender within each grade; position = "fill" rescales
# every bar to a total of 1 so the bars show proportions, not counts.
dropouts %>%
  # Anchor the pattern so only columns named exactly "D" + 1-2 digits are
  # selected (presumably the per-grade dropout counts D7 ... D12 -- confirm
  # against names(dropouts)).
  select(GENDER, matches("^D[0-9]{1,2}$")) %>%
  # pivot_longer() supersedes gather(): one row per GENDER / grade column.
  pivot_longer(-GENDER, names_to = "GRADE", values_to = "DROPOUTS") %>%
  # Strip the leading "D" so the grade can be plotted on a numeric axis.
  mutate(GRADE = as.numeric(stringr::str_remove(GRADE, "^D"))) %>%
  group_by(GENDER, GRADE) %>%
  summarize(DROPOUTS = sum(DROPOUTS)) %>%
  ggplot(aes(GRADE, DROPOUTS, fill = GENDER)) +
  # geom_col() is the dedicated form of geom_bar(stat = "identity").
  geom_col(position = "fill") +
  scale_x_continuous(breaks = 7:12) +
  labs(
    x = "Grade",
    y = "",
    title = "Proportion of Student Dropouts by Gender, Grades 7-12",
    fill = "Gender"
  ) +
  theme_minimal()

Enrollments

# Loaded explicitly, matching the graduates and dropouts examples.
data("enrollments")

# Ethnic composition of enrollment over time, one facet per district;
# position = "fill" rescales every bar to a total of 1.
enrollments %>%
  # Filter first so the recode and reshape only touch the plotted districts.
  filter(
    DISTRICT %in% c(
      "Santa Clara Unified",
      "Milpitas Unified",
      "San Jose Unified",
      "Fremont Union High",
      "Mountain View-Los Altos Union High",
      "Cupertino Union",
      "Campbell Union",
      "Cambrian",
      "Palo Alto Unified"
    )
  ) %>%
  # Replace the numeric ETHNIC codes with readable labels; codes not listed
  # here become NA.
  mutate(
    ETHNIC = case_when(
      ETHNIC == 0 ~ "Not Reported",
      ETHNIC == 1 ~ "American Indian",
      ETHNIC == 2 ~ "Asian",
      ETHNIC == 3 ~ "Pacific Islander",
      ETHNIC == 4 ~ "Filipino",
      ETHNIC == 5 ~ "Hispanic",
      ETHNIC == 6 ~ "African American",
      ETHNIC == 7 ~ "White",
      ETHNIC %in% c(8, 9) ~ "Two or More"
    )
  ) %>%
  select(DISTRICT, YEAR, ETHNIC, starts_with("GR_")) %>%
  # pivot_longer() supersedes gather(): one row per grade column.
  pivot_longer(
    -c(DISTRICT, YEAR, ETHNIC),
    names_to = "GRADE",
    values_to = "STUDENTS"
  ) %>%
  # Sum across all grade columns to get one total per district/year/ethnicity.
  group_by(DISTRICT, YEAR, ETHNIC) %>%
  summarize(TOTAL_STUDENTS = sum(STUDENTS)) %>%
  ggplot(aes(YEAR, TOTAL_STUDENTS, fill = ETHNIC)) +
  # geom_col() is the dedicated form of geom_bar(stat = "identity").
  geom_col(position = "fill") +
  facet_wrap(~DISTRICT, nrow = 3) +
  labs(
    x = "Year",
    y = "",
    title = "Ethnic Diversity in Silicon Valley, 2007-2017",
    subtitle = "Santa Clara Districts",
    fill = "Ethnicity"
  ) +
  theme_minimal() +
  # Tilt the year labels so they do not overlap inside the narrow facets.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Suspensions

library(maps)
library(ggmap)
library(mapdata)

# Polygon outlines for the state of California and for its counties.
states <- map_data("state")
ca_df <- subset(states, region == "california")
counties <- map_data("county")
ca_county <- subset(counties, region == "california")

# Per-NAME totals for 2014-15 and the share of suspensions that were
# drug-related.
# NOTE(review): "AGGEGATELEVEL" looks misspelled but is the column name this
# example filters on -- confirm against names(suspensions) before renaming.
# Presumably "O" marks county-level rows, since NAME is matched against county
# names below -- confirm.
drug_data <- suspensions %>%
  filter(
    YEAR == "2014-15",
    AGGEGATELEVEL == "O"
  ) %>%
  group_by(NAME) %>%
  summarize(
    TOTAL_DRUGS = sum(DRUGS, na.rm = TRUE),
    # Overwrites TOTAL with its per-NAME sum; DRUG_PROP below uses that sum.
    TOTAL = sum(TOTAL, na.rm = TRUE),
    DRUG_PROP = round(TOTAL_DRUGS / TOTAL, 2)
  )

# Attach the proportions to the county polygons. The map data uses lower-case
# county names in `subregion`, so NAME is lower-cased to build the join key.
# Named county_drug_map (not map_data) so ggplot2::map_data() is not shadowed.
county_drug_map <- left_join(
  ca_county,
  mutate(drug_data, subregion = stringr::str_to_lower(NAME)),
  by = "subregion"
)

ggplot(data = ca_df, mapping = aes(x = long, y = lat, group = group)) +
  coord_fixed(1.3) +
  # Gray state silhouette underneath, so counties without data stay visible.
  geom_polygon(color = "black", fill = "gray") +
  geom_polygon(data = county_drug_map, aes(fill = DRUG_PROP), color = "white") +
  # Redraw the state border on top of the county fills.
  geom_polygon(color = "black", fill = NA) +
  labs(
    title = "Proportion of Drugs-Related Suspensions by County, 2014-2015",
    fill = "Proportion"
  ) +
  theme_void() +
  viridis::scale_fill_viridis()