R/parcc.R

Defines functions fetch_all_parcc fetch_parcc tidy_parcc_subgroup process_parcc parcc_column_order get_raw_sla get_raw_parcc

Documented in fetch_all_parcc fetch_parcc get_raw_parcc get_raw_sla parcc_column_order process_parcc tidy_parcc_subgroup

#' @title reads the raw PARCC excel files from the state website
#' 
#' @description
#' \code{get_raw_parcc} builds a url and reads the xlsx file into a dataframe
#' 
#' @param end_year a school year.  end_year is the end of the academic year - eg 2014-15
#' school year is end_year 2015.  valid values are 2015-2017
#' @param grade_or_subj grade level (eg 8) OR math subject code (eg ALG1, GEO, ALG2)
#' @param subj PARCC subject. c('ela' or 'math')
#' @return PARCC dataframe
#' @export

get_raw_parcc <- function(end_year, grade_or_subj, subj) {  
  
  if (is.numeric(grade_or_subj)) {
    parcc_grade <- pad_grade(grade_or_subj)
    
    #in 2017 they forgot how grade levels work
    if (end_year == 2017 & grade_or_subj >= 10) {
      parcc_grade <- paste0('0', parcc_grade)
    }
    #in 2018 - honestly I just can't.  
    # fine, state of NJ, ELA003. it's only broken code, not life and death, as they say.
    if (end_year == 2018 & subj == 'ela') {
      parcc_grade <- paste0('0', parcc_grade) 
    }
  } else {
    parcc_grade <- grade_or_subj
  }
  
  stem <- 'https://www.nj.gov/education/assessment/results/reports/'
  
  #after 2016 they
  #added a spring / fall element
  #eg http://www.nj.gov/education/schools/achievement/16/parcc/spring/ELA03.xlsx
  #we're pulling spring only (for now)
  season_variant <- if (end_year >= 2016) {
    'spring/'
  } else {
    'parcc/'
  }
  
  sy <- as.numeric(substr(end_year, 3, 4))
  
  target_url <- paste0(
    stem, sy - 1, sy, '/', season_variant,
      parse_parcc_subj(subj), parcc_grade, '.xlsx' 
  )
  
  tname <- tempfile(pattern = 'parcc', tmpdir = tempdir(), fileext = '.xlsx')
  tdir <- tempdir()
  downloader::download(target_url, destfile = tname, mode = 'wb') 
  parcc <- readxl::read_excel(path = tname, skip = 2, na = '*', guess_max = 30000)
  
  #last two rows are notes
  parcc <- parcc[1:(nrow(parcc)-2), ]
  parcc
}


#' Title
#'
#' @inheritParams get_raw_parcc
#'
#' @return NJSLA dataframe
#' @export

get_raw_sla <- function(end_year, grade_or_subj, subj) {
  
  if (is.numeric(grade_or_subj)) {
    parcc_grade <- pad_grade(grade_or_subj)
    subj <- parse_parcc_subj(subj)
  } else if (grepl('ALG|GEO', grade_or_subj)) {
    parcc_grade <- gsub('ALG', 'ALG0', grade_or_subj)
    subj <- ''
  } else {
    parcc_grade <- grade_or_subj
    subj <- parse_parcc_subj(subj)
  }
  
  stem <- 'https://www.nj.gov/education/assessment/results/reports/'
  
  target_url <- paste0(
    stem, substr(end_year - 1, 3, 4), substr(end_year, 3, 4), '/spring/',
    subj, parcc_grade, '%20NJSLA%20DATA%20', 
    # "2018-19"
    end_year - 1, '-', substr(end_year, 3, 4),
    '.xlsx' 
  )
  # 19/njsla/spring/ALG01%20NJSLA%20DATA%202018-19.xlsx
  # 19/njsla/spring/ALG02%20NJSLA%20DATA%202018-19.xlsx
  
  tname <- tempfile(pattern = 'njsla', tmpdir = tempdir(), fileext = '.xlsx')
  tdir <- tempdir()
  downloader::download(target_url, destfile = tname, mode = 'wb') 
  njsla <- readxl::read_excel(path = tname, skip = 2, na = '*', guess_max = 30000)

  #last two rows are notes
  njsla <- njsla[1:(nrow(njsla)-2), ]
  njsla
}


#' PARCC column order
#'
#' @param df tidied PARCC dataframe.  called as final step in fetch_parcc when tidy=TRUE
#'
#' @return PARCC df with columns in coherent order
#' @export

parcc_column_order <- function(df) {
  df %>% 
    select(
      testing_year,
      assess_name, 
      test_name,
      grade,
      county_id, county_name,
      district_id, district_name,
      school_id, school_name,
      contains("dfg"),
      subgroup, subgroup_type,
      number_enrolled, number_not_tested, 
      number_of_valid_scale_scores,
      scale_score_mean,
      pct_l1, pct_l2, pct_l3, pct_l4, pct_l5, proficient_above,
      num_l1, num_l2, num_l3, num_l4, num_l5,
      is_state, is_dfg, 
      is_district, is_school, is_charter,
      is_charter_sector,
      is_allpublic
    )
}


#' Process a raw PARCC data file
#' 
#' @description all the logic needed to clean up the raw PARCC files
#'
#' @param parcc_file output of get_raw_parcc
#' @inheritParams get_raw_parcc
#'
#' @return a tbl_df / data rfame
#' @export

process_parcc <- function(parcc_file, end_year, grade, subj) {
  
  parcc_name_vector <- c(
    'county_code', 'county_name', 
    'district_code', 'district_name', 
    'school_code', 'school_name', 
    'dfg', 
    'subgroup_type', 'subgroup', 
    'number_enrolled', 'number_not_tested', 'number_of_valid_scale_scores', 
    'scale_score_mean', 'pct_l1', 'pct_l2', 'pct_l3', 
    'pct_l4', 'pct_l5'
  )
  
  # NJSLA
  if (end_year >= 2019) {
    # dfg dropped in 2018-19
    names(parcc_file) <- parcc_name_vector[c(1:6, 8:18)]
  } else {
    names(parcc_file) <- parcc_name_vector
  }
  
  #make some numeric
  parcc_file$number_enrolled <- as.numeric(parcc_file$number_enrolled)
  parcc_file$number_not_tested <- as.numeric(parcc_file$number_not_tested)
  parcc_file$number_of_valid_scale_scores <- as.numeric(parcc_file$number_of_valid_scale_scores)
  parcc_file$pct_l1 <- as.numeric(parcc_file$pct_l1)
  parcc_file$pct_l2 <- as.numeric(parcc_file$pct_l2)
  parcc_file$pct_l3 <- as.numeric(parcc_file$pct_l3)
  parcc_file$pct_l4 <- as.numeric(parcc_file$pct_l4)
  parcc_file$pct_l5 <- as.numeric(parcc_file$pct_l5)
  parcc_file$scale_score_mean <- as.numeric(parcc_file$scale_score_mean)
  
  #new columns
  parcc_file$testing_year <- end_year
  parcc_file$assess_name <- ifelse(end_year >= 2019, 'NJSLA', 'PARCC')
  parcc_file$grade <- as.character(grade)
  parcc_file$test_name <- subj
  parcc_file <- parcc_file %>%
    rowwise() %>%
    mutate(
      proficient_above = ifelse(
        is.finite(pct_l4),
        sum(pct_l4 + pct_l5, na.rm = TRUE),
        NA_real_
      )
    )
  
  #remove random NA row that has the year and grade only
  parcc_file <- parcc_file %>% filter(!is.na(county_code))
  
  #tag subsets
  parcc_file$is_state <- parcc_file$county_code == 'STATE'
  parcc_file$is_dfg <- parcc_file$county_code == 'DFG'
  parcc_file$is_district = is.na(parcc_file$school_code) & !is.na(parcc_file$district_code)
  parcc_file$is_school = !is.na(parcc_file$school_code)
  parcc_file$is_charter = parcc_file$county_code == '80'

  parcc_file$is_charter_sector <- FALSE
  parcc_file$is_allpublic <- FALSE
  
  # use district_id, etc
  parcc_file <- parcc_file %>%
    rename(
      county_id = county_code,
      district_id = district_code,
      school_id = school_code
    )
  
  # level counts
  parcc_file <- parcc_perf_level_counts(parcc_file)
  
  # column order
  parcc_column_order(parcc_file)
}


#' tidy parcc subgroup
#'
#' @param sv subgroup column from parcc data file
#'
#' @return character vector with consistent subgroup names
#' @export

tidy_parcc_subgroup <- function(sv) {
  
  # 2018 is all proper case
  sv <- toupper(sv)
  
  sv <- gsub("ALL STUDENTS", 'total_population', sv, fixed = TRUE)
  
  sv <- gsub('WHITE', 'white', sv, fixed = TRUE)
  sv <- gsub('AFRICAN AMERICAN', 'black', sv, fixed = TRUE)
  sv <- gsub('ASIAN', 'asian', sv, fixed = TRUE)
  sv <- gsub('HISPANIC', 'hispanic', sv, fixed = TRUE)
  sv <- gsub(
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER|NATIVE HAWAIIAN', 'pacific_islander', 
    sv, fixed = FALSE
  )
  sv <- gsub('AMERICAN INDIAN', 'american_indian', sv, fixed = TRUE)
  sv <- gsub('OTHER', 'other', sv, fixed = TRUE)
  
  sv <- gsub('FEMALE', 'female', sv, fixed = TRUE)
  sv <- gsub('MALE', 'male', sv, fixed = TRUE)
  
  sv <- gsub(
    "STUDENTS WITH DISABLITIES|STUDENTS WITH DISABILITIES", 'special_education', 
    sv, fixed = FALSE
  )
  sv <- gsub("SE ACCOMMODATION", 'sped_accomodations', sv, fixed = TRUE)
  
  sv <- gsub('ECONOMICALLY DISADVANTAGED', 'ed', sv, fixed = TRUE)
  sv <- gsub(
    'NON ECON. DISADVANTAGED|NON-ECON. DISADVANTAGED', 'non_ed', sv, fixed = FALSE
  )
  
  sv <- gsub('ENGLISH LANGUAGE LEARNERS', 'lep_current_former', sv, fixed = TRUE)
  sv <- gsub('CURRENT - ELL', 'lep_current', sv, fixed = TRUE)
  sv <- gsub('FORMER - ELL', 'lep_former', sv, fixed = TRUE)
  
  sv <- gsub('GRADE - other', 'grade_other', sv, fixed = TRUE)
  sv <- gsub('GRADE - 06', 'grade_06', sv, fixed = TRUE)
  sv <- gsub('GRADE - 07', 'grade_07', sv, fixed = TRUE)
  sv <- gsub('GRADE - 08', 'grade_08', sv, fixed = TRUE)
  sv <- gsub('GRADE - 09', 'grade_09', sv, fixed = TRUE)
  sv <- gsub('GRADE - 10', 'grade_10', sv, fixed = TRUE)
  sv <- gsub('GRADE - 11', 'grade_11', sv, fixed = TRUE)
  sv <- gsub('GRADE - 12', 'grade_12', sv, fixed = TRUE)
  
  
  sv
}

#' @title gets and cleans up a PARCC data file file
#' 
#' @description
#' \code{fetch_parcc} is a wrapper around \code{get_raw_parcc} and
#' \code{process_parcc} that gets a parcc file and performs any cleanup.
#' @param tidy clean up the data frame to make it more compatible with 
#' NJASK naming conventions and do some additional calculations?  default is FALSE.
#' @inheritParams get_raw_parcc
#' @export

fetch_parcc <- function(end_year, grade_or_subj, subj, tidy = FALSE) {

  if (end_year >= 2019) {
    p <- get_raw_sla(end_year, grade_or_subj, subj)
  } else {
    p <- get_raw_parcc(end_year, grade_or_subj, subj)
  }
  p <- process_parcc(p, end_year, grade_or_subj, subj)
  
  if (tidy) {
    p$subgroup <- tidy_parcc_subgroup(p$subgroup)
    
    p <- p %>% parcc_perf_level_counts()
  }
  
  p
}


#' Fetch all PARCC results
#'
#' @description convenience function to download and combine all PARCC results
#' into single data frame
#' 
#' @return a data frame with all PARCC results
#' @export

fetch_all_parcc <- function() {
  
  parcc_results <- list()
  
  for (i in c(2015:2019)) {
    #normal grade level tests
    for (j in c(3:8)) {
      for (k in c('ela', 'math')) {
      
        p <- fetch_parcc(end_year = i, grade_or_subj = j, subj = k, tidy = TRUE)
        
        parcc_results[[paste(i, j, k, sep = '_')]] <- p
        
      }
    }
    #hs ela
    
    if (i >= 2019) {
      # 11th grade optional and not reported
      for (j in c(9:10)) {
        p <- fetch_parcc(end_year = i, grade_or_subj = j, subj = 'ela', tidy = TRUE)
        
        parcc_results[[paste(i, j, 'ela', sep = '_')]] <- p
      }
    } else {
      for (j in c(9:11)) {
        p <- fetch_parcc(end_year = i, grade_or_subj = j, subj = 'ela', tidy = TRUE)
        
        parcc_results[[paste(i, j, 'ela', sep = '_')]] <- p
      }
    }

    #specific math tests
    for (j in c('ALG1', 'GEO', 'ALG2')) {
      p <- fetch_parcc(end_year = i, grade_or_subj = j, subj = 'math', tidy = TRUE)
      
      parcc_results[[paste(i, j, 'math', sep = '_')]] <- p
    }
  }
  
  dplyr::bind_rows(parcc_results)
}
almartin82/njschooldata documentation built on Nov. 23, 2023, 1:33 a.m.