R/fetch_nj_assess.R


#' @title determine if an end_year/grade pairing can be downloaded from the state website
#' 
#' @description
#' \code{valid_call} returns a logical value indicating whether a given end_year/grade pairing
#' is valid for assessment data
#' @inheritParams fetch_njask
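#' @examples
#' # an illustrative sketch of the coverage rules encoded below:
#' valid_call(2012, 5)  # TRUE - grades 3-8 and 11 are covered from 2006 through 2014
#' valid_call(2005, 6)  # FALSE - 2004-2005 coverage is limited to grades 3, 4, 8, and 11
#' valid_call(2016, 4)  # FALSE - no NJASK/HSPA data after the 2014 school year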
#' @export

valid_call <- function(end_year, grade) {
  #no assessment data after the 2014 school year -
  #the common core transition started in 2015, so the NJASK is no more
  if(end_year > 2014) {
    valid_call <- FALSE
  #assessment coverage 3:8 from 2006 on.
  #NJASK fully implemented in 2008
  } else if(end_year >= 2006) {
    valid_call <- grade %in% c(3:8, 11)
  } else if (end_year >= 2004) {
    valid_call <- grade %in% c(3, 4, 8, 11)
  } else if (end_year < 2004) {
    valid_call <- FALSE
  }
  
  return(valid_call)
}



#' @title call the correct \code{fetch} function for normal assessment years
#' 
#' @description for end_years 2008-2014, this function grabs the NJASK for grades 3-8 and the
#' HSPA for grade 11
#' @inheritParams fetch_njask
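#' @examples
#' \dontrun{
#' # illustrative only - these calls download raw files from the state website:
#' standard_assess(2012, 7)   # dispatches to fetch_njask(2012, 7)
#' standard_assess(2012, 11)  # dispatches to fetch_hspa(2012)
#' }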
#' @export

standard_assess <- function(end_year, grade) {
  if (grade %in% c(3:8)) {
    assess_data <- fetch_njask(end_year, grade)
  } else if (grade == 11) {
    assess_data <- fetch_hspa(end_year)
  } else {
    stop("standard_assess only covers grades 3-8 (NJASK) and grade 11 (HSPA)")
  }
  
  return(assess_data)
}



#' @title a simplified interface into NJ assessment data
#' 
#' @description this is the workhorse function.  given an end_year and a grade (valid years are 2004-2014),
#' \code{fetch_old_nj_assess} will call the appropriate function, process the raw 
#' text file, and return a data frame.  \code{fetch_old_nj_assess} is a wrapper around 
#' all the individual assessment functions (NJASK, HSPA, etc.), abstracting away the 
#' complexity of finding the right file location/layout.
#' @param end_year a school year.  end_year is the end of the academic year - e.g., the 2013-14
#' school year is end_year 2014.  valid values are 2004-2014.
#' @param grade a grade level.  valid values are 3, 4, 5, 6, 7, 8, and 11
#' @param tidy if TRUE, takes the unwieldy, inconsistent wide data and normalizes it into a 
#' long, tidy data frame with ~20 headers - constants (school/district name and code),
#' subgroup (all the NCLB subgroups), and test_name (LAL, math, etc.).
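#' @examples
#' \dontrun{
#' # illustrative only - both calls download raw files from the state website:
#' njask_gr5_2012 <- fetch_old_nj_assess(end_year = 2012, grade = 5)
#' gepa_gr8_2005 <- fetch_old_nj_assess(end_year = 2005, grade = 8, tidy = TRUE)
#' }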
#' @export

fetch_old_nj_assess <- function(end_year, grade, tidy = FALSE) {
  #only allow valid calls
  valid_call(end_year, grade) %>%
    ensure_that(
      all(.) ~ "invalid grade/end_year parameter passed")
  
  #2008 and later share the same grade coverage
  #some of the layouts are funky, but the fetch_njask function covers that.
  if (end_year >= 2008) {
    assess_data <- standard_assess(end_year, grade)
    
    if (grade == 11) {
      assess_name <- 'HSPA'
    } else {
      assess_name <- 'NJASK'
    }
    
  #2006 and 2007: NJASK 3rd-7th, GEPA 8th, HSPA 11th
  } else if (end_year %in% c(2006, 2007)) {
    if (grade %in% c(3:7)) {
      assess_data <- standard_assess(end_year, grade)
      assess_name <- 'NJASK'
    } else if (grade == 8) {
      assess_data <- fetch_gepa(end_year)
      assess_name <- 'GEPA'
    } else if (grade == 11) {
      assess_data <- fetch_hspa(end_year)
      assess_name <- 'HSPA'
    }
    
  #2004 and 2005:  NJASK 3rd & 4th, GEPA 8th, HSPA 11th
  } else if (end_year %in% c(2004, 2005)) {
    if (grade %in% c(3:4)) {
      assess_data <- standard_assess(end_year, grade)  
      assess_name <- 'NJASK'
    } else if (grade == 8) {
      assess_data <- fetch_gepa(end_year)
      assess_name <- 'GEPA'
    } else if (grade == 11) {
      assess_data <- fetch_hspa(end_year)
      assess_name <- 'HSPA'
    }
    
  } else {
    #if we ever reach this block, there's a problem with our `valid_call()` function
    stop("unable to match your grade/end_year parameters to the appropriate function.")
  }
 
  if (tidy) assess_data <- tidy_nj_assess(assess_name, assess_data)
  
  return(assess_data)
}



#' @title tidies NJ assessment data
#' 
#' @description
#' \code{tidy_nj_assess} is a utility/internal function that takes the somewhat messy/inconsistent 
#' assessment headers and returns a tidy data frame.
#' @param assess_name one of 'NJASK', 'GEPA', or 'HSPA'
#' @param df a processed data frame (e.g., the output of process_njask)
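#' @examples
#' \dontrun{
#' # illustrative only - 'njask_wide' is a hypothetical data frame returned by
#' # one of the fetch_* functions:
#' njask_long <- tidy_nj_assess('NJASK', njask_wide)
#' }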
#' @export

tidy_nj_assess <- function(assess_name, df) {
  
  logistical_columns <- c("CDS_Code", "County_Code/DFG/Aggregation_Code", "District_Code", 
    "School_Code", "County_Name", "District_Name", "School_Name", 
    "DFG", "Special_Needs", "Testing_Year", "Grade", "RECORD_KEY", "County_Code", 
    "DFG_Flag", "Special_Needs_(Abbott)_district_flag", "Grade_Level", "Test_Year"
  )
  
  #by population
  logistical_mask <- names(df) %in% logistical_columns
  total_population_mask <- grepl('TOTAL_POPULATION', names(df))
  general_education_mask <- grepl('GENERAL_EDUCATION', names(df))
  special_education_mask <- grepl('SPECIAL_EDUCATION(?!_WITH_ACCOMMODATIONS)', names(df), perl = TRUE)
  
  lep_current_former_mask <- grepl('LIMITED_ENGLISH_PROFICIENT_current_and_former', names(df)) |
    grepl('LIMITED_ENGLISH_PROFICIENT_Current_and_Former_LEP', names(df)) | 
    grepl('LIMITED_ENGLISH_PROFICIENT_Current_and_', names(df)) |
    grepl('LIMITED_ENGLISH_PROFICIENT_Current_+', names(df), fixed = TRUE) |
    #weirdly, unmarked LEP means 'current and former'
    grepl('(?<!CURRENT_|FORMER_)LIMITED_ENGLISH_PROFICIENT(?!_Current|_current|_Former)', names(df), perl = TRUE)
  
  lep_current_mask <- grepl('CURRENT_LIMITED_ENGLISH_PROFICIENT', names(df)) |
    grepl('LIMITED_ENGLISH_PROFICIENT_Current_LEPC', names(df)) |
    grepl('LIMITED_ENGLISH_PROFICIENT_Current(?!_and|_\\+)', names(df), perl = TRUE)

  lep_former_mask <- grepl('FORMER_LIMITED_ENGLISH_PROFICIENT', names(df)) |
    grepl('LIMITED_ENGLISH_PROFICIENT_Former_LEPF', names(df)) |
    grepl('LIMITED_ENGLISH_PROFICIENT_Former', names(df))
  
  female_mask <- grepl('FEMALE', names(df))
  male_mask <- grepl('(?<!FE)MALE', names(df), perl = TRUE)
  migrant_mask <- grepl('(?<!NON-)MIGRANT', names(df), perl = TRUE)
  nonmigrant_mask <- grepl('NON-MIGRANT', names(df))
  white_mask <- grepl('WHITE', names(df))
  black_mask <- grepl('BLACK', names(df))
  asian_mask <- grepl('ASIAN', names(df))
  pacific_islander_mask <- grepl('PACIFIC_ISLANDER', names(df))
  hispanic_mask <- grepl('HISPANIC', names(df))
  american_indian_mask <- grepl('AMERICAN_INDIAN', names(df))
  other_mask <- grepl('OTHER', names(df))
  ed_mask <- grepl('(?<!NON-)ECONOMICALLY_DISADVANTAGED', names(df), perl = TRUE)
  non_ed_mask <- grepl('NON-ECONOMICALLY_DISADVANTAGED', names(df))
  sped_accomodations_mask <- grepl('SPECIAL_EDUCATION_WITH_ACCOMMODATIONS', names(df))
  #irregular
  not_exempt_from_passing_mask <- grepl('NOT_EXEMPT_FROM_PASSING', names(df))
  iep_exempt_from_passing_mask <- grepl('IEP_EXEMPT_FROM_PASSING', names(df))
  iep_exempt_from_taking_mask <- grepl('IEP_EXEMPT_FROM_TAKING', names(df))
  lep_exempt_lal_only_mask <- grepl('LEP_EXEMPT_(LAL_Only)', names(df), fixed = TRUE) |
    grepl('LEP_EXEMPT_LAL_Only', names(df), fixed = TRUE)
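  #a quick sketch of how the perl lookaround masks above behave, using hypothetical
  #column names (not executed here):
  # grepl('SPECIAL_EDUCATION(?!_WITH_ACCOMMODATIONS)',
  #       c('SPECIAL_EDUCATION_LAL_Proficient_Percentage',
  #         'SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_LAL_Proficient_Percentage'),
  #       perl = TRUE)
  # returns TRUE FALSE - the lookahead keeps the plain special education mask from
  # also claiming the 'with accommodations' columns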

  demog_masks <- rbind(logistical_mask, total_population_mask, general_education_mask, 
    special_education_mask, lep_current_former_mask, lep_current_mask, lep_former_mask, 
    female_mask, male_mask, migrant_mask, nonmigrant_mask, white_mask, black_mask, 
    asian_mask, pacific_islander_mask, hispanic_mask, american_indian_mask, other_mask, 
    ed_mask, non_ed_mask, sped_accomodations_mask, not_exempt_from_passing_mask, 
    iep_exempt_from_passing_mask, iep_exempt_from_taking_mask, lep_exempt_lal_only_mask
  ) %>% 
    as.data.frame()
  
  demog_test <- demog_masks %>%
    dplyr::summarise_each(dplyr::funs(sum)) %>% 
    unname() %>% unlist()
  
  if (!all(demog_test == 1)) {
    print(names(df)[!demog_test == 1])
    print(demog_test)
  }
  
  #by subject
  language_arts_mask <- grepl('LANGUAGE_ARTS', names(df), fixed = TRUE) | grepl('_ELA$', names(df)) |
    grepl('_LAL_', names(df)) | grepl('_LAL$', names(df))
  mathematics_mask <- grepl('MATHEMATICS', names(df), fixed = TRUE)
  science_mask <- grepl('SCIENCE', names(df), fixed = TRUE)
  #only number enrolled without subject (some years they did not specify)
  number_enrolled_mask <- grepl('Number_Enrolled', names(df), fixed = TRUE) & 
    !language_arts_mask & !mathematics_mask & !science_mask
  
  subj_masks <- rbind(logistical_mask, language_arts_mask, mathematics_mask, 
    science_mask, number_enrolled_mask) %>% 
    as.data.frame()
  
  subj_test <- subj_masks %>%
    dplyr::summarise_each(dplyr::funs(sum)) %>% 
    unname() %>% unlist()

  if (!all(subj_test == 1)) {
    print(names(df)[!subj_test == 1])  
    names(subj_masks) <- names(df)
    print(subj_masks[, !subj_test == 1])
  }

  subgroups <- c('total_population', 'general_education', 'special_education', 
    'lep_current_former', 'lep_current', 'lep_former', 'female', 'male', 'migrant', 
    'nonmigrant', 'white', 'black', 'asian', 'pacific_islander', 'hispanic', 
    'american_indian', 'other', 'ed', 'non_ed', 'sped_accomodations', 'not_exempt_from_passing',
    'iep_exempt_from_passing', 'iep_exempt_from_taking', 'lep_exempt_lal_only')
  
  result_list <- list()
  
  tidy_col <- function(mask, nj_df) {
    if (sum(mask) > 1) stop("tidying assessment data matched more than one column")
    if (all(mask == FALSE)) {
      out <- rep(NA, nrow(nj_df))
    } else {
      out <- nj_df[, mask]
    }
    return(out)
  }
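  #e.g. (hypothetical): with a 3-row nj_df, a mask matching no columns yields
  #c(NA, NA, NA); a mask matching exactly one column returns that column as-is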
  
  testing_year <- grepl('(Test_Year|Testing_Year)', names(df))
  grade <- grepl('(Grade|Grade_Level)', names(df))
  county_code <- grepl('County_Code', names(df), fixed = TRUE)
  district_code <- grepl('District_Code', names(df), fixed = TRUE)
  school_code <- grepl('School_Code', names(df), fixed = TRUE)
  district_name <- grepl('District_Name', names(df), fixed = TRUE)
  school_name <- grepl('School_Name', names(df), fixed = TRUE)
  dfg <- grepl('^DFG', names(df), perl = TRUE)
  special_needs <- grepl('^Special_Needs', names(df), perl = TRUE)
  
  constant_df <- data.frame(
    assess_name = assess_name,
    testing_year = tidy_col(testing_year, df) %>% as.integer(),
    grade = tidy_col(grade, df),
    county_code = tidy_col(county_code, df) %>% as.character(),
    district_code = tidy_col(district_code, df),
    school_code = tidy_col(school_code, df),
    district_name = tidy_col(district_name, df),
    school_name = tidy_col(school_name, df),
    dfg = tidy_col(dfg, df),
    special_needs = tidy_col(special_needs, df),
    stringsAsFactors = FALSE
  )
  
  iters <- 1
  
  for (i in subgroups) {
    subgroup_mask <- paste0(i, '_mask') %>% get()
    if (!any(subgroup_mask)) next
    
    for (j in c('language_arts', 'mathematics', 'science')) {
      subj_mask <- paste0(j, '_mask') %>% get()
      
      #skip when no data
      if (!any(subj_mask)) next
      
      this_df <- df[, subgroup_mask & subj_mask]

      this_tidy <- cbind(
        constant_df,
        data.frame(
          subgroup = i,
          test_name = j,

          number_enrolled = tidy_col(grepl('Number_Enrolled', names(this_df)), this_df),
          number_not_present = tidy_col(grepl('Number_Not_Present', names(this_df)), this_df),
          number_of_voids = tidy_col(grepl('Number_of_Voids', names(this_df)), this_df),
          number_of_valid_classifications = tidy_col(grepl('Number_of_Valid_Classifications', names(this_df)), this_df),
          number_apa = tidy_col(grepl('Number_APA', names(this_df)), this_df),
          number_valid_scale_scores = tidy_col(grepl('Number_of_Valid_Scale_Scores', names(this_df)), this_df),
          partially_proficient = tidy_col(grepl('Partially_Proficient_Percentage', names(this_df)), this_df),
          proficient = tidy_col(grepl('(?<!Partially_|Advanced_)Proficient_Percentage', names(this_df), perl = TRUE), this_df),
          advanced_proficient = tidy_col(grepl('Advanced_Proficient_Percentage', names(this_df)), this_df),
          scale_score_mean = tidy_col(grepl('Scale_Score_Mean', names(this_df)), this_df),
          stringsAsFactors = FALSE
        )
      )
      
      result_list[[iters]] <- this_tidy      
      iters <- iters + 1
    }
  }
  
  return(dplyr::bind_rows(result_list))
}


#' @title nj_coltype_parser
#' 
#' @description turns layout data types into the compact column-type string required by read_fwf
#' @param datatypes vector of datatypes (from a layout df)
#' @return a character string of the types, for read_fwf
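#' @examples
#' # a small sketch with made-up layout data types:
#' nj_coltype_parser(c("Text", "Text", "Integer", "Decimal"))
#' # returns "ccid"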
#' @export

nj_coltype_parser <- function(datatypes) {
  datatypes <- ifelse(datatypes == "Text", 'c', datatypes)
  datatypes <- ifelse(datatypes == "Integer", 'i', datatypes)
  datatypes <- ifelse(datatypes == "Decimal", 'd', datatypes)
  datatypes <- datatypes %>% unlist() %>% unname()
 
  paste(datatypes, collapse = '')
}


#' @title common_fwf_req
#' 
#' @description common fwf logic across various assessment types.  DRY.
#' @param url file location
#' @param layout the fixed width layout data frame to use
#' @return a data frame of the parsed fixed width file
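#' @examples
#' \dontrun{
#' # illustrative only - 'njask_url' and 'njask_layout' are hypothetical; the layout
#' # data frame needs field_start_position, field_end_position, final_name, and
#' # data_type columns:
#' assess_df <- common_fwf_req(njask_url, njask_layout)
#' }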
#' @export

common_fwf_req <- function(url, layout) {
  #got burned by bad layouts.  read in the raw file
  #this will take extra time, but it is worth it.

  raw_fwf <- readLines(url)
  raw_fwf <- iconv(raw_fwf, "LATIN2", "UTF-8")
  line_widths <- nchar(raw_fwf)

  #if everything is consistent, great.  if the fwf is ragged, trim trailing whitespace.  
  if (any(line_widths < max(line_widths))) {
    raw_fwf <- raw_fwf %>% gsub("[[:space:]]*$", "", .)
  }
  
  #check that incoming response (when cleaned) is of consistent length.
  if (length(unique(nchar(raw_fwf))) != 1) {
    warning("the fixed width input file is not fixed - rows are of different length.")
    warning("truncating rows that are too wide, and padding rows that are too short...")    
  }
  
  #sometimes the raw response is too short.  that wreaks havoc with read_fwf.
  #additionally, some layouts call for data that really isn't there
  #(aka science).
  #right pad every row to the full extent of the array OR the layout
  max_extent <- max(nchar(raw_fwf), max(layout$field_end_position))
  raw_fwf <- sprintf(paste0('%-', max_extent, 's'), raw_fwf)
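  #e.g. sprintf('%-8s', 'NJ001') yields "NJ001   " - every row comes out exactly
  #max_extent characters wide, so read_fwf sees a truly fixed width file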
  
  #read_fwf
  df <- readr::read_fwf(
    file = raw_fwf %>% paste(collapse = '\n'),
    col_positions = readr::fwf_positions(
      start = layout$field_start_position,
      end = layout$field_end_position,
      col_names = layout$final_name
    ),
    col_types = nj_coltype_parser(layout$data_type),
    na = "*",
    progress = TRUE
  )
  
  if (!nrow(df) == length(raw_fwf)) {
    paste('read_fwf is', nrow(df), 'lines') %>% print()
    paste('raw response', length(raw_fwf), 'lines') %>% print()
    stop("read_fwf and readlines don't agree on size of df.  probably a layout error.")
  }

  df
}
  