R/report_card.R

Defines functions matric_column_order id_rc_aggs enrich_matric_counts enrich_rc_enrollment extract_rc_enrollment clean_rc_enrollment extract_rc_cds extract_rc_AP extract_rc_college_matric extract_rc_SAT get_merged_rc_database get_rc_databases download_and_clean_pr get_standalone_rc_database get_one_rc_database

Documented in clean_rc_enrollment download_and_clean_pr enrich_matric_counts enrich_rc_enrollment extract_rc_AP extract_rc_cds extract_rc_college_matric extract_rc_enrollment extract_rc_SAT get_merged_rc_database get_one_rc_database get_rc_databases get_standalone_rc_database id_rc_aggs matric_column_order

#' Get Raw Report Card Database
#'
#' @param end_year a school year.  end_year is the end of the academic year - eg 2014-15
#' school year is end_year '2015'.  valid values are 2003 to 2019
#'
#' @return list of data frames
#' @export

get_one_rc_database <- function(end_year) {
  
  if (end_year >= 2017) {
    df_list <- get_merged_rc_database(end_year)
  } else if (end_year >= 2003) {
    df_list <- get_standalone_rc_database(end_year)
  } else {
    stop('Year not covered by NJ report card data.')
  }
  
  df_list
}


#' Get Standalone Raw Report Card Database
#'
#' @param end_year a school year.  end_year is the end of the academic year - eg 2014-15
#' school year is end_year '2015'.  valid values are 2003 to 2016
#'
#' @return list of data frames

get_standalone_rc_database <- function(end_year) {
  
  pr_urls <- list(
    "2016" = "https://rc.doe.state.nj.us/ReportsDatabase/15-16/PerformanceReports.xlsx",
    "2015" = "https://nj.gov/education/schoolperformance/archive/201415/2015PRDATABASE.xlsx",
    "2014" = "https://nj.gov/education/schoolperformance/archive/201314/2014%20performance%20report%20database.xlsx",
    "2013" = "https://nj.gov/education/schoolperformance/archive/201213/nj%20pr13%20database.xlsx",
    "2012" = "https://nj.gov/education/schoolperformance/archive/201112/nj%20pr12%20database.xlsx"
    
    # 2003-2011 report cards deleted
    # "2011" = "http://www.nj.gov/education/reportcard/2011/database/RC11%20database.xls",
    # "2010" = "http://www.nj.gov/education/reportcard/2010/database/RC10%20database.xls",
    # "2009" = "http://www.nj.gov/education/reportcard/2009/database/RC09%20database.xls",
    # "2008" = "http://www.nj.gov/education/reportcard/2008/database/nj_rc08.xls",
    # "2007" = "http://www.nj.gov/education/reportcard/2007/database/nj_rc07.xls",
    # "2006" = "http://www.nj.gov/education/reportcard/2006/database/nj_rc06_data.xls",
    # "2005" = "http://www.nj.gov/education/reportcard/2005/database/NJ_RC05_DATA.XLS",
    # "2004" = "http://www.nj.gov/education/reportcard/2004/database/nj_rc04_data.xls",
    # "2003" = "http://www.nj.gov/education/reportcard/2003/database/nj_rc03_data.xls"
  )
  
  #temp file for downloading
  file_exts <- c(
    rep('xlsx', 5),
    rep('xls', 9)
  )
  tmp_pr = tempfile(fileext = paste0('.', file_exts[1 + (2016-end_year)]))
  
  pr_list <- download_and_clean_pr(tmp_pr,  pr_urls[[as.character(end_year)]], end_year)
  
  pr_list
}


#' Download and clean performance report data
#'
#' @param tmp_pr path to tempfile
#' @param url url to download
#' @param end_year report year
#'
#' @return list of dataframes

download_and_clean_pr <- function(tmp_pr, url, end_year) {
  #download to temp
  download.file(url, destfile = tmp_pr, mode = "wb")
  
  #get the sheet names
  sheets_pr <- readxl::excel_sheets(tmp_pr) %>%
    clean_name_vector()
  
  #get all the data.frames as list
  pr_list <- map(
    .x = c(1:length(sheets_pr)),
    .f = function(.x) {
      readxl::read_excel(
        tmp_pr, 
        sheet = .x,
        na = c('NA', 'N', '*', '**')
      ) %>%
      mutate(end_year = end_year) %>%
      janitor::clean_names()
    }
  )
  
  #rename each pr
  names(pr_list) <- sheets_pr
  
  pr_list
}

#' Get multiple RC databases
#'
#' @param end_year_vector vector of years.  Current valid values are 2003 to 2018. 
#'
#' @return a list of dataframes
#' @export

get_rc_databases <- function(end_year_vector = c(2003:2018)) {

  all_prs <- map(
    .x = end_year_vector,
    .f = function(.x) {
      print(.x)
      get_one_rc_database(.x)
    }
  )
  
  names(all_prs) <- end_year_vector
  
  all_prs
}


#' Combines school and district Performance Reports for 2017-on, when two files were released.
#'
#' @param end_year end of the academic year.  Valid values are 2017, 2018, 2019
#'
#' @return list of dataframes
#' @export

get_merged_rc_database <- function(end_year) {
  
  pr_urls <- list(
    "sch_2019" = "https://rc.doe.state.nj.us/ReportsDatabase/PerformanceReports.xlsx",
    "dist_2019" = "https://rc.doe.state.nj.us/ReportsDatabase/DistrictPerformanceReports.xlsx",
    "sch_2018" = "https://rc.doe.state.nj.us/ReportsDatabase/17-18/PerformanceReports.xlsx",
    "dist_2018" = "https://rc.doe.state.nj.us/ReportsDatabase/17-18/DistrictPerformanceReports.xlsx",
    "sch_2017" = "https://rc.doe.state.nj.us/ReportsDatabase/16-17/PerformanceReports.xlsx",
    "dist_2017" = "https://rc.doe.state.nj.us/ReportsDatabase/16-17/DistrictPerformanceReports.xlsx"
  )
  
  # get district and school df
  dist <- pr_urls[[paste0('dist_', end_year)]]
  sch <- pr_urls[[paste0('sch_', end_year)]]
  
  tmp_pr1 = tempfile(fileext = 'xlsx')
  tmp_pr2 = tempfile(fileext = 'xlsx')
  dist_pr = download_and_clean_pr(tmp_pr1, dist, end_year)
  sch_pr = download_and_clean_pr(tmp_pr2, sch, end_year)
  
  # exclude bad / duplicate data
  sch_pr <- sch_pr[!names(sch_pr) %in% c('per_pupil_expenditures')]

  # tag source
  dist_pr <- map(dist_pr, ~.x %>% mutate(source_file='district'))
  sch_pr <- map(sch_pr, ~.x %>% mutate(source_file='school'))
  
  # add logical tags
  dist_pr <- map(
    dist_pr, 
    function(.x) {
      # would it be too much to ask that the same code be used for State data 
      # *inside the same file*?  It would.
      state_codes <- c('STATE', 'State')
      
      if ('district_code' %in% names(.x)) {
        .x %>%
          mutate(
            school_code = '999',
            school_name = ifelse(!district_code %in% state_codes, 'District Total', ''),
            is_district = ifelse(!district_code %in% state_codes, TRUE, FALSE),
            is_school = FALSE
          )
      } else {
        .x
      }
    }
  )
  
  sch_pr <- map(
    sch_pr, 
    function(.x) {
      if ('school_code' %in% names(.x)) {
        .x %>%
          mutate(
            is_district = FALSE,
            is_school = TRUE
          )
      } else {
        .x
      }
    }
  )
  
  # combine if match
  joint <- names(sch_pr)[names(sch_pr) %in% names(dist_pr)]
  sch_only <- names(sch_pr)[!names(sch_pr) %in% names(dist_pr)]
  dist_only <- names(dist_pr)[!names(dist_pr) %in% names(sch_pr)]
  
  combined <- map(
    joint,
    function(.x) {
      this_dist_df <- dist_pr[[.x]]
      this_sch_df <- sch_pr[[.x]]
      bindable <- compare_df_cols_same(this_dist_df, this_sch_df, verbose=FALSE)
      if (!bindable) {
        # which don't match?
        bad_cols <- compare_df_cols(
          this_dist_df, 
          this_sch_df, 
          return = 'mismatch', 
          bind_method = 'bind_rows'
        )
        # make character if not matched
        this_dist_df <- this_dist_df %>% mutate_at(bad_cols$column_name, as.character)
        this_sch_df <- this_sch_df %>% mutate_at(bad_cols$column_name, as.character)
      }
      bind_rows(this_dist_df, this_sch_df)
    }
  )
  names(combined) <- joint
  
  # only school and only dist
  sch_dfs <- map(sch_only, ~sch_pr[[.x]])
  names(sch_dfs) <- sch_only
  
  dist_dfs <- map(dist_only, ~dist_pr[[.x]])
  names(dist_dfs) <- dist_only
  
  # combine list of prs and return
  c(combined, sch_dfs, dist_dfs)
}


#' Extract Report Card SAT School Averages
#'
#' @param list_of_prs output of get_rc_databases (ie, a list where each element is)
#' a list of data.frames
#' @param school_only some years have district average results, not just school-level. 
#' if school_only, return only school data.  default is TRUE
#' @param cds_identifiers add the county, district and school name?  default is TRUE
#'
#' @return data frame with all years of SAT School Averages present in the input
#' @export

extract_rc_SAT <- function(
  list_of_prs, school_only = TRUE, cds_identifiers = TRUE
) {
  
  all_sat <- map(
    .x = list_of_prs,
    .f = function(.x) {
      #finds tables that have SAT data
      sat_tables <- grep('sat', names(.x), value = TRUE)
      #excludes some tables in years where multiple tables match 'sat'
      sat_tables <- sat_tables[!grepl("participation|1550", sat_tables)]
      df <- .x %>% extract2(sat_tables)
      
      #reshapes the data for some years where it was reported in 'long' format
      if ('test' %in% names(df)) {
        
        names(df) <- gsub('school_avg', 'school_mean', names(df))
        names(df) <- gsub('bt_pct|schoolwide_benchmark', 'school_benchmark', names(df))
        
        df <- df %>%
          filter(test == 'SAT')
        
        df$subject <- gsub('Math', 'math', df$subject)
        df$subject <- gsub('Reading and Writing', 'reading', df$subject)
        
        df <- df %>%
          select(county_code, district_code, school_code, 
                 end_year, test, subject, school_mean)
        
        df <- reshape2::dcast(
          df, 
          county_code + district_code + school_code + end_year + test ~ subject,
          value.var = 'school_mean'
        )
      }
      
      df <- clean_cds_fields(df)
      
      #2 digit years to 4 digit YYYY years
      if ('year' %in% names(df)) {
        df <- rc_year_matcher(df)
      }
      
      #filters out district results and only returns schools
      if ('level' %in% names(df) & school_only) {
        df <- df %>% filter(level == 'S')  
      }
      
      #cleans variable names
      names(df) <- gsub('mmean|mathematics', 'math', names(df))
      names(df) <- gsub('vmean|critical_reading', 'reading', names(df))
      
      df <- df %>%
        select(county_code, district_code, school_code, end_year, math,
               reading)
      
      #cleans up suppression and NA codes
      df$math <- gsub('N', NA_character_, df$math, fixed = TRUE) 
      df$math <- gsub('N/A', NA_character_, df$math, fixed = TRUE) 
      df$math <- gsub('*', NA_character_, df$math, fixed = TRUE) 
      df$math <- df$math %>% as.numeric()
      
      df$reading <- gsub('N', NA_character_, df$reading, fixed = TRUE)
      df$reading <- gsub('N/A', NA_character_, df$reading, fixed = TRUE)
      df$reading <- gsub('*', NA_character_, df$reading, fixed = TRUE)
      df$reading <- df$reading %>% as.numeric()
      
      df
    }
  ) 
  
  out <- bind_rows(all_sat)
  
  if (cds_identifiers) {
    all_cds <- extract_rc_cds(list_of_prs)
    out <- out %>%
      left_join(
        all_cds, 
        by = c('county_code', 'district_code', 'school_code', 'end_year')
      )
  }
  
  out
}


#' Extract Report Card Matriculation Rates
#'
#' @inheritParams extract_rc_SAT
#' 
#' @return data frame with all the years of 4 year and 2 year college matriculation data
#' present in in the input
#' @export

extract_rc_college_matric <- function(
  list_of_prs, type = '16 month', school_only = TRUE, cds_identifiers = TRUE
) {
  
  all_matric <- map(
    list_of_prs,
    function(.x) {
      
      matric_tables <- grep('postgrad|post_sec|postsecondary', names(.x), value = TRUE)
      matric_tables <- matric_tables[!grepl("summary", matric_tables)]
      
      # type does nothing when year < 2017
      if (type == '16 month') {
        matric_tables <- matric_tables[!grepl("fall", matric_tables)]
      } else if (type == '12 month') { 
        matric_tables <- matric_tables[!grepl("16mos|16_mos", matric_tables)]
      } else {
        stop("type should be one of {\'16 month\', \'12 month\'")
      }
      
      
      #2011 didn't include postsec because :shrug:
      if (!is_empty(matric_tables)) {
        df <- .x %>% extract2(matric_tables)
      } else {
        return(NULL)
      }
      
      #pre-2010 they included longitudinal data in the table
      #filter to just the matching year
      if ('year' %in% names(df)) {
        df <- rc_year_matcher(df)
      }
      #they also reported school and district together
      if ('level' %in% names(df) & school_only) {
        df <- df %>% filter(level == 'S')  
      }
      
      df <- clean_cds_fields(df)
      
      #specific field cleaning for college matric
      names(df) <- gsub(
        'colleg4|enroll_4yr_percent|postsec_enrolled_4yr|percent_in4years|post_sec_pct|enrolled4yr|postsec_enrolled_4year|enrolled4year', 
        'enroll_4yr', names(df)
      )
      names(df) <- gsub(
        'colleg2|enroll_2yr_percent|postsec_enrolled_2yr|percent_in2years|enrolled2yr|postsec_enrolled_2year|enrolled2year',
        'enroll_2yr', names(df)
      )
      names(df) <- gsub(
        'grad_percent|post_sec_enrolled_percent|percent_enrolled|postsec_enrolled_percent|enrolled_percent', 
        'enroll_any', names(df)
      )
      names(df) <- gsub('sub_group|student_group', 'subgroup', names(df))
      
      #clean up messy data vectors, suppression codes, etc
      if ('enroll_any' %in% names(df)) {
        df$enroll_any <- rc_numeric_cleaner(df$enroll_any)
      }
      if ('enroll_2yr' %in% names(df)) {
        df$enroll_2yr <- rc_numeric_cleaner(df$enroll_2yr)
      }
      if ('enroll_4yr' %in% names(df)){
        df$enroll_4yr <- rc_numeric_cleaner(df$enroll_4yr)
      }
      
      #from 2015 onward enroll_4yr, enroll_2yr pct of college-going, not pct of grade
      # it looks liks enroll_4yr, enroll_2yr are always percents of enrollees?
      # this_year <- df$end_year %>% unique()
      # if (this_year >= 2015) {
      #   df <- df %>%
      #     mutate(enroll_4yr = enroll_4yr * (enroll_any/100))
      # }
      
      #if there's no subgroup field, implicitly assume that means Schoolwide
      if (!'subgroup' %in% names(df)) {
        df <- df %>%
          mutate(subgroup = 'Total Population')
      }
      
      #make subgroups consistent
      df$subgroup <- gsub('African American', 'Black', df$subgroup)
      # above changes "Black or African American" to "Black or Black"
      df$subgroup <- gsub('Black or Black', 'Black', df$subgroup)
      df$subgroup <- gsub('Students with Disability', 'Students With Disabilities', df$subgroup)
      df$subgroup <- gsub('Schoolwide', 'Total Population', df$subgroup)
      df$subgroup <- gsub('Districtwide', 'Total Population', df$subgroup)
      #df$subgroup <- gsub('Statewide', 'Total Population', df$subgroup)
      df$subgroup <- gsub('Limited English Proficient Students', 'English Language Learners', df$subgroup)
      
      df <- df %>%
        filter(subgroup != "Statewide") %>%
        select(
          one_of('county_code', 'district_code', 'school_code', 
                 'end_year', 'subgroup', 'enroll_any', 'enroll_2yr', 'enroll_4yr')
        )
      
      df
    }
  )
  
  out <- bind_rows(all_matric)
  
  if (cds_identifiers) {
    all_cds <- extract_rc_cds(list_of_prs)
    out <- out %>%
      left_join(
        all_cds, 
        by = c('county_code', 'district_code', 'school_code', 'end_year')
      )
  }
  
  out %>%
    select(
      one_of(
        'county_code', 'county_name',
        'district_code', 'district_name',
        'school_code', 'school_name',
        'end_year', 'subgroup',
        'enroll_any', 'enroll_4yr', 'enroll_2yr'
      )
    )
}


#' Extract Report Card Advanced Placement Data
#'
#' @inheritParams extract_rc_SAT
#'
#' @return data frame with all the years of AP participation and AP achievement
#' present in in the input
#' @export

extract_rc_AP <- function(list_of_prs, school_only = TRUE, cds_identifiers = TRUE) {
  
  all_ap <- map(
    list_of_prs,
    function(.x) {
      ap_sum_tables <- grep(
        'ap_sum\\b|ap_ib_sum|apib_test_performance|apib_coursework_part_perf', 
        names(.x), 
        value = TRUE
      )
      ap_tables <- grep('ap\\b|ap_ib\\b|apib_advanced_course_participation|apib_coursework_part_perf|ap03|apnew', names(.x), value = TRUE)
      
      #get the relevant table
      df <- .x %>% extract2(ap_sum_tables)
      df2 <- .x %>% extract2(ap_tables)
      
      #clean up cds fields
      df <- clean_cds_fields(df)
      
      #pre-2010 they included longitudinal data in the table
      if ('year' %in% names(df)) {
        df <- rc_year_matcher(df)
      }
      #they also reported school and district together
      if ('level' %in% names(df) & school_only) {
        df <- df %>% filter(level == 'S')  
      }
      
      #pct tested
      names(df) <- gsub(
        'pctest|perc_stud_ap_ib_score',
        'pct_tested_ap_ib', names(df)
      )
      #this data looks very different across years
      if (ap_tables == 'apib_advanced_course_participation') {
        df2 <- df2 %>% 
          filter(students_taking == 'One or More Test') %>%
          select(county_code, district_code, school_code, school_participation) %>%
          rename(
            pct_tested_ap_ib = school_participation
          )
        
        df <- df %>%
          left_join(df2, by = c('county_code', 'district_code', 'school_code'))
      }
      if (ap_tables == 'apib_coursework_part_perf') {
        df2 <- df2 %>% 
          filter(report_category == 'StudentsTakingInOneOrMoreAPorIBExam') %>%
          select(county_code, district_code, school_code, school_percent) %>%
          rename(
            pct_tested_ap_ib = school_percent
          )
        
        df <- df %>%
          left_join(df2, by = c('county_code', 'district_code', 'school_code'))
      }    
      
      #pct 3 or higher
      names(df) <- gsub(
        'perc_stud_ap_score_3_above1|ap_ib_test_score_3|perc_stud_ap_score_3_above|ap_3_ib4_4',
        'pct_ap_scoring_3', names(df)
      )
      
      if (ap_tables == 'apib_coursework_part_perf') {
        df <- df %>%
          filter(report_category == 'StudentsWithOneOrMoreExamsWithAScoreOfAtLest3AP4IBExams') %>%
          rename(
            pct_ap_scoring_3 = school_percent
          )
      }  
      
      df <- df %>%
        select(one_of(
          'county_code', 'district_code', 'school_code', 
          'pct_ap_scoring_3', 'pct_tested_ap_ib', 'end_year'
        ))
      
      if ('pct_ap_scoring_3' %in% names(df)) {
        df$pct_ap_scoring_3 <- rc_numeric_cleaner(df$pct_ap_scoring_3)
      }
      
      if ('pct_tested_ap_ib' %in% names(df)) {
        df$pct_tested_ap_ib <- rc_numeric_cleaner(df$pct_tested_ap_ib)
      }
      
      df
    }
  )
  
  out <- bind_rows(all_ap)
  
  if (cds_identifiers) {
    all_cds <- extract_rc_cds(list_of_prs)
    out <- out %>%
      left_join(
        all_cds, 
        by = c('county_code', 'district_code', 'school_code', 'end_year')
      )
  }
  
  out
}


#' Extract Report Card CDS
#'
#' @param list_of_prs output of get_rc_databases (ie, a list where each element is)
#' a list of data.frames
#'
#' @return data frame with county, district and school ids and identifiers for all 
#' years present in the input
#' @export

extract_rc_cds <- function(list_of_prs) {
  
  all_cds <- map(
    list_of_prs,
    function(.x) {
      #find the table      
      cds_table <- grep('s_header|sch_header|school_header|header_and_contact', names(.x), value = TRUE)
      #extract the table
      df <- .x %>% extract2(cds_table)
      #make cds names consistent
      df <- clean_cds_fields(df)
      
      df %>%
        select(county_code, district_code, school_code,
               county_name, district_name, school_name,
               end_year)
    }
  )
  
  bind_rows(all_cds)
}
  

#' Clean Report Card Enrollment
#'
#' @param list_of_dfs output of get_one_rc_database (ie, a list  of data.frames)
#'
#' @return data frame with school enrollment
#' @export
clean_rc_enrollment <- function(list_of_dfs) {
   enr_df_name <- grep('enrollment\\b|enrollment_by_grade|enrollment_trends_by_grade|enrollment_trendsby_grade',
        names(list_of_dfs), value = T) 
   
   df <- list_of_dfs %>%
      extract2(enr_df_name) 
   
   # wide
   if (any(str_detect(colnames(df), "grade0+"))) {
      
      colnames(df) <- colnames(df) %>%
         str_replace("co_code", "county_code") %>%
         str_replace("dist_code", "district_code") %>%
         str_replace("sch_code", "school_code") %>%
         str_replace("^pk$", "gradepk") %>%
         str_replace("grade_pk", "gradepk") %>%
         str_replace("^kg$", "gradekg") %>%
         str_replace("grade_kg", "gradekg") %>%
         str_replace("^ug$", "gradeungraded") %>%
         str_replace("^ungraded$", "gradeungraded") %>%
         str_replace("rowtotal|total|row_total", "graderow_total") 
      
      grade_names <- c(
         "grade01", "grade02", "grade03", "grade04", "grade05", "grade06",
         "grade07", "grade08", "grade09", "grade10", "grade11", "grade12",
         "gradepk", "gradekg", "graderow_total", "gradeungraded"
      )
      
      df <- pivot_longer(
         data = df,
         cols = one_of(grade_names), 
         names_to = 'grade_level',
         names_prefix = 'grade',
         values_to = 'n_enrolled' 
      ) %>%
         mutate(
            grade_level = case_when(
               grade_level == "pk" ~ "PK",
               grade_level == "kg" ~ "KG",
               grade_level == "row_total" ~ "TOTAL",
               grade_level == "ungraded" ~ NA_character_,
               TRUE ~ grade_level
               )
         )
   
   } else { # if long
         colnames(df) <- colnames(df) %>%
            str_replace("grade$", "grade_level") %>%
            str_replace("^count$", "n_enrolled")
         
         df <- df %>%
            mutate(
               grade_level = str_replace_all(grade_level, "Grade ", ""),
               grade_level = case_when(
                  grade_level %in% c("Ungraded", "UG") ~ NA_character_,
                  grade_level == "Total Enrollment" ~ "TOTAL",
                  TRUE ~ grade_level
               )
            )
      }

   return(df)
}

#' Extract Report Card Enrollment
#'
#' @param list_of_prs output of get_rc_databases (ie, a list where each element is
#' a list of data.frames)
#' @param cds_identifiers add the county, district and school name?  default is TRUE
#'
#' @return data frame with school enrollment
#' @export
extract_rc_enrollment <- function(list_of_prs, cds_identifiers = TRUE) {
   
   all_enr <- map(
    list_of_prs,
    function(.x) {
      clean_rc_enrollment(.x) %>%
         clean_cds_fields() %>%
         select(
            one_of(c("county_code", "district_code", "school_code",
                     "end_year", "grade_level", "n_enrolled"))
            ) %>%
          return()
    }
  )
  
  out <- bind_rows(all_enr)
  
  if (cds_identifiers) {
    all_cds <- extract_rc_cds(list_of_prs)
    out <- out %>%
      left_join(
        all_cds, 
        by = c('county_code', 'district_code', 'school_code', 'end_year')
      )
  }
  
  out %>%
    select(-school_name) %>%
    return()

}


#' Enrich report card subgroup percentages with best guesses at 
#' subgroup numbers
#' 
#' @param df data frame of including subgroup percentages
#' 
#' @return data_frame
#' @export
enrich_rc_enrollment <- function(df) {
# not totally sure if this should be here - or where to put it!
# writing the tests in test_charter.R for now, I think.
  enr_count <- df %>%
    pull(end_year) %>%
    unique() %>%
    get_rc_databases() %>%
    extract_rc_enrollment() %>%
    filter(grade_level == "TOTAL")
  
  # prevent n_enrolled from being duplicated
  if ('n_enrolled' %in% names(df)) {
    df <- df %>% select(-n_enrolled)
  }
  
  df <- df %>% 
    left_join(
      enr_count,
      # line below will likely cause problems later
      by = c("end_year", "county_id" = "county_code", 
             "district_id" = "district_code", "school_id" = "school_code")
    ) %>%
    # is there a better solution here w/ recover_enrollment() ?
    mutate(n_enrolled = as.numeric(n_enrolled),
           n_students = round(percent / 100 * n_enrolled),
           n_students = if_else(n_students == 0 & percent > 0, 1, n_students)) 
  
  return(df)
}



#' Enrich matriculation rates with counts from grad counts
#' 
#' @param df data frame of including matriculation percentages
#' 
#' @return data_frame
#' @export
enrich_matric_counts <- function(df, type = '16 month') {
  if (min(df$end_year) < 2013) stop("end_year needs to be > 2012")
  
  grad_count_yrs <- df %>%
    pull(end_year) %>%
    unique()
  
  if (type == '16 month') {
    
    # 16 month matriculation requires grad counts from prior year
    grad_count_yrs <- c((min(grad_count_yrs) - 1):(max(grad_count_yrs) - 1))
    
    # but there is no grad count data before 2011
    grad_count_yrs <- grad_count_yrs[grad_count_yrs > 2011]
    
    postsec_rates <- data.frame(
      end_year = 2012:2019,
      is_16mo = rep(T, 8)
    )
  } else if (type == '12 month') {
    postsec_rates <- data.frame(
    end_year = 2017:2019,
    is_16mo = rep(F, 3)
    )
  } else {
    stop("type should be one of {\'16 month\' or \'12 month\'}")
  }
  
  
  warning("Subgroups have been converted for compatibility across data
          sources.")
  
  # prepare matric df to join grad count
  df <- df %>%
    id_rc_aggs() %>%
    # 2012 rc reports only 16 month matriculation rates, which would
    # require 2011 grad counts, which are not available
    filter(end_year != 2012) %>%
    left_join(postsec_rates, by = "end_year") %>%
    # 16 month matriculation rates requires last year's grad counts
    mutate(orig_end_year = end_year,
           end_year = if_else(is_16mo, end_year - 1, as.numeric(end_year))) %>%
    mutate(
      subgroup = tolower(subgroup),
      subgroup = case_when(
        subgroup == "economically disadvantaged students" ~ "economically disadvantaged",
        subgroup %in% c("english language learners",
                        "english learners") ~ "limited english proficiency",
        subgroup == "students with disabilities" ~ "students with disability",
        subgroup == "two or more races" ~ "multiracial",
        subgroup == "native hawaiian" ~ "pacific islander", # or native amer.?
        subgroup == "american indian or alaska native" ~
          "american indian",
        subgroup == "asian, native hawaiian, or pacific islander" ~
          "asian", #>=2018... what to do here?
        TRUE ~ subgroup
      )
    ) %>%
    rename(county_id = county_code, 
           district_id = district_code,
           school_id = school_code)
  
  matric_by_yr <- split(df, df$end_year)
  
  
  out <- map_dfr(
    matric_by_yr, 
    function(.x) enrich_grad_count(.x, unique(.x$end_year))
    ) %>%
    # restore original end year
    select(-end_year) %>%
    rename(end_year = orig_end_year) %>%
    distinct(.keep_all = T) %>%
    mutate(
      enroll_any_count = round(enroll_any / 100 * graduated_count),
      enroll_4yr_count = round(enroll_4yr / 100 * enroll_any_count),
      enroll_2yr_count = round(enroll_2yr / 100 * enroll_any_count)
    )
  
  return(out)
}


#' Identify reportcard aggregation levels
#'
#' @param df enrollment dataframe, output of extract_rc_*
#'
#' @return data.frame with boolean aggregation flags
#' @export

id_rc_aggs <- function(df) {
  df %>%
    mutate(
      is_state = district_code == '9999' & county_code == '99',
      is_county = district_code == '9999' & !county_code =='99',
      is_district = school_code %in% c('997', '999') & !is_state,
      is_charter_sector = FALSE,
      is_allpublic = FALSE,
      is_school = !school_code %in% c('997', '999') & !is_state,
      is_charter = county_code == '80'
    ) %>%
    return()
}


#' Matriculation column order
#'
#' @param df processed postsecondary matriculation df
#'
#' @return df in correct order
#' @export
matric_column_order <- function(df) {
  df %>%
    select(
      end_year,
      county_id, county_name,
      district_id, district_name,
      school_id, school_name,
      subgroup,
      cohort_count, graduated_count,
      enroll_any_count, enroll_any,
      enroll_4yr_count, enroll_4yr,
      enroll_2yr_count, enroll_2yr,
      n_charter_rows,
      is_16mo,
      is_state,
      is_district,
      is_school,
      is_charter,
      is_charter_sector,
      is_allpublic
    )
}
almartin82/njschooldata documentation built on Nov. 23, 2023, 1:33 a.m.