R/geo.R

Defines functions ward_gcount_aggs ward_matric_aggs ward_grate_aggs ward_parcc_aggs ward_enr_aggs enrich_school_city_ward enrich_school_latlong

Documented in enrich_school_city_ward enrich_school_latlong ward_enr_aggs ward_gcount_aggs ward_grate_aggs ward_matric_aggs ward_parcc_aggs

#' Enrich School Data with Lat / Long
#'
#' @param df dataframe to be enriched
#' @param use_cache if TRUE, will read from cache of school info / lat lng stored on TODO
#' @param api_key optional, personal google maps API key
#'
#' @return dataframe enriched with lat lng
#' @export

enrich_school_latlong <- function(df, use_cache=TRUE, api_key='') {

  # download and clean
  nj_sch <- httr::GET('https://homeroom5.doe.state.nj.us/directory/schoolDL.php') %>%
    httr::content(as="text") %>%
    readr::read_csv(skip=3) %>%
    rename(
      district_id = `District Code`,
      school_id = `School Code`
    ) %>%
    clean_names() %>%
    select(
      district_id,
      school_id,
      address1,
      city,
      state,
      zip
    ) %>%
    mutate(
      district_id = kill_padformulas(district_id),
      school_id = kill_padformulas(school_id),
      zip = kill_padformulas(zip),
      address = paste0(address1, ', ', city, ', ', state, ' ', zip, ' USA')
      )
  
  
  old_nwk_addresses_RAW <- read_csv("data/nwk_address_addendum.csv",
                                    col_types = "ccccl")
  
  old_nwk_addresses <- old_nwk_addresses_RAW %>%
    mutate(
      school_id = str_pad(school_id, 3, pad = '0')
      )

  # geocode
  if (use_cache) {
    data("geocoded_cached")
  } else {
    geocoded <- placement::geocode_url(
      nj_sch$address,
      auth='standard_api',
      privkey=api_key,
      clean=TRUE,
      verbose=TRUE
    )
  }

  geocoded_merge <- geocoded %>%
    select(locations, lat, lng) %>%
    rename(
      address = locations
    ) %>%
    mutate(address = str_to_lower(address),
           address = str_replace(address, "-\\d{4}\\susa", " usa"))
  
  nj_sch <- nj_sch %>%
    bind_rows(old_nwk_addresses) %>%
    mutate(
      address = gsub("\\s+", ' ', address),
      address = str_to_lower(address),
      address = str_replace_all(address, "-\\d{4}\\susa", " usa"),
      address_2 = case_when(
        str_detect(address, "street") ~ str_replace(address, "street", "st"),
        str_detect(address, "avenue") ~ str_replace(address, "avenue", "ave"),
        str_detect(address, "boulevard") ~ str_replace(address, "boulevard", "blvd"),
        TRUE ~ address),
      address_3 = case_when(
        str_detect(address, "ave,") ~ str_replace(address, "ave,", "avenue,"),
        str_detect(address, "st,") ~ str_replace(address, "st,", "street,"),
        str_detect(address, "blvd,") ~ str_replace(address, "blvd,", "boulevard,"),
        TRUE ~ address1)
    ) %>%
    select(district_id, school_id, address, address_2, address_3) %>%
    left_join(geocoded_merge, by = 'address') %>%
    left_join(geocoded_merge, by = c('address_2' = 'address')) %>%
    left_join(geocoded_merge, by = c('address_3' = 'address')) %>%
    mutate(lat = coalesce(lat.x, lat.y, lat),
           lng = coalesce(lng.x, lng.y, lng)) %>%
    select(district_id, school_id, address, lat, lng) %>%
    unique()

  # join on district and school and return
  df %>% left_join(nj_sch, by = c('district_id', 'school_id'))
}


#' Enrich School Data with City Ward
#'
#' @param df any dataframe with a district_id
#'
#' @return df enriched with ward, if geographic data is 'registered' for a given district
#' @export

enrich_school_city_ward <- function(df) {
  supported_geos <- c('3570')

  # say what fraction of the rows are supported
  supported <- df$district_id %in% supported_geos
  pct_supported <- supported %>%
    mean() %>%
    multiply_by(100) %>%
    round(1)

  message(
    paste0('ward information available for ', pct_supported,
           '% (', sum(supported), '/', length(supported),
           ') rows in this data set.')
  )
  # split into supported / unsupported
  geo_mask <- df$district_id %in% supported_geos
  latlong_mask <- !is.na(df$lat) & !is.na(df$lng)
  final_mask <- geo_mask & latlong_mask

  df_supported <- df %>%
     ungroup() %>%
     filter(final_mask)

  df_unsupported <- df %>%
     ungroup() %>%
     filter(!final_mask)

  # add specific geos here
  # newark (3570)
  if ('3570' %in% df$district_id) {
    newark_wards <- geojsonio::geojson_read(
      "http://data.ci.newark.nj.us/dataset/ba8f41a3-584b-4021-b8c3-30a7d1ae8ac3/resource/5b9c86cd-b57b-4341-8c4c-ee975d9e1904/download/wards2012.geojson",
      what = "sp"
    )
    newark_wards$WARD_NAME <- as.character(newark_wards$WARD_NAME)
    sp::coordinates(df_supported) <- ~lng+lat
    sp::proj4string(df_supported) <- sp::proj4string(newark_wards)

    df_supported$ward <- sp::over(df_supported, newark_wards)$WARD_NAME
    df_supported <- as_tibble(df_supported)
  }
  # combine and return
  bind_rows(df_supported, df_unsupported)
}


#' Aggregates enrollment data by ward
#'
#'
#' @param list_of_dfs output of \code{fetch_enr}
#'
#' @return A data frame of ward aggregations
#' @export

ward_enr_aggs <- function(df) {

  # enrich
  df <- enrich_school_latlong(df) %>%
    enrich_school_city_ward()

  df <- df %>%
    filter(!is.na(ward)) %>%
    group_by(
      end_year,
      county_id, county_name,
      district_id, district_name,
      ward,
      program_code, program_name, grade_level,
      subgroup
    ) %>%
    summarize(
      n_students = sum(n_students, na.rm = TRUE),
      n_schools = n()
    ) %>%
    ungroup()

  df <- df %>%
    mutate(
      CDS_Code = NA_character_,
      district_id = paste0(district_id, ' ', ward),
      district_name = paste0(district_name, ' ', ward),
      school_id = '999W',
      school_name = 'Ward Total',
      is_state = FALSE,
      is_county = FALSE,
      is_citywide = FALSE,
      is_district = FALSE,
      is_charter_sector = FALSE,
      is_allpublic = FALSE,
      is_school = FALSE,
      is_subprogram = !program_code == '55'
    ) %>%
    select(-ward)

  # calculate percent
  df <- agg_enr_pct_total(df)

  # column order and return
  agg_enr_column_order(df)
}

#' Aggregates assessment data by ward
#'
#'
#' @param list_of_dfs output of \code{fetch_all_parcc} or \code{fetch_parcc}
#'
#' @return A data frame of ward aggregations
#' @export
ward_parcc_aggs <- function(list_of_dfs) {

   df <- list_of_dfs %>%
      bind_rows() %>% # convert to df
      enrich_school_latlong() %>%
      enrich_school_city_ward()

   df <- df %>%
      filter(!is.na(ward)) %>%
      group_by(
         testing_year,
         assess_name, test_name,
         county_id, county_name,
         district_id, district_name,
         ward,
         grade,
         subgroup, subgroup_type
      ) %>%
     parcc_aggregate_calcs() %>%
     ungroup()

   df <- df %>%
      mutate(
         district_id = paste0(district_id, ' ', ward),
         district_name = paste0(district_name, ' ', ward),
         school_id = '999W',
         school_name = 'Ward Total',
         is_state = FALSE,
         is_county = FALSE,
         is_citywide = FALSE,
         is_district = FALSE,
         is_charter_sector = FALSE,
         is_allpublic = FALSE,
         is_charter = FALSE,
         is_school = FALSE,
         is_dfg = (county_id == 'DFG')
      ) %>%
      select(-ward)

   return(df)
}


#' Aggregates grad rate data by ward
#'
#'
#' @param df output of \code{fetch_grad_rate}
#'
#' @return A data frame of ward aggregations
#'
#'
#' @export
ward_grate_aggs <- function(df) {
  enriched_df <- df %>%
    enrich_school_latlong() %>%
    enrich_school_city_ward()
  
  ward_df <- enriched_df %>% 
    filter(!is.na(ward)) %>%
    group_by(
      end_year,
      county_id, county_name,
      district_id, district_name,
      ward,
      subgroup, 
      methodology
    ) %>%
    grate_aggregate_calcs() %>%
    ungroup()
  
  ward_df %>%
    mutate(
      district_id = paste0(district_id, ' ', ward),
      district_name = paste0(district_name, ' ', ward),
      school_id = '999W',
      school_name = 'Ward Total',
      is_state = FALSE,
      is_county = FALSE,
      is_district = FALSE,
      is_charter = FALSE,
      is_school = FALSE,      
      is_charter_sector = FALSE,
      is_allpublic = FALSE
    ) %>%
    grate_column_order() %>%
    return()
}

#' Aggregates matriculation data by ward
#'
#'
#' @param df output of \code{enrich_matric_counts}
#'
#' @return A data frame of ward aggregations
#' @export
ward_matric_aggs <- function(df) {
  enriched_df <- df %>%
    enrich_school_latlong() %>%
    enrich_school_city_ward()
  
  ward_df <- enriched_df %>% 
    filter(!is.na(ward), 
           !is.na(enroll_any)) %>%
    group_by(
      end_year,
      county_id, county_name,
      district_id, district_name,
      ward,
      subgroup,
      is_16mo
    ) %>%
    matric_aggregate_calcs() %>%
    ungroup()
  
  ward_df %>%
    mutate(
      district_id = paste0(district_id, ' ', ward),
      district_name = paste0(district_name, ' ', ward),
      school_id = '999W',
      school_name = 'Ward Total',
      is_state = FALSE,
      is_county = FALSE,
      is_district = FALSE,
      is_charter = FALSE,
      is_school = FALSE,      
      is_charter_sector = FALSE,
      is_allpublic = FALSE
    ) %>%
    matric_column_order() %>%
    return()
}


#' Aggregates grad counts data by ward
#'
#'
#' @param df output of \code{fetch_grad_count}
#'
#' @return A data frame of ward aggregations
#'
#'
#' @export
ward_gcount_aggs <- function(df) {
  enriched_df <- df %>%
    enrich_school_latlong() %>%
    enrich_school_city_ward()
  
  ward_df <- enriched_df %>% 
    filter(!is.na(ward)) %>%
    group_by(
      end_year,
      county_id, county_name,
      district_id, district_name,
      ward,
      subgroup
    ) %>%
    gcount_aggregate_calcs() %>%
    ungroup()
  
  ward_df %>%
    mutate(
      district_id = paste0(district_id, ' ', ward),
      district_name = paste0(district_name, ' ', ward),
      school_id = '999W',
      school_name = 'Ward Total',
      is_state = FALSE,
      is_county = FALSE,
      is_district = FALSE,
      is_charter = FALSE,
      is_school = FALSE,      
      is_charter_sector = FALSE,
      is_allpublic = FALSE
    ) %>%
    gcount_column_order() %>%
    return()
}
almartin82/njschooldata documentation built on Nov. 23, 2023, 1:33 a.m.