data-raw/master_nycdata_package_ingest.R

################################################################################
################################################################################
################################################################################

# Reinstall: devtools::install_github(repo = "DerekYves/nycschools", lib=.libPaths()[4])

#### Master Script to Ingest NYC School Survey Data ####
rm(list=ls())

# !diagnostics off

# User Defined Parameters

na_strings <- c('N/A', '', 'Not Scored')

prjdir <- '~/projects/nycschools/data-raw/'
report_statabbr <- "NY"

years <- c(2013, 2016, 2017)

library(readr)
library(dplyr)
library(rgdal)    # for readOGR and others
library(proj4)



################################################################################
################################################################################
################################################################################

##### Load all files
setwd(prjdir)

#### 2013 Data
yr <- 1
cat("Starting", years[yr], "\n")

## NOTE: CSV files for this analysis were culled for the XLS files provided
# by the DOE. Due to heavy formatting and merged cells in the source, the CSV
# files read by this script were created manually. The original xls files are
# included in the raw-data folder and are named, e.g.: 2017PublicDataFileParent.xlsx

## 2013 Parent File (checked raw for q1c, 01M019, matches)
# Note: Data for this year has a composite score for each question ranging from 0-10

p <- read_csv(paste0(years[yr], '/data/parent_', years[yr], '_score.csv'), na=na_strings)

# View(p[, sapply(p, function(i) all(is.na(i)))])

p = p[, sapply(p, function(i) !all(is.na(i)))] # Drop entirely NA cols

# This Should be empty
unique(select_if(select(p, -dbn, -locationname, -school_type), is.character))


## 2013 Student File (checked raw for q5a, 01M034/roosevelt, matches)
s <- read_csv(paste0(years[yr], '/data/student_', years[yr], '_score.csv'), na=na_strings)
s <- s[, sapply(s, function(i) !all(is.na(i)))]


## 2013 Total File (checked raw for 01M064, eng_tot_13, matches)
tot <- read_csv(paste0(years[yr], '/data/total_', years[yr], '_score.csv'), na=na_strings)
tot <- tot[, sapply(tot, function(i) !all(is.na(i)))]

# Should match
stopifnot(all(all.equal(nrow(s), nrow(p)), all.equal(nrow(s), nrow(tot))))


## Join the student/parent/toal files
test <- nrow(s)
out <- left_join(s, p)
out <- left_join(out, tot)
stopifnot(test==nrow(out))

# Write files to disk
out <- cbind(year=years[yr], out)
saveRDS(out, paste0(years[yr], '/data/merged_', years[yr], '_scores.Rds'))
write.csv(out, paste0(years[yr], '/data/merged_', years[yr], '_scores.Rds'), row.names=FALSE)

saveRDS(data.frame(dbn=unique(out$dbn)), paste0(years[yr], '/data/merged_', years[yr], '_dbns.Rds'))



################################################################################
################################################################################
################################################################################

#### 2016 Data
yr <- 2
cat("Starting", years[yr], "\n")

## 2016 Parent File: N=3,086 in raw source file (verified)
p <- read_csv(paste0(years[yr], '/data/parent_', years[yr], '_score.csv'), na=na_strings)
nrow(p)

# View(p[, sapply(p, function(i) all(is.na(i)))]) # only five empty cols, q5
p = p[, sapply(p, function(i) !all(is.na(i)))] # Drop entirely NA cols

p <- rename(p, p_rr_par=p_rr, locationname_par=locationname)

# Data Checks:  DBN 02M051
p$p_q1h_sa[p$dbn=='02M051'] # 100 (100 in nsource)
p$p_q2c_sd[p$dbn=='02M051'] # 10 (10 in source)
p$p_q5a_s[p$dbn=='02M051']  # 104 (104 in source)

# Should be empty
unique(select_if(select(p, -dbn, -locationname_par), is.character))


## 2016 Student File (N=1,137 in raw source file)
s <- read_csv(paste0(years[yr], '/data/student_', years[yr], '_score.csv'), na=na_strings)
nrow(s)

s <- s[, sapply(s, function(i) !all(is.na(i)))] # no change
s <- rename(s, s_rr_stu=s_rr, locationname_stu=locationname)

# Data Checks:  DBN 02M051
s$s_q1e_a[s$dbn=='02M418']     # 238 (238 in nsource)
s$s_q2d_most[s$dbn=='02M418']  # 312 (312in source)
s$s_q6e_none[ s$dbn=='02M418'] # 9 (9 in source)


## 2016 Total File: N=3,098 in raw source file
# (checked raw for 01M064, eng_tot_13, matches)
# Note: These contain a composite score, 0:1
tot <- read_csv(paste0(years[yr], '/data/total_', years[yr], '_score.csv'), na=na_strings)
nrow(tot)

tot <- tot[, sapply(tot, function(i) !all(is.na(i)))]
tot <- rename(tot, p_rr_tot=p_rr, s_rr_tot=s_rr, t_rr_tot=t_rr, locationname_tot=locationname)

tot$eff_leaders[tot$dbn=='05M030'] # .82 (82% in nsource)
tot$supp_env[tot$dbn=='05M030']    # .68 (68% in source)
tot$stu_sat[tot$dbn=='05M030']     #  NA (N/A in source)


## Join the parent/student/total files (inner on student)
test <- nrow(s)
out <- inner_join(s, p) # Reduce record count
nrow(s); (test<-nrow(out)) # lose no rows

out <- inner_join(tot, out) # lose no rows
stopifnot(test==nrow(out))

# These variables be similar or match exactly
# View(select(out, contains("rr")))
out$p_rr_par <- out$s_rr_stu <- NULL

# Write files to disk
out <- cbind(year=years[yr], out)
saveRDS(out, paste0(years[yr], '/data/merged_', years[yr], '_scores.Rds'))
write.csv(out, paste0(years[yr], '/data/merged_', years[yr], '_scores.csv'), row.names=FALSE)

saveRDS(data.frame(dbn=unique(out$dbn)), paste0(years[yr], '/data/merged_', years[yr], '_dbns.Rds'))

assign(paste0("surveys_", years[yr]), out)

# Stubs in this file
table(sub("^p_q\\d+._", "", names(p)))


################################################################################
################################################################################
################################################################################

#### 2017 Data
yr <- 3
cat("Starting", years[yr], "\n")

## 2017 Parent File: N=2,966 in raw source file (verified)
p <- read_csv(paste0(years[yr], '/data/parent_', years[yr], '_score.csv'), na=na_strings)
nrow(p)

# 5d and 5e have mostly missing data
p <- select(p, -starts_with('p_q5d'))
p <- select(p, -starts_with('p_q5e'))

# View(p[, sapply(p, function(i) all(is.na(i)))]) # only five empty cols, q5

# Should not be any dropped cols
p = p[, sapply(p, function(i) !all(is.na(i)))] # Drop entirely NA cols

p <- rename(p, p_rr_par=p_rr, locationname_par=locationname)

# Lots of missng data: p$p_q5e_idk

# Data Checks:  DBN 02M051
p$p_q1e_a[p$dbn== '02M051'] # 85  (85 in nsource)
p$p_q2b_sd[p$dbn=='02M051'] # 2   (2 in source)
p$p_q5a_vs[p$dbn=='02M051'] # 113 (113 in source)

# Should be empty
unique(select_if(select(p, -dbn, -locationname_par), is.character))

## 2017 Student File (N=1,121 in raw source file)
s <- read_csv(paste0(years[yr], '/data/student_', years[yr], '_score.csv'), na=na_strings)
nrow(s)

# These are grade 6-8 student questons, many therefore missing in HS file, droppped
s <- select(s, -starts_with('s_q9'))

s <- s[, sapply(s, function(i) !all(is.na(i)))] # no change
s <- rename(s, s_rr_stu=s_rr, locationname_stu=locationname)

# Data Checks:  DBN 29Q248
s$s_q1h_a[s$dbn=='29Q248']     # 122 (122 in nsource)
s$s_q2d_most[s$dbn=='29Q248']  # 103 (103 in source)
s$s_q6e_none[ s$dbn=='29Q248'] # 50 (50 in source)


## 2017 Total File: N=3,254 in raw source file
# (checked raw for 01M064, eng_tot_13, matches)
# Note: These contain a composite score, 0:1
tot <- read_csv(paste0(years[yr], '/data/total_', years[yr], '_score.csv'), na=na_strings)
nrow(tot)

tot <- tot[, sapply(tot, function(i) !all(is.na(i)))]
tot <- rename(tot, p_rr_tot=p_rr, s_rr_tot=s_rr, t_rr_tot=t_rr, locationname_tot=locationname)

tot$eff_leaders[tot$dbn=='26Q191'] # 4.24 (4.24 in nsource)
tot$supp_env[tot$dbn=='26Q191']    # 4.14 (4.14in source)
tot$trust[ tot$dbn=='26Q191']      # 4.18 (4.18 in source)


## Join the parent/student/total files (inner on student)
test <- nrow(s)
out <- inner_join(s, p) # Reduce record count
nrow(s); (test<-nrow(out)) # lose 3 rows

out <- inner_join(tot, out) # lose no rows
stopifnot(test==nrow(out))

# These variables be similar or match exactly
# View(select(out, contains("rr")))
out$p_rr_par <- out$s_rr_stu <- NULL

# Write files to disk
out <- cbind(year=years[yr], out)
saveRDS(out, paste0(years[yr], '/data/merged_', years[yr], '_scores.Rds'))
write.csv(out, paste0(years[yr], '/data/merged_', years[yr], '_scores.csv'), row.names=FALSE)

assign(paste0("surveys_", years[yr]), out)
saveRDS(data.frame(dbn=unique(out$dbn)), paste0(years[yr], '/data/merged_', years[yr], '_dbns.Rds'))

# Stubs in this file
table(sub("^p_q\\d+._", "", names(p)))


################################################################################
################################################################################
################################################################################

#### Load Points Data

# Source: https://data.cityofnewyork.us/Education/School-Point-Locations/jfju-ynrr

(loc <- path.expand(paste0(getwd(), "/meta_data/Public_School_Locations/")))
file.exists( paste0(loc, 'Public_Schools_Points_2011-2012A.shp') )

shp <- readOGR(loc, layer="Public_Schools_Points_2011-2012A")
(pstr <- proj4string(shp))

summary(shp)

pj <- data.frame(project(shp@coords, pstr, inverse=TRUE))
pj <- cbind(pj, shp@data)

school_points <- select(pj, lat=X2, lng=X1, dbn=ATS_CODE, points_schoolname=SCHOOLNAME)
school_points$dbn <- stringr::str_trim(as.character(school_points$dbn))

# View(shp@data)
# View(shp@coords)

# plot(shp)

# library(ggplot2)
# # now create a ggplot map
# ggplot() +  geom_point( data= school_points, aes(x=lng, y=lat), color="red")

# devtools::use_data(school_points, overwrite = TRUE)


################################################################################
################################################################################
################################################################################

#### Load the Census data

# Source: This was downloaded with a custom ACS script not included with this package

# Join the Census Redux file
cen <- readRDS(paste0(prjdir, "census/", report_statabbr, "_ACS5_df_", 2016, ".Rds"))

# Do census tract numbers repeat across counties? Yes
ny=cen$tract[cen$county=="061"]       # Manhattan
queens=cen$tract[cen$county=="081"]   # Queens
bronx=cen$tract[cen$county=="005"]    # Bronx
kings=cen$tract[cen$county=="047"]    # Kings/Brooklyn
richmond=cen$tract[cen$county=="085"] # Richmond/Staten Island
nass=cen$tract[cen$county=="059"]     # Nassau

# Should be unique within county
all(sum(sum(duplicated(ny)), sum(duplicated(queens)), sum(duplicated(kings)), sum(duplicated(bronx)), sum(duplicated(richmond)))==0)

# Duplicated tract numbers have different data FYI. They are not repeated values across counties
sum(duplicated(c(ny, queens, kings,bronx,richmond)))



################################################################################
################################################################################
################################################################################

# Load DOE school file to save census tract and related school-level info

# Source: DOE

doe <- readxl::read_excel("meta_data/LCGMS_SchoolData_20180107_1420.xlsx")
doe <- filter(doe, !is.na(`ATS System Code`))

table(doe$City)
doe$City1 <- gsub("CITY", "", toupper(doe$City))
table(doe$City1)

# Construct state and county ids
doe$stateid <- 36
doe$countyid <- NA
doe$countyid <- with(doe,
                  ifelse(grepl("NEW YORK|NEW YORK|MANHATTAN", City1), "061",
                         ifelse(grepl("SYOSSET", City1), "059",
                                ifelse(grepl("BROOKLYN", City1), "047",
                                       ifelse(grepl("BRONX", City1), "005",
                                       ifelse(grepl("STATEN", City1), "085",
                                              ifelse(grepl("QUEENS|JACKSON|BAYSIDE|JAMAICA|ROCKAWAY|LONG ISLAND", City1), "081", NA)))))))


# Fix the tract for Syosset
doe$`Census Tract`[doe$City1=="SYOSSET"] <- 518800

# Every school has a county code
table(doe$countyid, useNA='always')

doe$`Census Tract` <- sprintf('%06d', as.numeric(doe$`Census Tract`))


# Build the GEOID field
doe$GEOID <- with(doe, paste0(stateid, countyid, `Census Tract`))

# Save a subset of fields
doerdx <- select(doe,
                 GEOID,
                 dbn=`ATS System Code`,
                 doe_name=`Location Name`,
                 doe_managed_by=`Managed By Name`,
                 doe_school_type=`Location Type Description`,
                 doe_grade_cat=`Location Category Description`,
                 doe_grade_levels=`Grades`,
                 #doe_grade_levels_final=`Grades Final`,
                 doe_open_year=`Open Date`,
                 doe_address=`Primary Address`,
                 doe_zip=Zip,
                 doe_city=City1,
                 doe_tract=`Census Tract`,
                 doe_bor_block=`Borough Block Lot`,
                 doe_superintendent_code=`Superintendent Location Code`)

table(doerdx$doe_city)

# Join Points data to DOE school roster

outersect <- function(x, y) {
    sort(c(setdiff(x, y),
           setdiff(y, x)))
}
# outersect(doerdx$dbn, school_points$dbn)


################################################################################
################################################################################
################################################################################

# Join the data


# DBNs in the school metafile
table(surveys_2016$dbn %in% doerdx$dbn)

# Missing appear to be mostly magnet schools (67):
# View(surveys_2016[!surveys_2016$dbn %in% doerdx$dbn, ])

# 2017
table(surveys_2017$dbn %in% doerdx$dbn)


## Points data matches for the survey file
table(surveys_2016$dbn %in% school_points$dbn) # Missing 175
table(surveys_2017$dbn %in% school_points$dbn) # Missing 176

# Missing Census tracts from DOE tract list:
table(doerdx$GEOID %in% cen$GEOID) # None

# Final merges:

# Join Census data to DOE list
sch_dat <- left_join(doerdx, cen)
nrow(sch_dat)==nrow(doerdx)

# Join the points data where available
sch_dat <- left_join(sch_dat, school_points)

# Join metadata to survey data
s16 <- right_join(sch_dat, surveys_2016)
nrow(s16)==nrow(surveys_2016)
saveRDS(s16, paste0(2016, '/data/merged_', 2016, '_all_vars.Rds'))
save(list="s16", file=paste0(2016, '/data/merged_', 2016, '_all_vars.RData'))

# Depracated
# devtools::use_data(s16, overwrite = TRUE)


s17 <- right_join(sch_dat, surveys_2017)
nrow(s17)==nrow(surveys_2017)
saveRDS(s17, paste0(2017, '/data/merged_', 2017, '_all_vars.Rds'))
save(list="s17", file=paste0(2017, '/data/merged_', 2017, '_all_vars.RData'))

# Depracated
# Save out the master files to the package
# devtools::use_data(s17, overwrite = TRUE)


# Function to calculate scores for a stub
df=s17

score_calc <- function(df, stub, percent_positive=TRUE) {
    names <- names(df)[grepl(stub, names(df))]
    cat( "Return formula: 100 * (", names[3], " + ", names[4], ") / (", names[1], " + ", names[2], " + ", names[3], " + ", names[4], ")\n", sep="")
    num <- apply( df[, c(names[3], names[4])], 1, sum, na.rm=TRUE)
    denom <- apply( df[,c(names[1:4])], 1, sum, na.rm=TRUE)
    denom[denom==0] <- NA
    return(100 * ( num / denom ))
}

# Save only highschools in four boroughs
hs <-filter(df, doe_grade_cat=="High school") 
table(hs$doe_city)

hs <-filter(hs, doe_city %in% c("BRONX", "BROOKLYN", "MANHATTAN", "QUEENS", "JACKSON HEIGHTS", "JAMAICA", "ROCKAWAY PARK") )
hs$doe_city[hs$doe_city %in% c("JACKSON HEIGHTS", "JAMAICA", "ROCKAWAY PARK")] <- "QUEENS"
table(hs$doe_city)

# Save composite scores

# Get the stubs:

studstub <- names(hs)[ grepl("^s_q[[:digit:]]", names(hs)) ]
(studstub <- unique(gsub("(^s_q[[:digit:]]{1,2}[[:alpha:]])_[[:alpha:]]{1,6}$", "\\1", studstub)))


parstub <- names(hs)[ grepl("^p_q[[:digit:]]", names(hs)) ]
(parstub <- unique(gsub("(^p_q[[:digit:]]{1,2}[[:alpha:]])_[[:alpha:]]{1,6}$", "\\1",parstub)))

stubs <- c(studstub, parstub)

for(i in seq_along(stubs) ) {
    hs[[ paste(stubs[i], '_score', sep='')]] <- round(score_calc(hs,  stubs[i]))
}


# Missing Census Data by city
group_by(hs, doe_city) %>% summarize(n=n(),missing=sum(is.na(pct_pov)))
table(is.na(hs$pct_pov))

# Drop schools without census poverty measures
hs <- filter(hs, !is.na(pct_pov))

# Fix open year
table(hs$doe_open_year, useNA='always')
hs$oyr <- gsub("\\s{2,}", " ", hs$doe_open_year)
table(hs$oyr, useNA='always')
hs$open_year <- as.Date(x=hs$oyr, format='%b %d %Y')
table(hs$open_year, useNA='always')
# View(select(hs, contains("year")) )

hs$years_open <- round( (Sys.Date() - hs$open_year) / 365.25, digits=1)

table(hs$doe_grade_cat)
hs$doe_address

hs$p_rr_tot

hs_out <- select(hs,
             doe_city,
             doe_name,
             doe_address,
             doe_zip,
             open_year,
             years_open,
             doe_grade_levels,
             lat,
             lng,
             
             # Census
             pct_pop_change,
             pct_any_health_care_u65=pct_any_hc_u65,
             pct_unv_deg, pct_under_hs,
             med_house_income=est_med_hinc, # Median Household Income in the Past 12 Months (In 2014 Inflation-Adjusted Dollars) By Household Size
             pct_poverty=pct_pov, pct_income_100k_up=pct_inc_100k_up, pct_income_150k_up=pct_inc_150k_up,
             pct_black, pct_amind, pct_asian, pct_nhpi, pct_oth, pct_mlt_r, pct_nonwht,pct_latin,
             
             parent_resp_rate=p_rr_tot,
             student_resp_rate=s_rr_tot,
             total_resp_rate=t_rr_tot,
             
             # School variables
             colab,
             eff_leaders,
             rig_instr,
             supp_env,
             stg_ties,
             trust,
             
             ends_with('_score'),
             
             
             # Modify this to select the question you are interested in
             s_q5a_none, s_q5a_few, s_q5a_most, s_q5a_all, # 5a. In how many of your classes are you challenged?
             s_q5c_none, s_q5c_few, s_q5c_most, s_q5c_all, # 5c. In how many of your classes do your teachers ask difficult questions in class?
             s_q7a_sd, s_q7a_d, s_q7a_a, s_q7a_sa         # 7d. I feel safe in my classes at this school.
             
)

class(hs_out)

s17 <- as.data.frame(hs_out)
class(s17)
str(s17)

devtools::use_data(s17, overwrite = TRUE)
write.csv(s17, '2017_high_school_file.csv', row.names=FALSE)

# #### Load the GSS data
# 
# GSS <- readRDS('gss/gss.Rds')
# devtools::use_data(GSS, overwrite = TRUE)


# Load World Bank Data
# install.packages('WDI')

library(WDI)
WDIsearch('gdp.*capita.*constant')
gdpna = WDI(indicator='NY.GDP.PCAP.KD', country=c('MX','CA','US'), start=1960, end=2016)
saveRDS(gdpna, 'world_bank/gdp_north_america.Rds')
devtools::use_data(gdpna, overwrite = TRUE)



WDIsearch('unemployment')
unemp = WDI(indicator='SL.UEM.TOTL.ZS', start=1991, end=2016)
table(unemp$country)

unemp$country <- stringi::stri_trans_general(unemp$country, "latin-ascii")

saveRDS(unemp, 'world_bank/unemployment.Rds')
devtools::use_data(unemp, overwrite = TRUE)


# library(ggplot2)
# ggplot(filter(unemp, country=="United States")) + geom_line(aes(x=year, y=SL.UEM.TOTL.ZS))
# ggplot(filter(gdp, country=="United States")) + geom_line(aes(x=year, y=NY.GDP.PCAP.KD))

# Gapminder data
# install.packages("gapminder")
library(gapminder)
dir.create('gapminder', showWarnings=FALSE)
gp <- gapminder
write.csv(gp, 'gapminder/gp.csv', row.names=FALSE)
devtools::use_data(gp, overwrite = TRUE)

setwd('..')
roxygen2::roxygenise()
DerekYves/nycschools documentation built on May 16, 2019, 6:39 p.m.