load packages

# R imports - might have to install some
library(sf)
library(tidyverse)
library(tmap)
library(tmaptools)
library(tidycensus)
library(tigris)
library(rmapshaper)
library(matrixStats)
library(SpatialAcc)

library(tidygeocoder)
library(osrm)
library(data.table)

read in our data

trade_schools <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/less_two_geoid_ct.csv")
trade_schools <- trade_schools[, c("tot_enrol", "latitude", "longitude", "geoid")]

two_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/two_year_geoid_ct.csv")
two_year_colleges <- two_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")]

four_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/four_year_geoid_ct.csv")
four_year_colleges <- four_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")]

step 1: start with health districts

health_district <- read.csv("/project/biocomplexity/sdad/projects_data/vdh/va_county_to_hd.csv")
health_district$county_id <- as.character(health_district$county_id)

source("~/git/VDH/src/helper_functions.R") # provides get_db_conn()
con <- get_db_conn(db_pass = "rsu8zvrsu8zv")
health_district_geoids <- st_read(con, query = "SELECT * FROM dc_geographies.va_hd_vdh_2021_health_district_geo_names")
DBI::dbDisconnect(con)

health_district_2 <- left_join(health_district, health_district_geoids, by = c("health_district" = "region_name"))

trade_hd_supply <- merge(trade_schools %>% mutate(geoid = as.character(geoid)),
                         health_district_2[, c("county_id", "health_district", "geoid")],
                         by.x = "geoid", by.y = "county_id")
two_year_hd_supply <- merge(two_year_colleges %>% mutate(geoid = as.character(geoid)),
                            health_district_2[, c("county_id", "health_district", "geoid")],
                            by.x = "geoid", by.y = "county_id")
four_year_hd_supply <- merge(four_year_colleges %>% mutate(geoid = as.character(geoid)),
                             health_district_2[, c("county_id", "health_district", "geoid")],
                             by.x = "geoid", by.y = "county_id")


# get total population by county (B01003_001)
va.co <- get_acs(geography = "county",
              year = 2019,
              variables = c(tpop = "B01003_001"),
              state = "VA",
              survey = "acs5",
              output = "wide",
              geometry = TRUE)
va.co.utm <- st_transform(va.co, crs = "+proj=longlat +datum=WGS84") # WGS84 long/lat (despite the .utm name), as needed for OSRM
va.co.utm <- va.co.utm[!st_is_empty(va.co.utm),]

new_demand <- left_join(va.co.utm, health_district_2[, c("county_id", "health_district", "geoid")], by = c("GEOID" = "county_id"))
new_demand <- new_demand  %>%
  group_by(health_district, geoid) %>%
  summarise(geometry = sf::st_union(geometry),
            tpop = sum(tpopE)) %>%
  ungroup()
new_demand <- st_sf(health_district = new_demand$health_district,
                    tpop = new_demand$tpop,
                    geometry = new_demand$geometry) %>%
  mutate(centroid = st_centroid(geometry))

new_demand$longitude = st_coordinates(new_demand$centroid)[,1]
new_demand$latitude = st_coordinates(new_demand$centroid)[,2]

trade schools drive times

supply <- trade_hd_supply

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$health_district
trade_times <- as.data.frame(as.table(all_data))
colnames(trade_times) <- c("origin", "dest", "cost")
trade_times$origin <- rep(new_demand$health_district, times = dim(supply)[1])

2-year colleges drive times

supply <- two_year_hd_supply

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$health_district
twoyear_times <- as.data.frame(as.table(all_data))
colnames(twoyear_times) <- c("origin", "dest", "cost")
twoyear_times$origin <- rep(new_demand$health_district, times = dim(supply)[1])

4-year colleges drive times

supply <- four_year_hd_supply

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$health_district
fouryear_times <- as.data.frame(as.table(all_data))
colnames(fouryear_times) <- c("origin", "dest", "cost")
fouryear_times$origin <- rep(new_demand$health_district, times = dim(supply)[1])

getting the data in the correct format

For each level of geography (health district, county, and tract), we compute the mean and median drive times to the 5 and 10 nearest trade schools, 2-year colleges, and 4-year colleges.
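Because this top-N summary is repeated below for every school type and geography level, it could be factored into a helper. The following is a minimal, hypothetical sketch (the function name summarize_nearest and its arguments are not part of the original pipeline, and the geoid/region_name merge, which differs by geography, is omitted):

# hypothetical helper: mean/median drive time to the n_nearest closest schools per origin
summarize_nearest <- function(times, n_nearest, region_type, year = "2021") {
  times %>%
    group_by(origin) %>%
    arrange(cost, .by_group = TRUE) %>%   # sort drive times within each origin
    slice(1:n_nearest) %>%                # keep the n_nearest smallest costs
    summarize(mean_drive_time = mean(cost),
              median_drive_time = median(cost)) %>%
    gather(measure, value, c(mean_drive_time, median_drive_time)) %>%
    mutate(measure = paste0(measure, "_top", n_nearest),
           region_type = region_type,
           year = year,
           measure_type = ifelse(grepl("^mean", measure), "mean", "median"),
           measure_units = "minutes")
}

# e.g., summarize_nearest(trade_times, 5, "health district") would reproduce the top-5 health district block below
# (minus the geoid/region_name merge)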

Health district closest 5 and 10

trade_hd_times.5 <- trade_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_hd_times.10 <- trade_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_hd_times <- rbind(trade_hd_times.5, trade_hd_times.10)

twoyear_hd_times.5 <- twoyear_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_hd_times.10 <- twoyear_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_hd_times <- rbind(twoyear_hd_times.5, twoyear_hd_times.10)

fouryear_hd_times.5 <- fouryear_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_hd_times.10 <- fouryear_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_hd_times <- rbind(fouryear_hd_times.5, fouryear_hd_times.10)

repeating everything from above for counties!!!

# get population under 15 years old by county (only the county geometries, centroids, and names are used below)
va.co <- get_acs(geography = "county",
              year = 2019,
              variables = c(male_under_5 = "B01001_003",
                            male_9 = "B01001_004",
                            male_14 = "B01001_005",
                            female_under_5 = "B01001_027",
                            female_9 = "B01001_028",
                            female_14 = "B01001_029"),
              state = "VA",
              survey = "acs5",
              output = "wide",
              geometry = TRUE)
va.co.utm <- st_transform(va.co, crs = "+proj=longlat +datum=WGS84")
va.co.utm <- va.co.utm[!st_is_empty(va.co.utm),]
va.co.utm <- va.co.utm %>%
  mutate(centroid = st_centroid(st_geometry(va.co.utm)))

va.co.utm$longitude = st_coordinates(va.co.utm$centroid)[,1]
va.co.utm$latitude = st_coordinates(va.co.utm$centroid)[,2]
new_demand <- va.co.utm
### trade schools
trade_schools <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/less_two_geoid_ct.csv")
trade_schools <- trade_schools[, c("tot_enrol", "latitude", "longitude", "geoid")]
supply <- trade_schools %>%
  rename(county = geoid)

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$county
trade_co_times <- as.data.frame(as.table(all_data))
colnames(trade_co_times) <- c("origin", "dest", "cost")
trade_co_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])


### two-year colleges
two_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/two_year_geoid_ct.csv")
two_year_colleges <- two_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")]
supply <- two_year_colleges %>%
  rename(county = geoid)

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$county
two_year_co_times <- as.data.frame(as.table(all_data))
colnames(two_year_co_times) <- c("origin", "dest", "cost")
two_year_co_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])


### four-year colleges
four_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/four_year_geoid_ct.csv")
four_year_colleges <- four_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")]
supply <- four_year_colleges %>%
  rename(county = geoid)

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$county
four_year_co_times <- as.data.frame(as.table(all_data))
colnames(four_year_co_times) <- c("origin", "dest", "cost")
four_year_co_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])

County closest 5 and 10

trade_co_times.5 <- trade_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_co_times.10 <- trade_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_co_times.final <- rbind(trade_co_times.5, trade_co_times.10)

twoyear_co_times.5 <- two_year_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_co_times.10 <- two_year_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_co_times <- rbind(twoyear_co_times.5, twoyear_co_times.10)

fouryear_co_times.5 <- four_year_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_co_times.10 <- four_year_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_co_times <- rbind(fouryear_co_times.5, fouryear_co_times.10)

repeating everything from above for tracts!!!

# get population under 15 years old by tract (only the tract geometries, centroids, and names are used below)
va.tr <- get_acs(geography = "tract",
              year = 2019,
              variables = c(male_under_5 = "B01001_003",
                            male_9 = "B01001_004",
                            male_14 = "B01001_005",
                            female_under_5 = "B01001_027",
                            female_9 = "B01001_028",
                            female_14 = "B01001_029"),
              state = "VA",
              survey = "acs5",
              output = "wide",
              geometry = TRUE)
va.tr.utm <- st_transform(va.tr, crs = "+proj=longlat +datum=WGS84")
va.tr.utm <- va.tr.utm[!st_is_empty(va.tr.utm),]
va.tr.utm <- va.tr.utm %>%
  mutate(centroid = st_centroid(st_geometry(va.tr.utm)))

va.tr.utm$longitude = st_coordinates(va.tr.utm$centroid)[,1]
va.tr.utm$latitude = st_coordinates(va.tr.utm$centroid)[,2]
new_demand <- va.tr.utm

The school data only includes county-level geoids, so the Python code below is used to quickly assign tract-level geoids to each school from its longitude/latitude (an R-only sf sketch follows the Python function for reference).

load packages in python

# generic imports
import numpy as np
import pandas as pd

# geospatial imports
import geopandas as gpd
import pygeos
from pygeos import box, area, intersection

# FCA package
from access import Access, weights

get tract geoid

### ADD GEOID COLUMN
# create school locations array
def get_tract_geoid(df):
  schools = gpd.GeoDataFrame(df, geometry = gpd.points_from_xy(df.longitude, df.latitude))
  pg_schools_geoms = np.array([pygeos.Geometry(str(schools.geometry[i])) for i in range(len(schools))]) # convert school locations to pygeos geometries

  # create Virginia census tract array
  va_tracts = gpd.read_file("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_tr.shp") # read in virginia census tract data
  va_tracts = va_tracts.loc[va_tracts.geometry.notna()].reset_index(drop = True) # drop empty geometries for geographies
  pg_geog_geoms = np.array([pygeos.Geometry(str(va_tracts.geometry[i])) for i in range(len(va_tracts))]) # convert to Pygeos

  # get indices of intersection and geoids
  idxs = pygeos.contains(pg_geog_geoms[:, np.newaxis], pg_schools_geoms[np.newaxis, :]) # get intersection (this can be done in sql)
  new_idxs = np.where(idxs)[1].argsort() # sort indices
  schools_geoids = va_tracts.GEOID.values[np.where(idxs)[0][new_idxs]] # get GEOIDs

  # add geoids column to dataframe
  missing_indices = np.where(np.diff(np.where(idxs)[1][new_idxs]) == 2)[0] + 1 # find rows where GEOID information is missing
  bad_df = schools.index.isin(missing_indices)
  schools = schools[~bad_df] # drop rows where we couldn't get GEOID
  schools['GEOID'] = schools_geoids # add GEOID column

  # create simplified supply data with GEOID, longitude, latitude, and capacity
  supply = schools[['GEOID', 'longitude', 'latitude', 'tot_enrol']]
  return supply
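
For reference, the same point-in-polygon assignment could also be sketched directly in R with sf. This is an untested alternative rather than what the pipeline uses; it assumes the tract polygons can be pulled with tigris instead of the local va_tr.shp, and the helper name get_tract_geoid_r is hypothetical.

# hypothetical R-only alternative to the Python helper above (untested sketch, not used downstream)
get_tract_geoid_r <- function(df) {
  va_tracts <- tigris::tracts(state = "VA", year = 2019, cb = TRUE) %>% # assumes tigris returns sf (default in recent versions)
    st_transform(crs = "+proj=longlat +datum=WGS84")
  schools_sf <- st_as_sf(df, coords = c("longitude", "latitude"),
                         crs = st_crs(va_tracts), remove = FALSE)
  schools_sf %>%
    st_join(va_tracts["GEOID"], join = st_within) %>% # tract containing each school point
    st_drop_geometry() %>%
    filter(!is.na(GEOID)) %>%                         # drop points that fall outside any tract
    select(GEOID, longitude, latitude, tot_enrol)
}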

get drive times for schools at tract level

### trade schools
supply <- py$get_tract_geoid(trade_schools) # call the Python helper via reticulate

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$GEOID
trade_tr_times <- as.data.frame(as.table(all_data))
colnames(trade_tr_times) <- c("origin", "dest", "cost")
trade_tr_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])


### two-year colleges
supply <- py$get_tract_geoid(two_year_colleges)

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$GEOID
two_year_tr_times <- as.data.frame(as.table(all_data))
colnames(two_year_tr_times) <- c("origin", "dest", "cost")
two_year_tr_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])


### four-year colleges
supply <- py$get_tract_geoid(four_year_colleges)

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$GEOID
four_year_tr_times <- as.data.frame(as.table(all_data))
colnames(four_year_tr_times) <- c("origin", "dest", "cost")
four_year_tr_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])

tract level 5 and 10 closest

trade_tr_times.5 <- trade_tr_times %>%
  group_by(origin) %>%
  arrange(cost) %>% 
  slice(1:5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_tr_times.10 <- trade_tr_times %>%
  group_by(origin) %>%
  arrange(cost) %>% 
  slice(1:10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_tr_times.final <- rbind(trade_tr_times.5, trade_tr_times.10)


twoyear_tr_times.5 <- two_year_tr_times %>%
  group_by(origin) %>%
  arrange(cost) %>% 
  slice(1:5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_tr_times.10 <- two_year_tr_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_tr_times <- rbind(twoyear_tr_times.5, twoyear_tr_times.10)

fouryear_tr_times.5 <- four_year_tr_times %>%
  group_by(origin) %>%
  arrange(cost) %>% 
  slice(1:5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_tr_times.10 <- four_year_tr_times %>%
  group_by(origin) %>%
  arrange(cost) %>% 
  slice(1:10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_tr_times <- rbind(fouryear_tr_times.5, fouryear_tr_times.10)

send data to db

source("~/git/VDH/src/helper_functions.R")
con <- get_db_conn(db_pass = "rsu8zvrsu8zv")
dc_dbWriteTable(con, "dc_education_training", "va_hd_osrm_2021_drive_times_nearest_trade_schools", trade_hd_times)
dc_dbWriteTable(con, "dc_education_training", "va_hd_osrm_2021_drive_times_nearest_2year_colleges", twoyear_hd_times)
dc_dbWriteTable(con, "dc_education_training", "va_hd_osrm_2021_drive_times_nearest_4year_colleges", fouryear_hd_times)

dc_dbWriteTable(con, "dc_education_training", "va_ct_osrm_2021_drive_times_nearest_trade_schools", trade_co_times.final)
dc_dbWriteTable(con, "dc_education_training", "va_ct_osrm_2021_drive_times_nearest_2year_colleges", twoyear_co_times)
dc_dbWriteTable(con, "dc_education_training", "va_ct_osrm_2021_drive_times_nearest_4year_colleges", fouryear_co_times)

dc_dbWriteTable(con, "dc_education_training", "va_tr_osrm_2021_drive_times_nearest_trade_schools", trade_tr_times.final)
dc_dbWriteTable(con, "dc_education_training", "va_tr_osrm_2021_drive_times_nearest_2year_colleges", twoyear_tr_times)
dc_dbWriteTable(con, "dc_education_training", "va_tr_osrm_2021_drive_times_nearest_4year_colleges", fouryear_tr_times)
DBI::dbDisconnect(con)

