load packages

# R imports - might have to install some
library(sf)
library(tidyverse)
library(tmap)
library(tmaptools)
library(tidycensus)
library(tigris)
library(rmapshaper)
library(matrixStats)
library(SpatialAcc)

library(tidygeocoder)
library(osrm)
library(data.table)

read in our data

trade_schools <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/less_two_geoid_ct.csv")
trade_schools <- trade_schools[, c("tot_enrol", "latitude", "longitude", "geoid")]

two_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/two_year_geoid_ct.csv")
two_year_colleges <- two_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")]

four_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/four_year_geoid_ct.csv")
four_year_colleges <- four_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")]

step 1: start with health districts

health_district <- read.csv("/project/biocomplexity/sdad/projects_data/vdh/va_county_to_hd.csv")
health_district$county_id <- as.character(health_district$county_id)

source("~/git/VDH/src/helper_functions.R") # provides get_db_conn()
con <- get_db_conn(db_pass = "rsu8zvrsu8zv")
health_district_geoids <- st_read(con, query = "SELECT * FROM dc_geographies.va_hd_vdh_2021_health_district_geo_names")
DBI::dbDisconnect(con)

health_district_2 <- left_join(health_district, health_district_geoids, by = c("health_district" = "region_name"))

trade_hd_supply <- merge(trade_schools %>% mutate(geoid = as.character(geoid)),
                         health_district_2[, c("county_id", "health_district", "geoid")],
                         by.x = "geoid", by.y = "county_id")
two_year_hd_supply <- merge(two_year_colleges %>% mutate(geoid = as.character(geoid)),
                            health_district_2[, c("county_id", "health_district", "geoid")],
                            by.x = "geoid", by.y = "county_id")
four_year_hd_supply <- merge(four_year_colleges %>% mutate(geoid = as.character(geoid)),
                             health_district_2[, c("county_id", "health_district", "geoid")],
                             by.x = "geoid", by.y = "county_id")


# get total population by county (B01003_001)
va.co <- get_acs(geography = "county",
              year = 2019,
              variables = c(tpop = "B01003_001"),
              state = "VA",
              survey = "acs5",
              output = "wide",
              geometry = TRUE)
va.co.utm <- st_transform(va.co, crs = "+proj=longlat +datum=WGS84") # WGS84 long/lat (despite the .utm name), as needed for OSRM
va.co.utm <- va.co.utm[!st_is_empty(va.co.utm),]

new_demand <- left_join(va.co.utm, health_district_2[, c("county_id", "health_district", "geoid")], by = c("GEOID" = "county_id"))
new_demand <- new_demand  %>%
  group_by(health_district, geoid) %>%
  summarise(geometry = sf::st_union(geometry),
            tpop = sum(tpopE)) %>%
  ungroup()
new_demand <- st_sf(health_district = new_demand$health_district,
                    tpop = new_demand$tpop,
                    geometry = new_demand$geometry) %>%
  mutate(centroid = st_centroid(geometry))

new_demand$longitude = st_coordinates(new_demand$centroid)[,1]
new_demand$latitude = st_coordinates(new_demand$centroid)[,2]

trade schools drive times

supply <- trade_hd_supply

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$health_district
trade_times <- as.data.frame(as.table(all_data))
colnames(trade_times) <- c("origin", "dest", "cost")
trade_times$origin <- rep(new_demand$health_district, times = dim(supply)[1])

2-year colleges drive times

supply <- two_year_hd_supply

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$health_district
twoyear_times <- as.data.frame(as.table(all_data))
colnames(twoyear_times) <- c("origin", "dest", "cost")
twoyear_times$origin <- rep(new_demand$health_district, times = dim(supply)[1])

4-year colleges drive times

supply <- four_year_hd_supply

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("health_district", "longitude", "latitude")],
                      dst = supply[, c("health_district", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$health_district
fouryear_times <- as.data.frame(as.table(all_data))
colnames(fouryear_times) <- c("origin", "dest", "cost")
fouryear_times$origin <- rep(new_demand$health_district, times = dim(supply)[1])

getting the data in the correct format

For each level of geography (health district, county, and tract), we compute the mean and median drive times to the 5 and 10 nearest trade schools, 2-year colleges, and 4-year colleges.
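Because this top-N summary is repeated below for every school type and geography level, it could be factored into a helper. The following is a minimal, hypothetical sketch (the function name summarize_nearest and its arguments are not part of the original pipeline, and the geoid/region_name merge, which differs by geography, is omitted):

# hypothetical helper: mean/median drive time to the n_nearest closest schools per origin
summarize_nearest <- function(times, n_nearest, region_type, year = "2021") {
  times %>%
    group_by(origin) %>%
    arrange(cost, .by_group = TRUE) %>%   # sort drive times within each origin
    slice(1:n_nearest) %>%                # keep the n_nearest smallest costs
    summarize(mean_drive_time = mean(cost),
              median_drive_time = median(cost)) %>%
    gather(measure, value, c(mean_drive_time, median_drive_time)) %>%
    mutate(measure = paste0(measure, "_top", n_nearest),
           region_type = region_type,
           year = year,
           measure_type = ifelse(grepl("^mean", measure), "mean", "median"),
           measure_units = "minutes")
}

# e.g., summarize_nearest(trade_times, 5, "health district") would reproduce the top-5 health district block below
# (minus the geoid/region_name merge)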

Health district closest 5 and 10

trade_hd_times.5 <- trade_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_hd_times.10 <- trade_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_hd_times <- rbind(trade_hd_times.5, trade_hd_times.10)

twoyear_hd_times.5 <- twoyear_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_hd_times.10 <- twoyear_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_hd_times <- rbind(twoyear_hd_times.5, twoyear_hd_times.10)

fouryear_hd_times.5 <- fouryear_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_hd_times.10 <- fouryear_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "health district",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>%
  rename(region_name = origin) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_hd_times <- rbind(fouryear_hd_times.5, fouryear_hd_times.10)

repeating everything from above for counties!!!

# get population under 15 years old by county (only the county geometries, centroids, and names are used below)
va.co <- get_acs(geography = "county",
              year = 2019,
              variables = c(male_under_5 = "B01001_003",
                            male_9 = "B01001_004",
                            male_14 = "B01001_005",
                            female_under_5 = "B01001_027",
                            female_9 = "B01001_028",
                            female_14 = "B01001_029"),
              state = "VA",
              survey = "acs5",
              output = "wide",
              geometry = TRUE)
va.co.utm <- st_transform(va.co, crs = "+proj=longlat +datum=WGS84")
va.co.utm <- va.co.utm[!st_is_empty(va.co.utm),]
va.co.utm <- va.co.utm %>%
  mutate(centroid = st_centroid(st_geometry(va.co.utm)))

va.co.utm$longitude = st_coordinates(va.co.utm$centroid)[,1]
va.co.utm$latitude = st_coordinates(va.co.utm$centroid)[,2]
new_demand <- va.co.utm
### trade schools
trade_schools <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/less_two_geoid_ct.csv")
trade_schools <- trade_schools[, c("tot_enrol", "latitude", "longitude", "geoid")]
supply <- trade_schools %>%
  rename(county = geoid)

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$county
trade_co_times <- as.data.frame(as.table(all_data))
colnames(trade_co_times) <- c("origin", "dest", "cost")
trade_co_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])


### two-year colleges
two_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/two_year_geoid_ct.csv")
two_year_colleges <- two_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")]
supply <- two_year_colleges %>%
  rename(county = geoid)

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$county
two_year_co_times <- as.data.frame(as.table(all_data))
colnames(two_year_co_times) <- c("origin", "dest", "cost")
two_year_co_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])


### four-year colleges
four_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/four_year_geoid_ct.csv")
four_year_colleges <- four_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")]
supply <- four_year_colleges %>%
  rename(county = geoid)

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("county", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$county
four_year_co_times <- as.data.frame(as.table(all_data))
colnames(four_year_co_times) <- c("origin", "dest", "cost")
four_year_co_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])

County closest 5 and 10

trade_co_times.5 <- trade_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_co_times.10 <- trade_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_co_times.final <- rbind(trade_co_times.5, trade_co_times.10)

twoyear_co_times.5 <- two_year_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_co_times.10 <- two_year_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_co_times <- rbind(twoyear_co_times.5, twoyear_co_times.10)

fouryear_co_times.5 <- four_year_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_co_times.10 <- four_year_co_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "county",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_co_times <- rbind(fouryear_co_times.5, fouryear_co_times.10)

repeating everything from above for tracts!!!

# get population under 15 years old by tract (only the tract geometries, centroids, and names are used below)
va.tr <- get_acs(geography = "tract",
              year = 2019,
              variables = c(male_under_5 = "B01001_003",
                            male_9 = "B01001_004",
                            male_14 = "B01001_005",
                            female_under_5 = "B01001_027",
                            female_9 = "B01001_028",
                            female_14 = "B01001_029"),
              state = "VA",
              survey = "acs5",
              output = "wide",
              geometry = TRUE)
va.tr.utm <- st_transform(va.tr, crs = "+proj=longlat +datum=WGS84")
va.tr.utm <- va.tr.utm[!st_is_empty(va.tr.utm),]
va.tr.utm <- va.tr.utm %>%
  mutate(centroid = st_centroid(st_geometry(va.tr.utm)))

va.tr.utm$longitude = st_coordinates(va.tr.utm$centroid)[,1]
va.tr.utm$latitude = st_coordinates(va.tr.utm$centroid)[,2]
new_demand <- va.tr.utm

The school data only includes county-level geoids, so the Python code below is used to quickly assign tract-level geoids to each school from its longitude/latitude (an R-only sf sketch follows the Python function for reference).

load packages in python

# generic imports
import numpy as np
import pandas as pd

# geospatial imports
import geopandas as gpd
import pygeos
from pygeos import box, area, intersection

# FCA package
from access import Access, weights

get tract geoid

### ADD GEOID COLUMN
# create school locations array
def get_tract_geoid(df):
  schools = gpd.GeoDataFrame(df, geometry = gpd.points_from_xy(df.longitude, df.latitude))
  pg_schools_geoms = np.array([pygeos.Geometry(str(schools.geometry[i])) for i in range(len(schools))]) # convert school locations to pygeos geometries

  # create Virginia census tract array
  va_tracts = gpd.read_file("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_tr.shp") # read in virginia census tract data
  va_tracts = va_tracts.loc[va_tracts.geometry.notna()].reset_index(drop = True) # drop empty geometries for geographies
  pg_geog_geoms = np.array([pygeos.Geometry(str(va_tracts.geometry[i])) for i in range(len(va_tracts))]) # convert to Pygeos

  # get indices of intersection and geoids
  idxs = pygeos.contains(pg_geog_geoms[:, np.newaxis], pg_schools_geoms[np.newaxis, :]) # get intersection (this can be done in sql)
  new_idxs = np.where(idxs)[1].argsort() # sort indices
  schools_geoids = va_tracts.GEOID.values[np.where(idxs)[0][new_idxs]] # get GEOIDs

  # add geoids column to dataframe
  missing_indices = np.where(np.diff(np.where(idxs)[1][new_idxs]) == 2)[0] + 1 # find rows where GEOID information is missing
  bad_df = schools.index.isin(missing_indices)
  schools = schools[~bad_df] # drop rows where we couldn't get GEOID
  schools['GEOID'] = schools_geoids # add GEOID column

  # create simplified supply data with GEOID, longitude, latitude, and capacity
  supply = schools[['GEOID', 'longitude', 'latitude', 'tot_enrol']]
  return supply
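
For reference, the same point-in-polygon assignment could also be sketched directly in R with sf. This is an untested alternative rather than what the pipeline uses; it assumes the tract polygons can be pulled with tigris instead of the local va_tr.shp, and the helper name get_tract_geoid_r is hypothetical.

# hypothetical R-only alternative to the Python helper above (untested sketch, not used downstream)
get_tract_geoid_r <- function(df) {
  va_tracts <- tigris::tracts(state = "VA", year = 2019, cb = TRUE) %>% # assumes tigris returns sf (default in recent versions)
    st_transform(crs = "+proj=longlat +datum=WGS84")
  schools_sf <- st_as_sf(df, coords = c("longitude", "latitude"),
                         crs = st_crs(va_tracts), remove = FALSE)
  schools_sf %>%
    st_join(va_tracts["GEOID"], join = st_within) %>% # tract containing each school point
    st_drop_geometry() %>%
    filter(!is.na(GEOID)) %>%                         # drop points that fall outside any tract
    select(GEOID, longitude, latitude, tot_enrol)
}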

get drive times for schools at tract level

### trade schools
supply <- py$get_tract_geoid(trade_schools) # call the Python helper via reticulate

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$GEOID
trade_tr_times <- as.data.frame(as.table(all_data))
colnames(trade_tr_times) <- c("origin", "dest", "cost")
trade_tr_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])


### two-year colleges
supply <- py$get_tract_geoid(two_year_colleges)

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$GEOID
two_year_tr_times <- as.data.frame(as.table(all_data))
colnames(two_year_tr_times) <- c("origin", "dest", "cost")
two_year_tr_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])


### four-year colleges
supply <- py$get_tract_geoid(four_year_colleges)

# options for OSRM
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # profile can be car, bike, or foot

# build the full origin x destination duration matrix in chunks
start.time <- Sys.time() # track run-time
all_data <- matrix(, nrow = 0, ncol = nrow(supply))

# cap on the number of src x dst pairs per osrmTable request; 1,000,000 is used as a conservative upper bound
max.size <- 1000000

n <- floor(max.size / nrow(supply))
chunks <- ceiling((nrow(new_demand)) / n)
for (i in 1 : chunks)
{
  # if not at the final chunk
  if (i != chunks)
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # if at final chunk, only go until final row
  else
  {
    matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")],
                      dst = supply[, c("GEOID", "longitude", "latitude")])$durations
  }
  # show percentage completion
  if (i == ceiling(chunks / 4)) {print( "25%" )}
  if (i == ceiling(chunks / 2)) {print( "50%" )}
  if (i == ceiling(3 * chunks / 4)) {print( "75%" )}
  all_data <- rbind(all_data, matrix)
}

end.time <- Sys.time() # using this to see run-time
print(end.time - start.time)

# convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas)
colnames(all_data) <- supply$GEOID
four_year_tr_times <- as.data.frame(as.table(all_data))
colnames(four_year_tr_times) <- c("origin", "dest", "cost")
four_year_tr_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])

tract level 5 and 10 closest

trade_tr_times.5 <- trade_tr_times %>%
  group_by(origin) %>%
  arrange(cost) %>% 
  slice(1:5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_tr_times.10 <- trade_tr_times %>%
  group_by(origin) %>%
  arrange(cost) %>% 
  slice(1:10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
trade_tr_times.final <- rbind(trade_tr_times.5, trade_tr_times.10)


twoyear_tr_times.5 <- two_year_tr_times %>%
  group_by(origin) %>%
  arrange(cost) %>% 
  slice(1:5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_tr_times.10 <- two_year_tr_times %>%
  group_by(origin) %>%
  top_n(cost, n = -10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
twoyear_tr_times <- rbind(twoyear_tr_times.5, twoyear_tr_times.10)

fouryear_tr_times.5 <- four_year_tr_times %>%
  group_by(origin) %>%
  arrange(cost) %>% 
  slice(1:5) %>%
  summarize(mean_drive_time_top5 = mean(cost),
            median_drive_time_top5 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_tr_times.10 <- four_year_tr_times %>%
  group_by(origin) %>%
  arrange(cost) %>% 
  slice(1:10) %>%
  summarize(mean_drive_time_top10 = mean(cost),
            median_drive_time_top10 = median(cost)) %>%
  gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>%
  mutate(region_type = "tract",
         year = "2021",
         measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"),
         measure_units = "minutes") %>%
  merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>%
  rename(geoid = origin,
         region_name = NAME) %>%
  relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units")
fouryear_tr_times <- rbind(fouryear_tr_times.5, fouryear_tr_times.10)

send data to db

source("~/git/VDH/src/helper_functions.R")
con <- get_db_conn(db_pass = "rsu8zvrsu8zv")
dc_dbWriteTable(con, "dc_education_training", "va_hd_osrm_2021_drive_times_nearest_trade_schools", trade_hd_times)
dc_dbWriteTable(con, "dc_education_training", "va_hd_osrm_2021_drive_times_nearest_2year_colleges", twoyear_hd_times)
dc_dbWriteTable(con, "dc_education_training", "va_hd_osrm_2021_drive_times_nearest_4year_colleges", fouryear_hd_times)

dc_dbWriteTable(con, "dc_education_training", "va_ct_osrm_2021_drive_times_nearest_trade_schools", trade_co_times.final)
dc_dbWriteTable(con, "dc_education_training", "va_ct_osrm_2021_drive_times_nearest_2year_colleges", twoyear_co_times)
dc_dbWriteTable(con, "dc_education_training", "va_ct_osrm_2021_drive_times_nearest_4year_colleges", fouryear_co_times)

dc_dbWriteTable(con, "dc_education_training", "va_tr_osrm_2021_drive_times_nearest_trade_schools", trade_tr_times.final)
dc_dbWriteTable(con, "dc_education_training", "va_tr_osrm_2021_drive_times_nearest_2year_colleges", twoyear_tr_times)
dc_dbWriteTable(con, "dc_education_training", "va_tr_osrm_2021_drive_times_nearest_4year_colleges", fouryear_tr_times)
DBI::dbDisconnect(con)

