# R imports - might have to install some library(sf) library(tidyverse) library(tmap) library(tmaptools) library(tidycensus) library(tigris) library(rmapshaper) library(matrixStats) library(SpatialAcc) library(tidygeocoder) library(osrm) library(data.table)
trade_schools <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/less_two_geoid_ct.csv") trade_schools <- trade_schools[, c("tot_enrol", "latitude", "longitude", "geoid")] two_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/two_year_geoid_ct.csv") two_year_colleges <- two_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")] four_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/four_year_geoid_ct.csv") four_year_colleges <- four_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")]
health_district <- read.csv("/project/biocomplexity/sdad/projects_data/vdh/va_county_to_hd.csv") health_district$county_id <- as.character(health_district$county_id) con <- get_db_conn(db_pass = "rsu8zvrsu8zv") health_district_geoids <- st_read(con, query = "SELECT * FROM dc_geographies.va_hd_vdh_2021_health_district_geo_names") DBI::dbDisconnect(con) health_district_2 <- left_join(health_district, health_district_geoids, by = c("health_district" = "region_name")) trade_hd_supply <- merge(trade_schools %>% mutate(geoid = as.character(geoid)), health_district_2[, c("county_id", "health_district", "geoid")], by.x = "geoid", by.y = "county_id") two_year_hd_supply <- merge(two_year_colleges %>% mutate(geoid = as.character(geoid)), health_district_2[, c("county_id", "health_district", "geoid")], by.x = "geoid", by.y = "county_id") four_year_hd_supply <- merge(four_year_colleges %>% mutate(geoid = as.character(geoid)), health_district_2[, c("county_id", "health_district", "geoid")], by.x = "geoid", by.y = "county_id") health_district$county_id <- as.character(health_district$county_id) # get population under 15 years old va.co <- get_acs(geography = "county", year = 2019, variables = c(tpop = "B01003_001"), state = "VA", survey = "acs5", output = "wide", geometry = TRUE) va.co.utm <- st_transform(va.co, crs = "+proj=longlat +datum=WGS84") va.co.utm <- va.co.utm[!st_is_empty(va.co.utm),] new_demand <- left_join(va.co.utm, health_district_2[, c("county_id", "health_district", "geoid")], by = c("GEOID" = "county_id")) new_demand <- new_demand %>% group_by(health_district, geoid) %>% summarise(geometry = sf::st_union(geometry), tpop = sum(tpopE)) %>% ungroup() new_demand <- st_sf(health_district = new_demand$health_district, tpop = new_demand$tpop, geometry = new_demand$geometry) %>% mutate(centroid = st_centroid(geometry)) new_demand$longitude = st_coordinates(new_demand$centroid)[,1] new_demand$latitude = st_coordinates(new_demand$centroid)[,2]
supply <- trade_hd_supply # options for OSRM options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # can change option to car, bike, or walk # where do we get actual longitude-latitude data for the stores start.time <- Sys.time() # using this to see run-time all_data <- matrix(, nrow = 0, ncol = nrow(supply)) # maximum number of requests that OSRM can handle at a time - I don't know if there is still a limit on this, but I still use 1 million as the upper bound max.size <- 1000000 n <- floor(max.size / nrow(supply)) chunks <- ceiling((nrow(new_demand)) / n) for (i in 1 : chunks) { # if not at the final chunk if (i != chunks) { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("health_district", "longitude", "latitude")], dst = supply[, c("health_district", "longitude", "latitude")])$durations } # if at final chunk, only go until final row else { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("health_district", "longitude", "latitude")], dst = supply[, c("health_district", "longitude", "latitude")])$durations } # show percentage completion if (i == ceiling(chunks / 4)) {print( "25%" )} if (i == ceiling(chunks / 2)) {print( "50%" )} if (i == ceiling(3 * chunks / 4)) {print( "75%" )} all_data <- rbind(all_data, matrix) } end.time <- Sys.time() # using this to see run-time print(end.time - start.time) # convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas) colnames(all_data) <- supply$health_district trade_times <- as.data.frame(as.table(all_data)) colnames(trade_times) <- c("origin", "dest", "cost") trade_times$origin <- rep(new_demand$health_district, times = dim(supply)[1])
supply <- two_year_hd_supply # options for OSRM options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # can change option to car, bike, or walk # where do we get actual longitude-latitude data for the stores start.time <- Sys.time() # using this to see run-time all_data <- matrix(, nrow = 0, ncol = nrow(supply)) # maximum number of requests that OSRM can handle at a time - I don't know if there is still a limit on this, but I still use 1 million as the upper bound max.size <- 1000000 n <- floor(max.size / nrow(supply)) chunks <- ceiling((nrow(new_demand)) / n) for (i in 1 : chunks) { # if not at the final chunk if (i != chunks) { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("health_district", "longitude", "latitude")], dst = supply[, c("health_district", "longitude", "latitude")])$durations } # if at final chunk, only go until final row else { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("health_district", "longitude", "latitude")], dst = supply[, c("health_district", "longitude", "latitude")])$durations } # show percentage completion if (i == ceiling(chunks / 4)) {print( "25%" )} if (i == ceiling(chunks / 2)) {print( "50%" )} if (i == ceiling(3 * chunks / 4)) {print( "75%" )} all_data <- rbind(all_data, matrix) } end.time <- Sys.time() # using this to see run-time print(end.time - start.time) # convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas) colnames(all_data) <- supply$health_district twoyear_times <- as.data.frame(as.table(all_data)) colnames(twoyear_times) <- c("origin", "dest", "cost") twoyear_times$origin <- rep(new_demand$health_district, times = dim(supply)[1])
supply <- four_year_hd_supply # options for OSRM options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # can change option to car, bike, or walk # where do we get actual longitude-latitude data for the stores start.time <- Sys.time() # using this to see run-time all_data <- matrix(, nrow = 0, ncol = nrow(supply)) # maximum number of requests that OSRM can handle at a time - I don't know if there is still a limit on this, but I still use 1 million as the upper bound max.size <- 1000000 n <- floor(max.size / nrow(supply)) chunks <- ceiling((nrow(new_demand)) / n) for (i in 1 : chunks) { # if not at the final chunk if (i != chunks) { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("health_district", "longitude", "latitude")], dst = supply[, c("health_district", "longitude", "latitude")])$durations } # if at final chunk, only go until final row else { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("health_district", "longitude", "latitude")], dst = supply[, c("health_district", "longitude", "latitude")])$durations } # show percentage completion if (i == ceiling(chunks / 4)) {print( "25%" )} if (i == ceiling(chunks / 2)) {print( "50%" )} if (i == ceiling(3 * chunks / 4)) {print( "75%" )} all_data <- rbind(all_data, matrix) } end.time <- Sys.time() # using this to see run-time print(end.time - start.time) # convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas) colnames(all_data) <- supply$health_district fouryear_times <- as.data.frame(as.table(all_data)) colnames(fouryear_times) <- c("origin", "dest", "cost") fouryear_times$origin <- rep(new_demand$health_district, times = dim(supply)[1])
trade_hd_times.5 <- trade_times %>% group_by(origin) %>% top_n(cost, n = -5) %>% summarize(mean_drive_time_top5 = mean(cost), median_drive_time_top5 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>% mutate(region_type = "health district", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"), measure_units = "minutes") %>% merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>% rename(region_name = origin) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") trade_hd_times.10 <- trade_times %>% group_by(origin) %>% top_n(cost, n = -10) %>% summarize(mean_drive_time_top10 = mean(cost), median_drive_time_top10 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>% mutate(region_type = "health district", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"), measure_units = "minutes") %>% merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>% rename(region_name = origin) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") trade_hd_times <- rbind(trade_hd_times.5, trade_hd_times.10) twoyear_hd_times.5 <- twoyear_times %>% group_by(origin) %>% top_n(cost, n = -5) %>% summarize(mean_drive_time_top5 = mean(cost), median_drive_time_top5 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>% mutate(region_type = "health district", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"), measure_units = "minutes") %>% merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>% rename(region_name = origin) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") twoyear_hd_times.10 <- twoyear_times %>% group_by(origin) %>% top_n(cost, n = -10) %>% summarize(mean_drive_time_top10 = mean(cost), median_drive_time_top10 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>% mutate(region_type = "health district", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"), measure_units = "minutes") %>% merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>% rename(region_name = origin) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") twoyear_hd_times <- rbind(twoyear_hd_times.5, twoyear_hd_times.10) fouryear_hd_times.5 <- fouryear_times %>% group_by(origin) %>% top_n(cost, n = -5) %>% summarize(mean_drive_time_top5 = mean(cost), median_drive_time_top5 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>% mutate(region_type = "health district", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"), measure_units = "minutes") %>% merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>% rename(region_name = origin) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") fouryear_hd_times.10 <- fouryear_times %>% group_by(origin) %>% top_n(cost, n = -10) %>% summarize(mean_drive_time_top10 = mean(cost), median_drive_time_top10 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>% mutate(region_type = "health district", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"), measure_units = "minutes") %>% merge(health_district_2[, c("geoid", "health_district")], by.x = "origin", by.y = "health_district") %>% rename(region_name = origin) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") fouryear_hd_times <- rbind(fouryear_hd_times.5, fouryear_hd_times.10)
# get population under 15 years old va.co <- get_acs(geography = "county", year = 2019, variables = c(male_under_5 = "B01001_003", male_9 = "B01001_004", male_14 = "B01001_005", female_under_5 = "B01001_027", female_9 = "B01001_028", female_14 = "B01001_029"), state = "VA", survey = "acs5", output = "wide", geometry = TRUE) va.co.utm <- st_transform(va.co, crs = "+proj=longlat +datum=WGS84") va.co.utm <- va.co.utm[!st_is_empty(va.co.utm),] va.co.utm <- va.co.utm %>% mutate(centroid = st_centroid(st_geometry(va.co.utm))) va.co.utm$longitude = st_coordinates(va.co.utm$centroid)[,1] va.co.utm$latitude = st_coordinates(va.co.utm$centroid)[,2] new_demand <- va.co.utm
### trade schools trade_schools <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/less_two_geoid_ct.csv") trade_schools <- trade_schools[, c("tot_enrol", "latitude", "longitude", "geoid")] supply <- trade_schools %>% rename(county = geoid) # options for OSRM options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # can change option to car, bike, or walk # where do we get actual longitude-latitude data for the stores start.time <- Sys.time() # using this to see run-time all_data <- matrix(, nrow = 0, ncol = nrow(supply)) # maximum number of requests that OSRM can handle at a time - I don't know if there is still a limit on this, but I still use 1 million as the upper bound max.size <- 1000000 n <- floor(max.size / nrow(supply)) chunks <- ceiling((nrow(new_demand)) / n) for (i in 1 : chunks) { # if not at the final chunk if (i != chunks) { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")], dst = supply[, c("county", "longitude", "latitude")])$durations } # if at final chunk, only go until final row else { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")], dst = supply[, c("county", "longitude", "latitude")])$durations } # show percentage completion if (i == ceiling(chunks / 4)) {print( "25%" )} if (i == ceiling(chunks / 2)) {print( "50%" )} if (i == ceiling(3 * chunks / 4)) {print( "75%" )} all_data <- rbind(all_data, matrix) } end.time <- Sys.time() # using this to see run-time print(end.time - start.time) # convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas) colnames(all_data) <- supply$county trade_co_times <- as.data.frame(as.table(all_data)) colnames(trade_co_times) <- c("origin", "dest", "cost") trade_co_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1]) ### two-year colleges two_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/two_year_geoid_ct.csv") two_year_colleges <- two_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")] supply <- two_year_colleges %>% rename(county = geoid) # options for OSRM options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # can change option to car, bike, or walk # where do we get actual longitude-latitude data for the stores start.time <- Sys.time() # using this to see run-time all_data <- matrix(, nrow = 0, ncol = nrow(supply)) # maximum number of requests that OSRM can handle at a time - I don't know if there is still a limit on this, but I still use 1 million as the upper bound max.size <- 1000000 n <- floor(max.size / nrow(supply)) chunks <- ceiling((nrow(new_demand)) / n) for (i in 1 : chunks) { # if not at the final chunk if (i != chunks) { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")], dst = supply[, c("county", "longitude", "latitude")])$durations } # if at final chunk, only go until final row else { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")], dst = supply[, c("county", "longitude", "latitude")])$durations } # show percentage completion if (i == ceiling(chunks / 4)) {print( "25%" )} if (i == ceiling(chunks / 2)) {print( "50%" )} if (i == ceiling(3 * chunks / 4)) {print( "75%" )} all_data <- rbind(all_data, matrix) } end.time <- Sys.time() # using this to see run-time print(end.time - start.time) # convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas) colnames(all_data) <- supply$county two_year_co_times <- as.data.frame(as.table(all_data)) colnames(two_year_co_times) <- c("origin", "dest", "cost") two_year_co_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1]) ### four-year colleges four_year_colleges <- fread("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_post_hs_education/four_year_geoid_ct.csv") four_year_colleges <- four_year_colleges[, c("tot_enrol", "latitude", "longitude", "geoid")] supply <- four_year_colleges %>% rename(county = geoid) # options for OSRM options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # can change option to car, bike, or walk # where do we get actual longitude-latitude data for the stores start.time <- Sys.time() # using this to see run-time all_data <- matrix(, nrow = 0, ncol = nrow(supply)) # maximum number of requests that OSRM can handle at a time - I don't know if there is still a limit on this, but I still use 1 million as the upper bound max.size <- 1000000 n <- floor(max.size / nrow(supply)) chunks <- ceiling((nrow(new_demand)) / n) for (i in 1 : chunks) { # if not at the final chunk if (i != chunks) { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")], dst = supply[, c("county", "longitude", "latitude")])$durations } # if at final chunk, only go until final row else { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")], dst = supply[, c("county", "longitude", "latitude")])$durations } # show percentage completion if (i == ceiling(chunks / 4)) {print( "25%" )} if (i == ceiling(chunks / 2)) {print( "50%" )} if (i == ceiling(3 * chunks / 4)) {print( "75%" )} all_data <- rbind(all_data, matrix) } end.time <- Sys.time() # using this to see run-time print(end.time - start.time) # convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas) colnames(all_data) <- supply$county four_year_co_times <- as.data.frame(as.table(all_data)) colnames(four_year_co_times) <- c("origin", "dest", "cost") four_year_co_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])
trade_co_times.5 <- trade_co_times %>% group_by(origin) %>% top_n(cost, n = -5) %>% summarize(mean_drive_time_top5 = mean(cost), median_drive_time_top5 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>% mutate(region_type = "county", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") trade_co_times.10 <- trade_co_times %>% group_by(origin) %>% top_n(cost, n = -10) %>% summarize(mean_drive_time_top10 = mean(cost), median_drive_time_top10 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>% mutate(region_type = "county", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") trade_co_times.final <- rbind(trade_co_times.5, trade_co_times.10) twoyear_co_times.5 <- two_year_co_times %>% group_by(origin) %>% top_n(cost, n = -5) %>% summarize(mean_drive_time_top5 = mean(cost), median_drive_time_top5 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>% mutate(region_type = "county", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") twoyear_co_times.10 <- two_year_co_times %>% group_by(origin) %>% top_n(cost, n = -10) %>% summarize(mean_drive_time_top10 = mean(cost), median_drive_time_top10 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>% mutate(region_type = "county", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") twoyear_co_times <- rbind(twoyear_co_times.5, twoyear_co_times.10) fouryear_co_times.5 <- four_year_co_times %>% group_by(origin) %>% top_n(cost, n = -5) %>% summarize(mean_drive_time_top5 = mean(cost), median_drive_time_top5 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>% mutate(region_type = "county", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") fouryear_co_times.10 <- four_year_co_times %>% group_by(origin) %>% top_n(cost, n = -10) %>% summarize(mean_drive_time_top10 = mean(cost), median_drive_time_top10 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>% mutate(region_type = "county", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.co[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") fouryear_co_times <- rbind(fouryear_co_times.5, fouryear_co_times.10)
# get population under 15 years old va.tr <- get_acs(geography = "tract", year = 2019, variables = c(male_under_5 = "B01001_003", male_9 = "B01001_004", male_14 = "B01001_005", female_under_5 = "B01001_027", female_9 = "B01001_028", female_14 = "B01001_029"), state = "VA", survey = "acs5", output = "wide", geometry = TRUE) va.tr.utm <- st_transform(va.tr, crs = "+proj=longlat +datum=WGS84") va.tr.utm <- va.tr.utm[!st_is_empty(va.tr.utm),] va.tr.utm <- va.tr.utm %>% mutate(centroid = st_centroid(st_geometry(va.tr.utm))) va.tr.utm$longitude = st_coordinates(va.tr.utm$centroid)[,1] va.tr.utm$latitude = st_coordinates(va.tr.utm$centroid)[,2] new_demand <- va.tr.utm
# generic imports import numpy as np import pandas as pd # geospatial imports import geopandas as gpd import pygeos from pygeos import box, area, intersection # FCA pakage from access import Access, weights
### ADD GEOID COLUMN # create daycare locations array def get_tract_geoid(df): schools = gpd.GeoDataFrame(df, geometry = gpd.points_from_xy(df.longitude, df.latitude)) pg_schools_geoms = np.array([pygeos.Geometry(str(schools.geometry[i])) for i in range(len(schools))]) # convert daycare location to Pygeos # create Virginia block group array va_tracts = gpd.read_file("/project/biocomplexity/sdad/projects_data/mc/data_commons/dc_education_training/va_tr.shp") # read in virginia block group data va_tracts = va_tracts.loc[va_tracts.geometry.notna()].reset_index(drop = True) # drop empty geometries for geographies pg_geog_geoms = np.array([pygeos.Geometry(str(va_tracts.geometry[i])) for i in range(len(va_tracts))]) # convert to Pygeos # get indices of intersection and geoids idxs = pygeos.contains(pg_geog_geoms[:, np.newaxis], pg_schools_geoms[np.newaxis, :]) # get intersection (this can be done in sql) new_idxs = np.where(idxs)[1].argsort() # sort indices schools_geoids = va_tracts.GEOID.values[np.where(idxs)[0][new_idxs]] # get GEOIDs # add geoids column to dataframe missing_indices = np.where(np.diff(np.where(idxs)[1][new_idxs]) == 2)[0] + 1 # find rows where GEOID information is missing bad_df = schools.index.isin(missing_indices) schools = schools[~bad_df] # drop rows where we couldn't get GEOID schools['GEOID'] = schools_geoids # add GEOID column # create simplified supply data with GEOID, longitude, latitude, and capacity supply = schools[['GEOID', 'longitude', 'latitude', 'tot_enrol']] return supply
supply <- py$get_tract_geoid(trade_schools) # options for OSRM options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # can change option to car, bike, or walk # where do we get actual longitude-latitude data for the stores start.time <- Sys.time() # using this to see run-time all_data <- matrix(, nrow = 0, ncol = nrow(supply)) # maximum number of requests that OSRM can handle at a time - I don't know if there is still a limit on this, but I still use 1 million as the upper bound max.size <- 1000000 n <- floor(max.size / nrow(supply)) chunks <- ceiling((nrow(new_demand)) / n) for (i in 1 : chunks) { # if not at the final chunk if (i != chunks) { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")], dst = supply[, c("GEOID", "longitude", "latitude")])$durations } # if at final chunk, only go until final row else { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")], dst = supply[, c("GEOID", "longitude", "latitude")])$durations } # show percentage completion if (i == ceiling(chunks / 4)) {print( "25%" )} if (i == ceiling(chunks / 2)) {print( "50%" )} if (i == ceiling(3 * chunks / 4)) {print( "75%" )} all_data <- rbind(all_data, matrix) } end.time <- Sys.time() # using this to see run-time print(end.time - start.time) # convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas) colnames(all_data) <- supply$GEOID trade_tr_times <- as.data.frame(as.table(all_data)) colnames(trade_tr_times) <- c("origin", "dest", "cost") trade_tr_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1]) ### two-year colleges supply <- py$get_tract_geoid(two_year_colleges) # options for OSRM options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # can change option to car, bike, or walk # where do we get actual longitude-latitude data for the stores start.time <- Sys.time() # using this to see run-time all_data <- matrix(, nrow = 0, ncol = nrow(supply)) # maximum number of requests that OSRM can handle at a time - I don't know if there is still a limit on this, but I still use 1 million as the upper bound max.size <- 1000000 n <- floor(max.size / nrow(supply)) chunks <- ceiling((nrow(new_demand)) / n) for (i in 1 : chunks) { # if not at the final chunk if (i != chunks) { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")], dst = supply[, c("GEOID", "longitude", "latitude")])$durations } # if at final chunk, only go until final row else { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")], dst = supply[, c("GEOID", "longitude", "latitude")])$durations } # show percentage completion if (i == ceiling(chunks / 4)) {print( "25%" )} if (i == ceiling(chunks / 2)) {print( "50%" )} if (i == ceiling(3 * chunks / 4)) {print( "75%" )} all_data <- rbind(all_data, matrix) } end.time <- Sys.time() # using this to see run-time print(end.time - start.time) # convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas) colnames(all_data) <- supply$GEOID two_year_tr_times <- as.data.frame(as.table(all_data)) colnames(two_year_tr_times) <- c("origin", "dest", "cost") two_year_tr_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1]) ### four-year colleges supply <- py$get_tract_geoid(four_year_colleges) # options for OSRM options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car") # can change option to car, bike, or walk # where do we get actual longitude-latitude data for the stores start.time <- Sys.time() # using this to see run-time all_data <- matrix(, nrow = 0, ncol = nrow(supply)) # maximum number of requests that OSRM can handle at a time - I don't know if there is still a limit on this, but I still use 1 million as the upper bound max.size <- 1000000 n <- floor(max.size / nrow(supply)) chunks <- ceiling((nrow(new_demand)) / n) for (i in 1 : chunks) { # if not at the final chunk if (i != chunks) { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):(n * i), c("GEOID", "longitude", "latitude")], dst = supply[, c("GEOID", "longitude", "latitude")])$durations } # if at final chunk, only go until final row else { matrix <- osrmTable(src = new_demand[(1 + n * (i - 1)):nrow(new_demand), c("GEOID", "longitude", "latitude")], dst = supply[, c("GEOID", "longitude", "latitude")])$durations } # show percentage completion if (i == ceiling(chunks / 4)) {print( "25%" )} if (i == ceiling(chunks / 2)) {print( "50%" )} if (i == ceiling(3 * chunks / 4)) {print( "75%" )} all_data <- rbind(all_data, matrix) } end.time <- Sys.time() # using this to see run-time print(end.time - start.time) # convert data to times dataframe with origin, dest, and cost columns (needed for floating catchment areas) colnames(all_data) <- supply$GEOID four_year_tr_times <- as.data.frame(as.table(all_data)) colnames(four_year_tr_times) <- c("origin", "dest", "cost") four_year_tr_times$origin <- rep(new_demand$GEOID, times = dim(supply)[1])
trade_tr_times.5 <- trade_tr_times %>% group_by(origin) %>% arrange(cost) %>% slice(1:5) %>% summarize(mean_drive_time_top5 = mean(cost), median_drive_time_top5 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>% mutate(region_type = "tract", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") trade_tr_times.10 <- trade_tr_times %>% group_by(origin) %>% arrange(cost) %>% slice(1:5) %>% summarize(mean_drive_time_top10 = mean(cost), median_drive_time_top10 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>% mutate(region_type = "tract", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") trade_tr_times.final <- rbind(trade_tr_times.5, trade_tr_times.10) twoyear_tr_times.5 <- two_year_tr_times %>% group_by(origin) %>% arrange(cost) %>% slice(1:5) %>% summarize(mean_drive_time_top5 = mean(cost), median_drive_time_top5 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>% mutate(region_type = "tract", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") twoyear_tr_times.10 <- two_year_tr_times %>% group_by(origin) %>% top_n(cost, n = -10) %>% summarize(mean_drive_time_top10 = mean(cost), median_drive_time_top10 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>% mutate(region_type = "tract", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") twoyear_tr_times <- rbind(twoyear_tr_times.5, twoyear_tr_times.10) fouryear_tr_times.5 <- four_year_tr_times %>% group_by(origin) %>% arrange(cost) %>% slice(1:5) %>% summarize(mean_drive_time_top5 = mean(cost), median_drive_time_top5 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top5, median_drive_time_top5)) %>% mutate(region_type = "tract", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top5"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") fouryear_tr_times.10 <- four_year_tr_times %>% group_by(origin) %>% arrange(cost) %>% slice(1:5) %>% summarize(mean_drive_time_top10 = mean(cost), median_drive_time_top10 = median(cost)) %>% gather(measure, value, c(mean_drive_time_top10, median_drive_time_top10)) %>% mutate(region_type = "tract", year = "2021", measure_type = ifelse(measure %in% c("mean_drive_time_top10"), "mean", "median"), measure_units = "minutes") %>% merge(st_drop_geometry(va.tr[, c("GEOID", "NAME")]), by.x = "origin", by.y = "GEOID") %>% rename(geoid = origin, region_name = NAME) %>% relocate("geoid", "region_type", "region_name", "year", "measure", "value", "measure_type", "measure_units") fouryear_tr_times <- rbind(fouryear_tr_times.5, fouryear_tr_times.10)
source("~/git/VDH/src/helper_functions.R") con <- get_db_conn(db_pass = "rsu8zvrsu8zv") dc_dbWriteTable(con, "dc_education_training", "va_hd_osrm_2021_drive_times_nearest_trade_schools", trade_hd_times) dc_dbWriteTable(con, "dc_education_training", "va_hd_osrm_2021_drive_times_nearest_2year_colleges", twoyear_hd_times) dc_dbWriteTable(con, "dc_education_training", "va_hd_osrm_2021_drive_times_nearest_4year_colleges", fouryear_hd_times) dc_dbWriteTable(con, "dc_education_training", "va_ct_osrm_2021_drive_times_nearest_trade_schools", trade_co_times.final) dc_dbWriteTable(con, "dc_education_training", "va_ct_osrm_2021_drive_times_nearest_2year_colleges", twoyear_co_times) dc_dbWriteTable(con, "dc_education_training", "va_ct_osrm_2021_drive_times_nearest_4year_colleges", fouryear_co_times) dc_dbWriteTable(con, "dc_education_training", "va_tr_osrm_2021_drive_times_nearest_trade_schools", trade_tr_times.final) dc_dbWriteTable(con, "dc_education_training", "va_tr_osrm_2021_drive_times_nearest_2year_colleges", twoyear_tr_times) dc_dbWriteTable(con, "dc_education_training", "va_tr_osrm_2021_drive_times_nearest_4year_colleges", fouryear_tr_times) dbDisconnect(con)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.