knitr::opts_chunk$set(echo = TRUE)

libraries

library(sf)
library(tidygeocoder)
library(dplyr)
library(community)
library(readr)

clean df

#start clean
rm(list=ls())

working directory

setwd("~/VDH/Floating_Catchment_Areas/va/1. obgyn_va_model/")

load data provider

#provider <- read.csv("pediat.dmv.geo.csv", row.names = 1)

#from pgadmin
#a) conn
con <- RPostgreSQL::dbConnect(drv = RPostgreSQL::PostgreSQL(),
                               dbname = "sdad",
                               host = "postgis1",
                               port = 5432,
                               user = Sys.getenv(x = "DB_USR"),
                               password = Sys.getenv(x = "DB_PWD"))

#b) query
provider  <- sf::st_read(
  con, 
  query= "
    SELECT *
  FROM dc_health_behavior_diet.va_pl_webmd_2021_doctors_obgyn"
)

#c) Disconnect
RPostgreSQL::dbDisconnect(con)

names(provider)[names(provider)=='latitude'] <- 'lat'
names(provider)[names(provider)=='longitude'] <- 'lon'
provider <- st_drop_geometry(provider)
#write
#write.csv(provider, "urgent_care_va.csv" )

fix data

## collapse by location
provider$doctors <- 1
provider$location <- paste0(provider$lat, ",", provider$lon)
#identify unique values
provider <- provider %>% distinct(name, location, .keep_all = TRUE)

counts <- tapply(provider$doctors, provider$location, sum)
locations <- which(!duplicated(provider$location))
provider <- provider[locations,]
provider$doctors <- counts[provider$location]

## assign IDs just to be explicit
provider$ID <- paste0("l", seq_len(nrow(provider)))


provider <- provider %>% filter(!is.na(lat))
# provider$state <- substr(provider$geoid_blk,1,2)
# provider %>% group_by(state) %>% summarise(doctors=sum(doctors))

data combined

library(tidycensus)
library(tidyverse)

census_api_key("eba406410c653b81d6a795ac4e989221f7bdf302")

# Bring in census tract data. 
# va <- get_acs(geography = "block group", 
#                         year = 2019,
#                         variables = c(population = "B01003_001E",
#                                       medinc = "B19013_001E"
#                         ),
#                         state = "VA",
#                         survey = "acs5",
#                         output = "wide",
#                         geometry = TRUE)

pop_va_fem <- get_acs(geography = "block group", 
                  year = 2019,
                  variables = c(population = "B01001_001",
                                male= "B01001_002",
                                female = "B01001_026", 
                                FemaleUnder5years="B01001_027",
                                Female5to9years="B01001_028",
                                Female10to14years="B01001_029",
                                Female15to17years="B01001_030",
                                Female18and19years="B01001_031",
                                Female20years="B01001_032",
                                Female21years="B01001_033",
                                Female22to24years="B01001_034",
                                Female25to29years="B01001_035",
                                Female30to34years="B01001_036",
                                Female35to39years="B01001_037",
                                Female40to44years="B01001_038",
                                Female45to49years="B01001_039",
                                Female50to54years="B01001_040",
                                Female55to59years="B01001_041",
                                Female60and61years="B01001_042",
                                Female62to64years="B01001_043",
                                Female65and66years="B01001_044",
                                Female67to69years="B01001_045",
                                Female70to74years="B01001_046",
                                Female75to79years="B01001_047",
                                Female80to84years="B01001_048",
                                Female85yearsandover="B01001_049"

                  ),
                  state = "VA",
                  survey = "acs5",
                  output = "wide",
                  geometry = TRUE)


va <- pop_va_fem %>% mutate( obgyn_pop= femaleE - FemaleUnder5yearsE -Female5to9yearsE - Female10to14yearsE ) %>% 
  select(GEOID, NAME, population=populationE, pop_fem= femaleE, obgyn_pop )

# data combined
#centroid and coordinates

data_combined <- data.frame(
  GEOID = va$GEOID,
  population = va$population,
  obgyn_pop = va$obgyn_pop,
st_coordinates(st_centroid(va$geometry))
)

data_combined <- data_combined %>% filter(!is.na(data_combined$X))

data_combined <- data_combined %>% filter(GEOID != 517000323001 )
data_combined <- data_combined %>% filter(GEOID != 517000323002 )
data_combined <- data_combined %>% filter(GEOID != 517000323003 )

travel time

library(osrm)
options(osrm.server = Sys.getenv("OSRM_SERVER"), osrm.profile = "car")
if(!file.exists("traveltimes_exercise.csv")){
  traveltimes <- osrmTable(
    src = data_combined[, c("GEOID", "X", "Y")],  #population-demand
    dst = st_drop_geometry( provider[, c("ID", "lon", "lat")] )    #providers supply
  )$duration
  write.csv(
    cbind(GEOID = rownames(traveltimes), as.data.frame(traveltimes)),
    "traveltimes_exercise.csv", row.names = FALSE
  )
}

traveltimes <- read.csv("traveltimes_exercise.csv", row.names = 1)

add1. Define geography id. This is because the Geography-GEOID from initial file may be outdated

library(tigris)
library(maps)
library(sf)
# add block geoids
# get US blocks shapefile
blocks_VA <- st_as_sf(block_groups(state="VA", year=2019)) #, year=2010

blocks <- blocks_VA
# lon and lat to geo-points
geopts <- provider %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4269) #4326. initial: 4269
# indeces of bgs which contain a geopoint
inds <- st_within(geopts$geometry, blocks$geometry, sparse=T)
blk_list <- c()
for (i in inds){
  if (identical(blocks$NAME[i],character(0))){
    blk_list<- append(blk_list, NA)}
  else{
    blk_list <- append(blk_list, blocks$GEOID[i])}
}
provider['GEOID'] <- blk_list

add2 count providers per geography using matching codes

#providers w geoid
#provider$GEOID <- substr(provider$geoid_blk_new, 1, 12 ) 
#data_combined with geoid: ok
num_providers <- provider %>% group_by(GEOID) %>% summarise(prov_cnt = sum(doctors) )

#join providers to block groups
data_combined$GEOID <-  as.character( data_combined$GEOID)
data_combined <- data_combined %>% left_join(num_providers, by= "GEOID" )

sum(data_combined$prov_cnt, na.rm = TRUE)

add3 mean and median of 10 nearest drive times

#mean of 10 nearest
top_mean <- function(x) {  
   mean(head(sort(x ), 10) ) }
#median of 10 nearest
top_median <- function(x) {  
   median(head(sort(x ), 10) ) }
#apply rowwise
traveltimes_near <- data.frame(near_10_mean=apply(traveltimes, 1, top_mean), 
                               near_10_median=apply(traveltimes, 1, top_median)) 
#rownames_to_column(traveltimes_near, var = "GEOID")
traveltimes_near$GEOID <- row.names(traveltimes_near) 
#join mean median traveltimes to geographies
data_combined <- data_combined %>% left_join(traveltimes_near, by= "GEOID")
#data_combined <- st_drop_geometry(data_combined)

prepara data for save

#raw traveltimes: traveltimes matrix already estimated and with colnames arranged 
traveltimes <- read.csv("traveltimes_exercise.csv", row.names = 1)
#population: always recheck relevant population: ie. for pediatrics: pop 0-17 years
population <- data_combined %>% select(GEOID, obgyn_pop, prov_cnt, near_10_mean, near_10_median)
# realign travel times
traveltimes <- traveltimes[as.character(population$GEOID), provider$ID]

save new data

write.csv(provider[, c("ID", "address", "lat", "lon", "doctors")], "provider.csv", row.names = FALSE)
write.csv(cbind(GEOID = rownames(traveltimes), traveltimes), "traveltimes_trimmed.csv", row.names = FALSE)
write.csv(population, "population.csv", row.names = FALSE)


uva-bi-sdad/dc.webmd.obgyn documentation built on June 20, 2022, 3:57 p.m.