Idea here - from Corelogic, get a couple of addresses from each block group (or nearby)

load packages

library(sf)
library(tidyverse)
library(tmap)
library(tmaptools)
library(tigris)
library(tidycensus)
library(rmapshaper)
library(matrixStats)
library(SpatialAcc)
library(reticulate)
library(dplyr)
library(tidygeocoder)
library(readxl)
library(DBI)

get residential mail addresses within state - for Maryland at the moment, but can change to VA OR DC

con <- get_db_conn(db_pass = "rsu8zvrsu8zv")
md_locs <- st_read(con, query = "
                         SELECT a.*
                         FROM corelogic_usda.current_tax_200627_latest_all_add_vars_add_progs_geom_blk as a
                         WHERE a.geoid_cnty LIKE '24%'") # CHANGE 24 to 11, 51, etc.
dbDisconnect(con)

data <- md_locs
data$property_indicator <- as.factor(data$property_indicator)

# Residential: 10 (Single Family Residence), 11 (Condominium), 21 (Duplex), 22 (Apartment)
# Considered 24 (Commercial Condominium) but cross-checking with land use code, county use code, and googling property addresses
# showed these were in fact all medical offices, business offices, etc. Excluding here.
data <- data %>% filter(property_indicator == 10 | # single family residence
                        property_indicator == 11 | # condo
                        property_indicator == 21 | # duplex
                        property_indicator == 22)  # apartment

data$geoid_bg <- substr(data$geoid_blk, 1, 12)
data$geoid_tr <- substr(data$geoid_blk, 1, 11)
md.data <- data[grepl(" MD ", data$mail_address),] # change to DC, VA, etc.

get top 3 addresses near block group

APPROACH - GET 5 ADDRESSES LISTED WITHIN EACH BLOCK GROUP, THEN find closest addresses to bg centroids within this group

md.bg <- get_acs(geography = "block group",
                 year = 2019,
                 variables = c(median_household_income = "B19013_001"),
                 state = "MD",
                 survey = "acs5",
                 output = "wide",
                 geometry = T) # need geography for st_centroid


md.data.2 <- md.data %>% distinct(mail_address, .keep_all = TRUE)

# randomly sample 5 addresses "within" each block group (within = based on the label for that address)
all.md.data <- md.data.2 %>%
  group_by(geoid_bg) %>%
  sample_n(5, replace = T) %>%
  select(mail_address, property_centroid_latitude, property_centroid_longitude, geoid_bg)

x <- st_distance(all.md.data, st_centroid(md.bg) %>% st_transform(st_crs(all.md.data))) # get distance matrix

# empty final sample matrix
all.md.data.2 <- as.data.frame(matrix(, nrow = 0, ncol = 3))

# loop over block groups
for (i in 1:(dim(x)[2]))
{
  # get closest 3 addresses to block group centroid within our random sample
  all.md.data.2 <- rbind(all.md.data.2, st_drop_geometry(all.md.data[which(x[, i] %in% sort(x[, i])[1:3]),]) %>% 
    select(mail_address, property_centroid_latitude, property_centroid_longitude) %>%
    mutate(geoid_bg = md.bg[i,]$GEOID) %>%
    sample_n(3)) # Not sure about - Adding missing grouping variables: `geoid_bg`
}

# APPROACH - GET 5 ADDRESSES LISTED WITHIN EACH BLOCK GROUP, THEN find closest addresses to bg centroids within this group

# there are some repeats (again this is the closest address to that block group, so it will be roughly accurate - given that there isn't a lot of variability I genuinely don't think it will matter)
write.csv(all.md.data.2, "md_three_address_in_block_group.csv") # use this CSV for part 2 !


uva-bi-sdad/dc.broadbandnow.broadband_prices documentation built on June 13, 2022, 4:41 a.m.