claims_db/phclaims/stage/tables/load_stage.address_clean_full.R

#### CODE TO SET UP OR UPDATE ADDRESS_CLEAN TABLES
# Alastair Matheson, PHSKC (APDE)
#
# 2019-04


#### Set up global parameter and call in libraries ####
options(max.print = 350, tibble.print_max = 50, scipen = 999)

library(tidyverse) # Manipulate data
library(odbc) # Read to and write from SQL
library(RCurl) # Read files from Github
library(sf) # Read shape files

db_apde51 <- dbConnect(odbc(), "PH_APDEStore51")
db_claims <- dbConnect(odbc(), "PHClaims51")

geocode_path <- "//dchs-shares01/DCHSDATA/DCHSPHClaimsData/Geocoding"
source("https://raw.githubusercontent.com/PHSKC-APDE/claims_data/main/claims_db/db_loader/scripts_general/create_table.R")

table_config <- yaml::yaml.load(getURL(
    "https://raw.githubusercontent.com/PHSKC-APDE/claims_data/main/claims_db/phclaims/stage/tables/load_stage.address_clean.yaml"))


#### INITAL ADDRESS_CLEAN SETUP ####
# Take address data from all claims sources (Medicaid)
# Add in addresses from other sources (Housing)
# Add in manually corrected addresses
# combine into a single file


### Create SQL table
create_table_f(conn = db_claims, 
               config_url = "https://raw.githubusercontent.com/PHSKC-APDE/claims_data/main/claims_db/phclaims/stage/tables/load_stage.address_clean.yaml",
               overall = T, ind_yr = F)


### Bring in manual corrections
manual_add <- read.csv(file.path(geocode_path,
                                 "Medicaid_eligibility_specific_addresses_fix_updated - DO NOT SHARE FILE.csv"),
                       stringsAsFactors = F)

manual_add <- manual_add %>% 
  mutate_all(funs(ifelse(. == "", NA_character_, .))) %>%
  mutate(geo_zip_raw = as.character(geo_zip_raw),
         geo_zip_clean = as.character(geo_zip_clean))


### Bring in all Medicaid addresses
# Seems slightly quicker to push addresses to a temp table than take distinct
try(dbRemoveTable(db_claims, "##add_temp", temporary = T))
dbGetQuery(db_claims, 
           "SELECT 
           RSDNTL_ADRS_LINE_1 AS geo_add1_raw, 
           RSDNTL_ADRS_LINE_2 AS geo_add2_raw,
           RSDNTL_CITY_NAME AS geo_city_raw, 
           RSDNTL_STATE_CODE AS geo_state_raw, 
           RSDNTL_POSTAL_CODE AS geo_zip_raw,
           1 AS geo_source_mcaid,
           NULL AS geo_source_pha 
           INTO ##add_temp 
           FROM stage.mcaid_elig")


mcaid_add <- dbGetQuery(db_claims, 
                        "SELECT DISTINCT geo_add1_raw, geo_add2_raw, 
                        geo_city_raw, geo_state_raw, geo_zip_raw, 
                        geo_source_mcaid, geo_source_pha
                        FROM ##add_temp ")


### Bring in complete public housing authority addresses
pha_add_full <- dbGetQuery(db_apde51, 
                      "SELECT unit_add, unit_apt, unit_apt2, 
                      unit_city, unit_state, unit_zip,
                      geo_add1_raw, geo_add2_raw, 
                      geo_city_raw, geo_state_raw, geo_zip_raw,
                      NULL AS geo_source_mcaid,
                      1 AS geo_source_pha  
                      FROM stage.pha_address_new")


# Subset to partially cleaned addresses and set missing add to NA to match Medicaid
pha_add <- pha_add_full %>%
  distinct(geo_add1_raw, geo_add2_raw, geo_city_raw, geo_state_raw, geo_zip_raw,
           geo_source_mcaid, geo_source_pha) %>%
  mutate_all(funs(ifelse(. == "" | . == "NA", NA_character_, .)))


### Find any address common to both data sets
joint_add <- inner_join(select(mcaid_add, geo_add1_raw, geo_add2_raw, 
                               geo_city_raw, geo_state_raw, geo_zip_raw, geo_source_mcaid),
                        select(pha_add, geo_add1_raw, geo_add2_raw, 
                               geo_city_raw, geo_state_raw, geo_zip_raw, geo_source_pha),
                        by = c("geo_add1_raw", "geo_add2_raw", 
                               "geo_city_raw", "geo_state_raw", "geo_zip_raw"))



# Remove common addresses from source
# Make second version of mcaid_add because it takes so long to read in
mcaid_add2 <- anti_join(mcaid_add, 
                     select(joint_add, geo_add1_raw, geo_add2_raw, 
                            geo_city_raw, geo_state_raw, geo_zip_raw),
                     by = c("geo_add1_raw", "geo_add2_raw", 
                            "geo_city_raw", "geo_state_raw", "geo_zip_raw"))

pha_add <- anti_join(pha_add, 
                     select(joint_add, geo_add1_raw, geo_add2_raw, 
                            geo_city_raw, geo_state_raw, geo_zip_raw),
                     by = c("geo_add1_raw", "geo_add2_raw", 
                            "geo_city_raw", "geo_state_raw", "geo_zip_raw"))


### Combine files
combined_add <- bind_rows(mcaid_add2, pha_add, joint_add)


### Remove manually corrected addresses from combined list
combined_add <- anti_join(combined_add, manual_add,
                          by = c("geo_add1_raw", "geo_add2_raw",  
                                 "geo_city_raw", "geo_state_raw", "geo_zip_raw"))


# Make all NA actual NA
combined_add <- combined_add %>%
  mutate_at(vars(geo_add1_raw, geo_add2_raw, geo_city_raw, geo_state_raw, geo_zip_raw),
            funs(ifelse(. == "" | . == "NA" | is.na(.), NA_character_, .)))



#### SUBSEQUENT ADDRESS_CLEAN SETUP ####
# Take addresses from source
# Anti-join to existing ref.address_clean

### Create SQL table
create_table_f(conn = db_claims, 
               config_url = "https://raw.githubusercontent.com/PHSKC-APDE/claims_data/main/claims_db/phclaims/stage/tables/create_stage.address_clean.yaml",
               overall = T, ind_yr = F)

# MORE CODE TO COME



#### ADDRESS_CLEAN INFORMATICA STEP ####
# Run through Informatica
# Make any manual changes
# Create/add to existing table

### Write out to Informatica
combined_add_out <- combined_add %>%
  distinct(geo_add1_raw, geo_add2_raw, geo_city_raw, geo_state_raw, geo_zip_raw)

write.csv(combined_add_out, paste0("//kcitetldepim001/Informatica/address/combined_add_",
                                   Sys.Date(), ".csv"))


### Read in addresses run though Informatica
combined_add_in <- data.table::fread(file = "//kcitetldepim001/Informatica/address/cleaned_addresses_new.csv")



#### FOR FUTURE FIXING HERE ####
# 1) Load in multiple informatica output files
# 2) Remove manually corrected addresses from Informatica results
#   (some addresses were corrected after being run through Informatica)





### Convert missing to NA so joins work and take distinct
combined_add_in_clean <- combined_add_in %>%
  mutate_at(vars(add1, add2, po_box, city, state, zip, 
                 old_add1, old_add2, old_city, old_state, old_zip),
            funs(ifelse(. == "" | . == "NA" | is.na(.), NA_character_, .))) %>%
  select(-`#id`) %>%
  distinct()

### Tidy up some PO box and other messiness
combined_add_in_clean <- combined_add_in_clean %>%
  mutate(movetype = case_when(
    # PO and BOX were split = movetype 1
    str_detect(add1, "^BOX[:space:]?[0-9]*$") & str_detect(po_box, "PO BOX") ~ 1,
    # add1 had number before PO BOX = movetype 2
    str_detect(add1, "^[0-9]* PO BOX$") & str_detect(po_box, "PO BOX") ~ 2,
    # add1 one has PO Box number but nothing else = movetype 3
    str_detect(add1, "^[0-9]*$") & po_box == "PO BOX" ~ 3,
    # add2 has secondary designator and po_box has details = movetype 4
    !is.na(add1) & !is.na(add2) & !is.na(po_box) ~ 4,
    # add2 has orphan secondary designator and po_box has details = movetype 5
    is.na(add1) & !is.na(add2) & !is.na(po_box) ~ 5,
    # add1 is street address, add2 is NA and po_box has box details = movetype 6
    !is.na(add1) & is.na(add2) & !is.na(po_box) ~ 6,
    # only address is the po_box = movetype 8
    is.na(add1) & is.na(add2) & !is.na(po_box) ~ 7
  )
  ) %>%
  mutate(add1 = case_when(
    movetype == 1 ~ paste0("PO ", str_sub(add1, 1, nchar(add1))),
    movetype == 2 ~ po_box,
    movetype == 3 ~ paste(po_box, add1),
    movetype == 4 ~ add1,
    movetype == 5 ~ po_box,
    movetype == 6 ~ add1,
    movetype == 7 ~ po_box,
    TRUE ~ add1
  ),
  add2 = case_when(
    movetype == 1 ~ add2,
    movetype == 2 ~ NA_character_,
    movetype == 3 ~ NA_character_,
    movetype == 4 ~ paste(add2, po_box),
    movetype == 5 ~ add2,
    movetype == 6 ~ po_box,
    TRUE ~ add2
  ),
  # Now set PO box field to be a simple flag
  po_box = as.numeric(ifelse(!is.na(po_box), 1, 0)),
  # Fix up when some numbers when leading zeros stripped
  old_add1 = case_when(
    old_add1 == "10040" ~ "00010040",
    old_add1 == "241" ~ "0241",
    old_add1 == "269001164" ~ "0269001164",
    old_add1 == "3204" ~ "03204",
    old_add1 == "1.00E-101" ~ "1E-101",
    old_add1 == "1.10E-201" ~ "11E-202",
    TRUE ~ old_add1),
  old_zip = as.character(old_zip),
  zip = as.character(zip)
  ) %>%
  # The 2019-04-29 run through Informatica had secondary addresses as dates
  # This is now fixed but must be corrected here for join to work
  mutate_at(vars(old_add1, old_add2),
            funs(case_when(
              . == "1-Jan" ~ "1-1",
              . == "2-Jan" ~ "1/2",
              . == "4-Jan" ~ "1-4",
              . == "7-Jan" ~ "1-7",
              . == "1-Feb" ~ "2-1",
              . == "2-Feb" ~ "2-2",
              . == "5-Feb" ~ "2-5",
              . == "6-Feb" ~ "2-6",
              . == "8-Feb" ~ "2-8",
              . == "13-Feb" ~ "2-13",
              . == "4-Mar" ~ "3-4",
              . == "Mar-32" ~ "3-32",
              . == "1-Apr" ~ "4-1",
              . == "4-Apr" ~ "4-4",
              . == "6-Apr" ~ "4-6",
              . == "7-Apr" ~ "4-7",
              . == "16-Apr" ~ "4-16",
              . == "3-May" ~ "5-3",
              . == "4-May" ~ "5-4",
              . == "7-May" ~ "5-7",
              . == "2-Jun" ~ "6-2",
              . == "3-Jun" ~ "6-3",
              . == "4-Jun" ~ "6-4",
              . == "5-Jun" ~ "6-5",
              . == "6-Jun" ~ "6-6",
              . == "9-Jun" ~ "6-9",
              . == "Jun-31" ~ "6-31",
              . == "Jun-45" ~ "6-45",
              . == "3-Jul" ~ "7-3",
              . == "5-Jul" ~ "7-5",
              . == "8-Jul" ~ "7-8",
              . == "1-Aug" ~ "8-1",
              . == "2-Aug" ~ "8-2",
              . == "3-Aug" ~ "8-3",
              . == "28-Aug" ~ "8-28",
              . == "1-Sep" ~ "9-1",
              . == "2-Sep" ~ "9-2",
              . == "4-Sep" ~ "9-4",
              . == "8-Sep" ~ "9-8",
              . == "Sep-24" ~ "9-24",
              . == "Sep-40" ~ "9-40",
              . == "1-Oct" ~ "10-1",
              . == "3-Oct" ~ "10-3",
              . == "4-Oct" ~ "10-4",
              . == "5-Oct" ~ "10-5",
              . == "1-Nov" ~ "11-1",
              . == "3-Nov" ~ "11-3",
              . == "4-Nov" ~ "11-4",
              . == "6-Nov" ~ "11-6",
              . == "2-Dec" ~ "12-2",
              . == "8-Dec" ~ "12-08",
              . == "22-Dec" ~ "12-22",
              TRUE ~ .
            ))) %>%
  select(-`#id`, -movetype) %>%
  distinct()

# Tidy up some situations where Informatica stripped out the secondary unit
#    but it seems to actually exist
secondary_init <- c("^#", "^# ", "^\\$", "^APT", "^APPT","^APARTMENT", "^APRT", "^ATPT", 
                    "^BOX", "^BLDG", "^BLD", "^BLG", "^BUILDING", "^DUPLEX", "^FL ", 
                    "^FLOOR", "^HOUSE", "^LOT", "^LOWER", "^LOWR", "^LWR", "^REAR", 
                    "^RM", "^ROOM", "^SLIP", "^STE", "^SUITE", "^SPACE", "^SPC", 
                    "^STUDIO", "^TRAILER", "^TRAILOR", "^TLR", "^TRL", "^TRLR", 
                    "^UNIT", "^UPPER", "^UPPR", "^UPSTAIRS")

combined_add_in_clean <- combined_add_in_clean %>%
  mutate(add2 = case_when(
    is.na(add2) & !is.na(old_add2) & 
      str_detect(old_add2, paste(secondary_init, collapse = "|")) ~ old_add2,
    TRUE ~ add2
  ),
  add2 = case_when(
    is.na(add2) & !is.na(old_add1) & 
      str_detect(old_add1, paste(secondary_init, collapse = "|")) ~ old_add1,
    TRUE ~ add2
  )
  ) %>%
  distinct()


### Make new combined file
combined_add_clean <- left_join(combined_add, combined_add_in_clean, 
                          by = c("geo_add1_raw" = "old_add1", 
                                 "geo_add2_raw" = "old_add2", 
                                 "geo_city_raw" = "old_city", 
                                 "geo_state_raw" = "old_state", 
                                 "geo_zip_raw" = "old_zip")) %>%
  rename(geo_add1_clean = add1,
         geo_add2_clean = add2,
         geo_city_clean = city,
         geo_state_clean = state,
         geo_zip_clean = zip)


#### COMBINE ALL ADDDRESSES ####
combined_add_full <- bind_rows(combined_add_clean, manual_add) %>%
  distinct()

# Show how many matched
combined_add_full %>% mutate(matched = ifelse(is.na(mailabilty_score), 0, 1)) %>%
  group_by(matched) %>% summarise(count = n())
# How many matched after removing manual corrections
combined_add_full %>% mutate(matched = ifelse(is.na(mailabilty_score), 0, 1)) %>%
  filter(is.na(overridden)) %>%
  group_by(matched) %>% summarise(count = n())

# Tidy up columns
combined_add_full <- combined_add_full %>%
  mutate(po_box = ifelse(po_box == 1 | !is.na(mailbox), 1, 0)) %>%
  #select(-homeless, -mailbox, -care_of, -notes) %>%
  distinct()


### Join back to PHA address full to add manual corrections and retrieve original
#  PHA addresses


combined_add_full2 <- full_join(combined_add_full, 
                                pha_add_full %>% 
                                  mutate_all(funs(ifelse(. == "" | . == "NA", NA_character_, .))),
                                by = c("geo_add1_raw", "geo_add2_raw",
                                       "geo_city_raw", "geo_state_raw", "geo_zip_raw"))
combined_add_full2 <- combined_add_full2 %>%
  mutate(pha_xfer = ifelse(!is.na(geo_source_pha.y) & is.na(geo_add1_clean) & 
                             is.na(geo_add2_clean) & is.na(geo_city_clean) & 
                             is.na(geo_state_clean) & is.na(geo_zip_clean), 1, 0),
         geo_add1_clean = ifelse(pha_xfer == 1, geo_add1_raw, geo_add1_clean),
         geo_add2_clean = ifelse(pha_xfer == 1, geo_add2_raw, geo_add2_clean),
         geo_city_clean = ifelse(pha_xfer == 1, geo_city_raw, geo_city_clean),
         geo_state_clean = ifelse(pha_xfer == 1, geo_state_raw, geo_state_clean),
         geo_zip_clean = ifelse(pha_xfer == 1, geo_zip_raw, geo_zip_clean),
         geo_add1_raw = ifelse(geo_source_pha.y == 1 & !is.na(geo_source_pha.y), 
                               unit_add, geo_add1_raw),
         geo_add2_raw = ifelse(geo_source_pha.y == 1 & !is.na(geo_source_pha.y),
                               unit_apt, geo_add2_raw),
         geo_add3_raw = unit_apt2,
         geo_city_raw = ifelse(geo_source_pha.y == 1 & !is.na(geo_source_pha.y), 
                               unit_city, geo_city_raw),
         geo_state_raw = ifelse(geo_source_pha.y == 1 & !is.na(geo_source_pha.y), 
                                unit_state, geo_state_raw),
         geo_zip_raw = ifelse(geo_source_pha.y == 1 & !is.na(geo_source_pha.y), 
                              unit_zip, geo_zip_raw),
         geo_source_mcaid = case_when(
           geo_source_mcaid.x == 1 & !is.na(geo_source_mcaid.x) ~ 1,
           overridden == 1 & !is.na(overridden) ~ 1,
           TRUE ~ 0),
         geo_source_pha = case_when(
           geo_source_pha.x == 1 & !is.na(geo_source_pha.x) ~ 1,
           geo_source_pha.y == 1 & !is.na(geo_source_pha.y) ~ 1,
           TRUE ~ 0),
         overridden = ifelse(is.na(overridden), 0, 1)
         ) %>%
  select(geo_add1_raw, geo_add2_raw, geo_add3_raw, geo_city_raw, geo_state_raw, geo_zip_raw,
         geo_add1_clean, geo_add2_clean, geo_city_clean, geo_state_clean, geo_zip_clean,
         geo_source_mcaid, geo_source_pha, po_box, overridden) %>%
  # Need to update the sources now that the original PHA addresses are back
  group_by(geo_add1_raw, geo_add2_raw, geo_add3_raw, 
           geo_city_raw, geo_state_raw, geo_zip_raw) %>%
  mutate(geo_source_mcaid = max(geo_source_mcaid),
         geo_source_pha = max(geo_source_pha)) %>%
  ungroup() %>%
  distinct()





#### WRITE TO SQL ####
### Pull out relevant fields
combined_add_full_load <- combined_add_full2 %>%
  distinct(geo_add1_raw, geo_add2_raw, geo_add3_raw, geo_city_raw, geo_state_raw, geo_zip_raw,
           geo_add1_clean, geo_add2_clean, geo_city_clean, geo_state_clean, geo_zip_clean,
           geo_source_mcaid, geo_source_pha) %>%
  mutate(last_run = Sys.time())

### One last check that missing are actually NA
combined_add_full_load <- combined_add_full_load %>%
  mutate_all(., list( ~ ifelse(. %in% c("", "NA"), NA, .)))



dbWriteTable(db_claims,
             name = DBI::Id(schema = as.character(table_config$schema), 
                            table = as.character(table_config$table)), 
             combined_add_full_load, overwrite = T,
             field.types = unlist(table_config$vars))







#### GEOCODING STEP ####
### Eventually the results of the geocoding might be used to clean addresses further,
#      though ArcGIS doesn't do a great job of this

### Bring in previously geocoded data (ESRI)
geocoded_2018_06_12 <- read_sf(file.path(geocode_path, 
                                         "Distinct_addresses_geocoded_2018-06-20.shp"))

geocoded_2018_06_12 <- geocoded_2018_06_12 %>%
  as.data.frame() %>%
  filter(Status == "M" & Loc_name != "zip_5_digit_gc" & !is.na(Loc_name)) %>%
  select(add1, city, state, zip_1, -geometry) %>%
  rename(geo_add1_raw = add1,
         geo_city_raw = city,
         geo_state_raw = state,
         geo_zip_raw = zip_1) %>%
  distinct() %>%
  mutate(geocoded = 1,
         geo_zip_raw = as.character(geo_zip_raw))


### Flag those that have been geocoded already
combined_add_for_geo <- left_join(combined_add_full, geocoded_2018_06_12,
                                  by = c("geo_add1_raw", "geo_city_raw", 
                                         "geo_state_raw", "geo_zip_raw"))


### Write out file for geocoding (drop add2)
combined_add_for_geo <- combined_add_for_geo %>%
  filter(is.na(geocoded)) %>%
  distinct(geo_add1_clean, geo_city_clean, geo_state_clean, geo_zip_clean) %>%
  # Add a unique ID also for binding later
  mutate(row_id = row_number())

write.csv(combined_add_for_geo,
          paste0("//dchs-shares01/DCHSDATA/DCHSPHClaimsData/Geocoding/",
                    "distinct_addresses_", Sys.Date(), ".csv"),
          row.names = F)





##### TEMP #####
combined_add_clean %>% as.data.frame() %>% sample_n(10)




library(data.table)
combined_add_check <- setDT(combined_add_clean)
combined_add_check[, rows := .N, 
                   by = .(geo_add1_raw, geo_add2_raw, geo_city_raw, geo_state_raw, geo_zip_raw)]
combined_add_check <- combined_add_check[order(geo_add1_raw, geo_add2_raw, geo_city_raw, geo_state_raw, geo_zip_raw)]


combined_add_check <- combined_add_clean %>%
  group_by(geo_add1_raw, geo_add2_raw, geo_city_raw, geo_state_raw, geo_zip_raw) %>%
  mutate(rows = n()) %>%
  ungroup() %>% 
  arrange(geo_add1_raw, geo_add2_raw, geo_city_raw, geo_state_raw, geo_zip_raw)


combined_add_check %>% filter(rows > 1) %>% head() %>% as.data.frame()



manual_add_check = left_join(manual_add, combined_add_in,
                             by = c("geo_add1_raw" = "old_add1",
                                    "geo_add2_raw" = "old_add2",
                                    "geo_city_raw" = "old_city",
                                    "geo_state_raw" = "old_state",
                                    "geo_zip_raw" = "old_zip")) %>%
  select(-overridden, -`#id`)

manual_add_check_same <- manual_add_check %>%
  distinct() %>%
  mutate(rows = row_number() + 1) %>%
  filter(notes == "" & is.na(homeless) & is.na(care_of) & !is.na(mailabilty_score) &
           paste0("APT ", geo_add2_raw) == geo_add2_clean) %>%
  select(-notes, -homeless, -care_of, -mailabilty_score, -mailbox, -geo_state_clean)
PHSKC-APDE/claims_data documentation built on May 8, 2024, 1:20 a.m.