dev/puf_scratch.r

library(magrittr)
library(plyr) # needed for ldply; must be loaded BEFORE dplyr
library(tidyverse)
options(tibble.print_max = 65, tibble.print_min = 65) # if more than 60 rows, print 60 - enough for states
# ggplot2 tibble tidyr readr purrr dplyr stringr forcats
library(scales)
library(hms) # hms, for times
library(lubridate) # lubridate, for date/times
library(vctrs)

library(btools) # has a few useful functions -- devtools::install_github("donboyd5/btools")

library(geoWeight)


dbdir <- "C:/Users/donbo/Dropbox (Personal)/"
puf_2017 <- read_csv(file = paste0(dbdir, 'puf_2017_filter.csv'))
ht(puf_2017)

puf_targ <- read_csv(file = paste0(dbdir, 'puf_2017_targets.csv'))
ht(puf_targ)
names(puf_targ)

puf_targ %>%
  select(AGI_STUB, STATE, N1_nnz, MARS1_nnz, MARS2_nnz) %>%
  mutate(mars12=MARS1_nnz + MARS2_nnz, mdiff=N1_nnz - mars12,
         mdpct=mdiff / N1_nnz * 100,
         amdpct=abs(mdpct)) %>%
  arrange(mdpct)

possible_target_vars <- c("N1_nnz", "MARS1_nnz", "MARS2_nnz", "A00100_sum", "pos_AGI_nnz", "A00200_sum", "N00200_nnz", "A01000_sum",
                          "N01000_nnz", "A04470_sum", "N04470_nnz", "A17000_sum", "N17000_nnz", "A04800_sum", "N04800_nnz",
                          "A05800_sum", "N05800_nnz", "A09600_sum", "N09600_nnz",
                          "A00700_sum", "N00700_nnz")
feasible_target_vars <- intersect(possible_target_vars, names(puf_targ))
setdiff(possible_target_vars, names(puf_targ))


target_vars <- feasible_target_vars[1:10]
# check the targets
checkwide <- puf_targ %>%
  select(AGI_STUB, STATE, all_of(target_vars)) %>%
  group_by(AGI_STUB) %>%
  mutate(across(all_of(target_vars), ~ . / sum(.) * 100))
checkwide %>% filter(AGI_STUB==2)

check <- puf_targ %>%
  select(AGI_STUB, STATE, all_of(target_vars)) %>%
  group_by(AGI_STUB) %>%
  mutate(across(all_of(target_vars), ~ . / sum(.) * 100)) %>%
  pivot_longer(-c(AGI_STUB, STATE), names_to="target", values_to="state_pct")
ht(check)
summary(check)

check %>%
  group_by(AGI_STUB, STATE) %>%
  mutate(pop_pct=state_pct[target=="N1_nnz"],
         pdiff=state_pct / pop_pct * 100 - 100,
         apdiff=abs(pdiff)) %>%
  ungroup %>%
  arrange(desc(apdiff))



# make mypuf ----
puf_targ_djb <- puf_targ %>%
  select(AGI_STUB, STATE, all_of(feasible_target_vars)) %>%
  select(-c(A04470_sum, N04470_nnz,
            A17000_sum, N17000_nnz,
            A04800_sum, N04800_nnz))
summary(puf_targ_djb) # no NA's
count(puf_targ_djb, STATE)

# compute new state targets, leaving out the current OA
# we need national sums to redistribute across states
natsums <- puf_targ_djb %>%
  select(-STATE) %>%
  group_by(AGI_STUB) %>%
  summarise(across(everything(), sum), .groups="drop")

puf_targ_revised <- puf_targ_djb %>%
  filter(STATE != "OA") %>%
  pivot_longer(-c(AGI_STUB, STATE)) %>%
  left_join(natsums %>% pivot_longer(-AGI_STUB, values_to="value_US"), by = c("AGI_STUB", "name")) %>%
  group_by(AGI_STUB, name) %>%
  mutate(target_revised=value / sum(value) * value_US) %>%
  select(AGI_STUB, STATE, name, value=target_revised) %>%
  pivot_wider() %>%
  ungroup
summary(puf_targ_revised)
count(puf_targ_revised, STATE)

# now revise the puf, and then check wtd sums against target values
feasible_target_vars
ivars <- c(1, 4, 5, 6)
djb_target_vars <- feasible_target_vars[ivars]

getbase <- function(suffix) {
  var_backend <- str_extract(target_vars, "_.*$") # gets the ending part of each variable name
  base_vars <- target_vars[which(var_backend==suffix)] %>% str_remove(suffix)
  names(base_vars) <- base_vars
  base_vars
}

nnz_vars <-getbase("_nnz") # variables for which we want the weighted number of nonzero values
sum_vars <- getbase("_sum") # variables for which we want the weighted sum
sumneg_vars <- getbase("_sumneg") # variables for which we want the weighted sum

pufdjb <- puf_2017 %>%
  mutate_at(nnz_vars,
            list(nnz = ~ 1 * (. != 0))) %>%
  mutate_at(sum_vars,
            list(sum = ~ . * (. != 0))) %>%
  mutate_at(sumneg_vars,
            list(sumneg = ~ . * (. < 0)))
donboyd5/geoWeight documentation built on July 5, 2020, 8:55 p.m.