data-raw/Census/MakeStateLevelSummaryDataFromVariousCensusRetirementFiles.r

# MakeStateLevelSummaryDataFromVariousCensusRetirementFiles.r
# Don Boyd
# 07/15/2017

# This program creates state-level summary data on retirement systems using Census data, for as many years as possible.
# While I had wanted to build state summaries from details for individual systems, several years have imputations for
# individual systems that at present are not available publicly and thus I use summary data already compiled by the Census Bureau.

# The program gets the data from several different Census Bureau sources:

# 1) 1957 through 1992:
#    1957-2007 selected years are in the 59mb historical database found in the Census Bureau's "special60" directory at:
#      http://www2.census.gov/pub/outgoing/govs/special60/1_Emp_Retire_Historical_DB.zip
#    According to the Census Bureau documentation:

#         The primary purpose of this data base is to provide public employee-retirement
#         system data for years FY 1992 and earlier, the period predating the statistics
#         found on the Census Bureau web site. For comparison purposes, data from the census
#         of governments for 1997, 2002, 2007 are also included. Data for intercensal years

#    The file we need is the 7mb workbook:
#      1_Emp_Retire_Sys_Nat_State_Data.xls

#   I store the full archive at E:\Data\CensusRetirementData\RetireDatabase and include only this spreadsheet here.

# 2) 1993 through 2003:
#    State summary excel files available for download from the Census web site (http://www.census.gov/govs/retire/) as a set
#    of state-level tables (Tables 2-5, for revenues, expenditures, cash, membership), plus a national summary (Table 1).
#    The file names for 1993-2002 follow the format ret99t1.xls through ret99t5.xls, ..., ret02t1.xls through ret02t5.xls
#    For 2003 the files are 2003Ret01.xls through 2003Ret05.xls

#    There are some files available for later years in this same format, but we will use a different format for them (see #3 below).

# 3) 2004 through 2011
#    The data are already summarized at the state level in the files:
#    2004retest_data.xls through 2011retest_data.xls
#    These files are in the directory:
#    E:\Data\CensusRetirementData\SummaryTables\SummaryExcelFiles

# 4) 2012+
#    Currently it looks like I need to use the data available through American Fact Finder, available through the same
#    https://factfinder.census.gov
#    OLD:  Census download site http://www.census.gov/govs/retire/



#************************************************************************************************************************
#                Load packages ####
#************************************************************************************************************************

library("plyr")
library("dplyr")
options(dplyr.print_min = 60) # default is 10
options(dplyr.print_max = 60) # default is 20
library("ggplot2")
library("magrittr")
library("readr")
library("readxl")
library("stringr")
library("tidyr")

library("bdata")
library("btools")


#************************************************************************************************************************
#                Directories, filenames, constants ####
#************************************************************************************************************************

cendir <- paste0("./data-raw/Census/")
censs <- paste0(cendir, "StateSummaries/")
cenxl1 <- paste0(censs, "Excel1993to2003/")
cenxl2 <- paste0(censs, "Excel2004to2011/")
affdir <- paste0(censs, "AmericanFactFinder2012plus/")


#************************************************************************************************************************
#                Downloads, ONLY when needed ####
#************************************************************************************************************************
# get the Census historical database
histdb <- "1_Emp_Retire_Historical_DB.zip"
urldir <- "http://www2.census.gov/pub/outgoing/govs/special60/"
url <- paste0(urldir, histdb)
download.file(url, paste0(cendir, histdb), mode="wb")


#************************************************************************************************************************
#                1957-1992: Get and clean state summary data in the historical database ####
#************************************************************************************************************************
# admin type codes
acodesv <- "admin, adminf
0, National total
1, State-local total
2, State
3, Local
4, Fed govt
5, County
6, Municipal
7, Township
8, Special district
9, School district
x, Other"
acodes <- read_csv(acodesv) %>% mutate_each(funs(str_trim))
names(acodes) <- str_trim(names(acodes)) # just in case spaces crept into the names

# get finance data
ssdbfin <- read_excel(paste0(cendir, ssdbfn), sheet="Finances", col_names=FALSE) # superfast;
glimpse(ssdbfin)
sasnames <- ssdbfin[12, ] %>% unlist %>% unname
sasnames
sasnames[sasnames=="SAS VAR NAME>"] <- "stadmin"
names(ssdbfin) <- tolower(sasnames)
ht(ssdbfin)
count(ssdbfin, type)
count(ssdbfin, str_sub(id, 4))
# filter(ssdbfin, grepl("52.0", state)) # 52 is national govt
# filter(ssdbfin, grepl("32.0", state)) # NM


ssdbfin2 <- ssdbfin %>%
  mutate(year=as.numeric(year),
         id=str_trim(id),
         admin=round(as.numeric(str_trim(type)))) %>% # type=x will become NA but we only want admin types 1:3
  filter(admin %in% 1:3) %>%
  mutate(adminf=acodes$adminf[match(admin, acodes$admin)],
         stabbr=as.character(stcodes$stabbr[match(str_sub(id, 1, 2), stcodes$stcen)]),
         stabbr=ifelse(str_sub(id, 1, 2)=="52", "NATL", stabbr)) %>%
  filter(!is.na(year))

count(ssdbfin2, admin, adminf)
count(ssdbfin2, pasis)
count(ssdbfin2, stabbr) # good - just 50 states, DC, and US, with no NAs
glimpse(ssdbfin2)

# now create a long file
ssdbfl <- ssdbfin2 %>% select(-c(sortcode, id, state, fips, pasis, stadmin, type)) %>%
  gather(variable, value, -c(year, stabbr, admin, adminf)) %>%
  mutate(value=ifelse(grepl("-11111", value), NA, value),
         value=cton(value))
ht(ssdbfl)
count(ssdbfl, year)
count(ssdbfl, admin, adminf)
count(ssdbfl, stabbr)
count(ssdbfl, variable)

saveRDS(ssdbfl, paste0(cendir, "statesummaryhist_fin.rds"))


# get membership data
ssdbmem <- read_excel(paste0(cendir, ssdbfn), sheet="Membership", col_names=FALSE) # superfast;
glimpse(ssdbmem)
ssdbmem[10:14, 1:10]
sasnames <- ssdbmem[12, ] %>% unlist %>% unname
sasnames
sasnames[sasnames=="SAS VAR NAME>"] <- "stadmin"
names(ssdbmem) <- tolower(sasnames)
ht(ssdbmem)
count(ssdbmem, type)
count(ssdbmem, str_sub(id, 4))
# filter(ssdbmem, grepl("52.0", state)) # 52 is national govt (not same as US)
# filter(ssdbmem, grepl("32.0", state)) # NM

ssdbmem2 <- ssdbmem %>%
  mutate(year=as.numeric(year),
         id=str_trim(id),
         admin=round(as.numeric(str_trim(type)))) %>% # type=x will become NA but we only want admin types 1:3
  filter(admin %in% 1:3) %>%
  mutate(adminf=acodes$adminf[match(admin, acodes$admin)],
         stabbr=as.character(stcodes$stabbr[match(str_sub(id, 1, 2), stcodes$stcen)]),
         stabbr=ifelse(str_sub(id, 1, 2)=="52", "NATL", stabbr)) %>%
  filter(!is.na(year))


count(ssdbmem2, admin, adminf)

# now create a long file
ssdbml <- ssdbmem2 %>% select(-c(sortcode, id, state, fips, stadmin, type)) %>%
  gather(variable, value, -c(year, stabbr, admin, adminf)) %>%
  mutate(value=ifelse(grepl("-11111", value), NA, value),
         value=cton(value))
ht(ssdbml)
count(ssdbml, year)
count(ssdbml, admin, adminf)
count(ssdbml, stabbr)
count(ssdbml, variable)

saveRDS(ssdbml, paste0(cendir, "statesummaryhist_mem.rds"))


# combine finance and membership files
df1 <- readRDS(paste0(cendir, "statesummaryhist_fin.rds")) %>% mutate(source="finance")
df2 <- readRDS(paste0(cendir, "statesummaryhist_mem.rds")) %>% mutate(source="membership")
df3 <- bind_rows(df1, df2)
count(df3, variable)
count(df3, source)
saveRDS(df3, paste0(cendir, "statesummaryhist.rds"))



#************************************************************************************************************************
#                1993-2003: Get and clean state summary data ####
#************************************************************************************************************************
# gettab handles tables 2, 3, and 4, which have similar formats (but tab 4 has different variables over time)
gettab <- function(year, tabnum, valnames) {
  # create a uniform df constructed from the nonuniform input files
  tabnumc <- str_pad(tabnum, 2, side="left", pad="0")
  if(year==2003) fn <- paste0(year, "Ret", tabnumc, ".xls") else fn <- paste0("ret", str_sub(year, 3, 4), "t", tabnum, ".xls")

  # get the basic data frame in shape so that converting to long, etc., will be the same for all tables
  if(tabnum %in% c(2, 3)) { # same format
    vnames <- c("govt.admin", valnames)
    df <- read_excel(paste0(cenxl1, fn), col_names=vnames)
  } else if(tabnum==4) { # Table 4 investment categories have varied over the years JUST GET TOTAL FOR NOW
    df <- read_excel(paste0(cenxl1, fn), col_names = FALSE)
    names(df)[1:2] <- c("govt.admin", valnames)
    df %<>% select_("govt.admin", valnames)
  }
  # get state name and administration, convert to long format
  dfl <- df %>% mutate(govt.admin=str_trim(govt.admin),
                       admin=ifelse(govt.admin %in% c("State", "Local"), govt.admin, "Total"),
                       slag1=lag(govt.admin, 1),
                       slag2=lag(govt.admin, 2),
                       state=ifelse(govt.admin=="State", slag1, ifelse(govt.admin=="Local", slag2, govt.admin)),
                       stabbr=stcodes$stabbr[match(state, stcodes$stname)]) %>%
    filter(stabbr %in% stcodes$stabbr) %>%
    select(-govt.admin, -state, -slag1, -slag2) %>%
    gather(variable, value, -stabbr, -admin) %>%
    mutate(year=year, value=cton(value)) %>%
    select(stabbr, admin, year, variable, value)
  return(dfl)
}

tab2names <- c("totrev", "eec", "erc", "erc.sg", "erc.lg", "invinc")
dft2 <- ldply(1993:2003, gettab, 2, tab2names)

tab3names <- c("totpay", "benefits", "withdraw", "otherpay")
dft3 <- ldply(1993:2003, gettab, 3, tab3names)

tab4names <- c("assets")
dft4 <- ldply(1993:2003, gettab, 4, tab4names)

# qplot(year, value, data=filter(dft4, variable=="assets", admin=="Total", stabbr=="US"), geom=c("point", "line"))

# Tab 5 we must do separately because (a) for the nation it has mutiple admin types and (b) for states it only has state-local
gettab5 <- function(year) {
  if(year==2003) fn <- "2003Ret05.xls" else fn <- paste0("ret", str_sub(year, 3, 4), "t5.xls")
  df <- read_excel(paste0(cenxl1, fn), col_names = c("govt.admin", "nsystems", "totmem", "actives", "inactives", "beneficiaries"))
  usrow <- which(df$govt.admin=="United States")
  df2 <- df %>% filter(row_number() >= usrow) %>%
    mutate(govt.admin=str_trim(govt.admin),
           state=as.character(stcodes$stname[match(govt.admin, stcodes$stname)]),
           admin=ifelse(state %in% stcodes$stname, "Total", govt.admin),
           admin=ifelse(grepl("State", admin), "State", admin),
           admin=ifelse(grepl("Local", admin), "Local", admin),
           state=ifelse(is.na(state), "United States", state),
           stabbr=stcodes$stabbr[match(state, stcodes$stname)])
  dfl <- df2 %>% select(-govt.admin, -state) %>%
    gather(variable, value, -stabbr, -admin) %>%
    mutate(year=year, value=cton(value))
  return(dfl)
}

dft5<- ldply(1993:2003, gettab5)
# clean up admin var
count(dft5, admin)
dft5 %<>% mutate(admin=ifelse(grepl("Municipal", admin), "Municipality", admin),
                admin=ifelse(grepl("Count", admin), "County", admin),
                admin=ifelse(grepl("School", admin), "School district", admin),
                admin=ifelse(grepl("Special", admin), "Special district", admin),
                admin=ifelse(grepl("Town", admin), "Township", admin))
count(dft5, admin)

dfall <- bind_rows(dft2, dft3, dft4, dft5) %>% arrange(stabbr, admin, variable, year)
ht(dfall)
glimpse(dfall)
count(dfall, stabbr) # includes US, DC
count(dfall, admin)
count(dfall, variable)
count(dfall, year)
saveRDS(dfall, paste0(cendir, "statesummary1993to2003.rds"))


#************************************************************************************************************************
#                2004-2011:  Get and clean state summary data ####
#************************************************************************************************************************
# type codes
# I think these are the same as for the historical DB, but for now create them separately here just in case we need to vary them
# These appear to differ from the historical DB
# 1=US total, 2=State total, 3=Local total, 4=County total, 5=Municipality total, 6=Township total, 7=Special District total, 8=School District total
tcodesv <- "type, typef
1, State-local
2, State
3, Local
4, County
5, Municipal
6, Township
7, Special district
8, School district
x, Other"
tcodes <- read_csv(tcodesv) %>% mutate_each(funs(str_trim))
names(tcodes) <- str_trim(names(tcodes)) # just in case spaces crept into the names

getss <- function(year){
  fn <- paste0(year, "retest_data.xls")
  df <- read_excel(paste0(cenxl2, fn), col_names=FALSE)
  vnrow <- grep("Level", t(df[, 3])) # row in which var names are
  vnames <- df[vnrow, ] %>% unlist %>% unname
  vnames[vnames=="Amount"] <- "value"
  vnames <- gsub("[ _]", "", vnames) # sometimes there is space, sometimes _, so get rid of both
  names(df) <- vnames
  # sometimes there is CV, sometimes not (2007), so don't use it

  df2 <- df %>% mutate(year=year,
                       ItemID=as.numeric(ItemID),
                       type=as.numeric(Level),
                       typef=tcodes$typef[match(type, tcodes$type)],
                       StateID=as.character(round(as.numeric(StateID))),
                       StateID=str_pad(StateID, 2, pad="0"),
                       stabbr=stcodes$stabbr[match(StateID, stcodes$stcen)],
                       value=cton(value)
                       ) %>%
    filter(!is.na(ItemID)) %>%
    select(year, stabbr, ItemID, ItemName, type, typef, value)
  return(df2)
}
# note that we don't have ItemID 32 (# systems) in 2004

ssdf <- ldply(2004:2011, getss) %>% mutate(ItemName=str_to_title(ItemName)) # create more uniform names
count(ssdf, year)
count(ssdf, ItemID, ItemName)


# make further changes to conform itemnames
ssdf2 <- ssdf
ssdf2$ItemName[ssdf2$ItemName=="From State Government"] <- "State Government Contributions"
ssdf2$ItemName[ssdf2$ItemName=="From Local Government"] <- "Local Government Contributions"
ssdf2$ItemName[ssdf2$ItemName=="Earnings On Investment"] <- "Earnings On Investments"
ssdf2$ItemName[ssdf2$ItemName=="Withdrawal"] <- "Withdrawals"
ssdf2$ItemName[ssdf2$ItemName=="Total Cash And Deposits"] <- "Total Cash And Short-Term Investments"
ssdf2$ItemName[ssdf2$ItemName=="Time, Savings Deposits, And Nonfederal Short-Term Investment"] <- "Time, Savings Deposits And Nonfederal Short-Term Investments"
ssdf2$ItemName[ssdf2$ItemName=="Other Securities"] <- "Other Nongovernmental Securities"

saveRDS(ssdf2, paste0(cendir, "statesummary2004to2011.rds"))

#************************************************************************************************************************
#                2012+ NEW ASPPP: Get and clean state ASPP summary data ####
#************************************************************************************************************************
# create statesummary2012plus_ASPP.rds
# These new data appeared on the Census site ~ June 2016 and may be an alternative to the
# American Fact Finder version of the 2012+ data
# Look for it through https://factfinder.census.gov
# https://www.census.gov/data/tables/2017/econ/aspp/aspp-historical-tables.html
# ASPP_data_2012plus
# format: year int, stabbr chr, admin int, adminf chr, variable, value
# variable: actives, assets, beneficiaries, benefits, eec, erc, erc.lg, erc.sg,
#   inactives, invinc, members, nsystems, payroll, totexpend
# admin: 1, 2, 3; adminf: State-local, State, Local
asppdir <- paste0(censs, "ASPP_data_2012plus/")

#..UPDATE this crosswalk with each new data year!!!! ----
asppxw <- read_excel(paste0(asppdir, "aspp_varxwalk.xlsx")) %>%
  gather(year, vartext, -vname) %>%
  mutate(year=as.integer(str_sub(year, 5, -1))) %>%
  filter(!is.na(vname))

# we have to do a LOT of year-specific handling
getASPP_2012 <- function(yr) {
  fn <- paste0(yr, "ASPPSummary.xlsx")
  tmp <- read_excel(paste0(asppdir, fn))
  # In 2012 there are 3 columns for each state: state-local, state, local
  stnames <- names(tmp)[-1]
  stvals <- seq(1, 154, 3)
  stnames[stvals]
  stnames[stvals+1] <- stnames[stvals]
  stnames[stvals+2] <- stnames[stvals]
  stabs <- stcodes$stabbr[match(stnames, stcodes$stname)] %>% as.character
  tmp2 <- tmp %>% setNames(c("vartext", paste0(stabs, "_", c(1:3)))) %>%
    gather(stabbadmin, value, -vartext) %>%
    separate(stabbadmin, c("stabbr", "admin")) %>%
    mutate(year=yr,
           admin=as.integer(admin),
           value=as.numeric(value)) %>%
    filter(!is.na(value)) %>%
    right_join(asppxw) %>%
    mutate(value=ifelse(value==0 & vname %in% c("payroll", "penob"), NA, value)) %>%
    filter(!is.na(value))
  return(tmp2)
}
# getASPP_2012(2012)
# yr <- 2017

getASPP_2013_2016 <- function(yr) {
  fn <- paste0(yr, "ASPPSummary.xlsx")
  tmp <- read_excel(paste0(asppdir, fn))
  # in 2013+ there are 6 columns, State & Local, CV for State & Local, State, CV for State, Local,	CV Local
  # in 2013, US name appears with 7 columns, so we have to shift all names up!
  # in 2014, United States is missing in col 2
  # 2015 looks good
  # fix names before manipulation
  if(yr==2013) {
    vnames <- names(tmp)
    vnames[8:(length(vnames)-1)] <- vnames[9:length(vnames)]
    names(tmp) <- vnames
  } else if(yr==2014) {
    names(tmp)[2] <- "United States"
  }
  stnames <- names(tmp)[-1]
  stvals <- seq(1, 307, 6)
  # stnames[stvals]
  for(i in 1:5) stnames[stvals+i] <- stnames[stvals]
  stabs <- stcodes$stabbr[match(stnames, stcodes$stname)] %>% as.character
  tmp2 <- tmp %>% setNames(c("vartext", paste0(stabs, "_", c(1, 19, 2, 29, 3, 39)))) %>%
    gather(stabbadmin, value, -vartext) %>%
    separate(stabbadmin, c("stabbr", "admin")) %>%
    mutate(year=yr,
           admin=as.integer(admin),
           value=as.numeric(value)) %>%
    filter(admin %in% 1:3, !is.na(value)) %>%
    right_join(asppxw) %>%
    mutate(value=ifelse(value==0 & vname %in% c("payroll", "penob"), NA, value)) %>%
    filter(!is.na(value))
  return(tmp2)
}

getASPP_2017 <- function(yr) {
  fn <- paste0(yr, "asppsummary.xlsx")
  tmp <- read_excel(paste0(asppdir, fn))
  # In 2017 there are 3 columns for each state: state-local, state, local
  stnames <- names(tmp)[-1]
  stvals <- seq(1, 154, 3)
  stnames[stvals]
  stnames[stvals+1] <- stnames[stvals]
  stnames[stvals+2] <- stnames[stvals]
  stabs <- stcodes$stabbr[match(stnames, stcodes$stname)] %>% as.character
  tmp2 <- tmp %>% setNames(c("vartext", paste0(stabs, "_", c(1:3)))) %>%
    gather(stabbadmin, value, -vartext) %>%
    separate(stabbadmin, c("stabbr", "admin")) %>%
    mutate(year=yr,
           admin=as.integer(admin),
           value=as.numeric(value)) %>%
    filter(!is.na(value)) %>%
    right_join(asppxw) %>%
    mutate(value=ifelse(value==0 & vname %in% c("payroll", "penob"), NA, value)) %>%
    filter(!is.na(value))
  return(tmp2)
}


getASPP <- function(yr) {
  if(yr==2012) df <- getASPP_2012(yr) else
    if(yr %in% 2013:2016) df <- getASPP_2013_2016(yr) else
      if(yr==2017) df <- getASPP_2017(yr)
  return(df)
}

getASPP(2015)
getASPP(2016)
getASPP(2017)

df <- ldply(2012:2017, getASPP) %>% select(stabbr, year, admin, vname, value)
count(df, year)
count(df, stabbr)
count(df, admin)
count(df, vname)
glimpse(df)


saveRDS(df, paste0(cendir, "statesummary2012plus_ASPP.rds"))




#************************************************************************************************************************
#                OLD 2012+: Get and clean state AFF summary data ####
#************************************************************************************************************************
# first get the data, then the metadata
# get the data
# first row is variable description
# variable name is 2 parts: 1) level, and 2) column id
# in 2012, lvl is 0, 1, or 2 for state-local, state, or local
# in 2013+, lvl is 0a or 0b for state-local amount and CV, 1 for state, and 2a or 2b for local amount and CV
# column id APPEARS to mean the same thing in each year but I'm going to grab the description from the first row, to be safe
# the first-row description has two parts - before semi-colon is the level, and after is the description
# the level description in 2013 identifies amount or CV, whereas in 2012 it does not

fn <- aff.files[1]

getaff <- function(fn) {
  # extract year from file name
  split_get <- function(alist, splitchar, itemget) {
    items <- str_split(alist, splitchar) %>% sapply(., "[", itemget)
    return(items)
  }
  year <- as.numeric(split_get(fn, "_", 2))
  df <- read_csv(paste0(affdir, fn))
  names(df)[1:3] <- c("geoid", "geoid2", "geoname")

  cenvnames <- data_frame(variable=names(df), text=str_trim(df[1, ])) %>%
    filter(!variable %in% names(df)[1:3]) %>%
    # separate(variable, c("lvl", "col"), sep="_", remove=FALSE, extra="drop") %>%
    separate(text, c("type", "description"), sep=";", extra="drop") %>%
    mutate(year=year, description=str_trim(description))

  df2 <- df %>% gather(variable, value, -c(geoid, geoid2, geoname)) %>%
    mutate(stabbr=stcodes$stabbr[match(geoname, stcodes$stname)]) %>%
    filter(stabbr %in% stcodes$stabbr) %>%
    mutate(description=cenvnames$description[match(variable, cenvnames$variable)]) %>%
    separate(variable, c("lvl", "col"), remove=FALSE) %>%
    mutate(admin=as.character(factor(str_sub(lvl, 4, 4), levels=c("0", "1", "2"), labels=c("Total", "State", "Local"))),
           year=year) %>%
    select(-geoid, -geoid2, -geoname) %>%
    select(stabbr, admin, variable, lvl, col, description, year, value) %>%
    arrange(stabbr, admin, lvl, col, year)

  # distinguish amount columns from CV columns and drop the CVs
  cvcols <- c("lvl0b", "lvl2a")
  df3 <- df2 %>% mutate(type=ifelse(lvl %in% cvcols, "cv", "amount")) %>%
    filter(type=="amount") %>%
    select(-type, -lvl) %>%
    mutate(value=cton(value))
  return(df3)
}

aff.files <- c("COG_2012_COGPP00SL_with_ann.csv", "PP_2013_PP00SL_with_ann.csv", "PP_2014_PP00SL_with_ann.csv")

df <- ldply(aff.files, getaff)
count(df, stabbr)
count(df, admin)
count(df, year)
count(df, variable)
tmp <- count(df, variable, description)
tmp <- df %>% count(year, col, description) %>% spread(year, n)

saveRDS(select(df, -variable), paste0(cendir, "statesummary2012plus.rds"))


#************************************************************************************************************************
#                1957-2013+ - Create unified summary data ####
#************************************************************************************************************************
# create a uniform file with variables stabbr, admin, adminf, year, variable, for 50 states, DC, US
df1 <- readRDS(paste0(cendir, "statesummaryhist.rds"))
df2 <- readRDS(paste0(cendir, "statesummary1993to2003.rds"))
df3 <- readRDS(paste0(cendir, "statesummary2004to2011.rds"))
# df4 <- readRDS(paste0(cendir, "statesummary2012plus.rds"))
df5 <- readRDS(paste0(cendir, "statesummary2012plus_ASPP.rds"))

glimpse(df1)
glimpse(df2)
glimpse(df3)
# glimpse(df4)
glimpse(df5)
count(df1, variable)
count(df2, variable)
count(df3, ItemID, ItemName)
# old tmp <- count(df4, col, description)
count(df5, year)


# variable, variable, ItemID, col -are the "native" variables for df1-df4 respectively
# not all files have total contributions (erc + eec), so calculate that
# don't bother with total revenue (contrib + invinc) - can calc if needed

xwalk <- read_csv(
  "uvname, file, fvalue
  nsystems, df1, systems
  nsystems, df2, nsystems
  nsystems, df3, 32
  nsystems, df4, C036

  members, df1, totmemb
  members, df2, totmem
  members, df3, 28
  members, df4, C038

  actives, df1, active
  actives, df2, actives
  actives, df3, 29
  actives, df4, C039

  inactives, df1, inactive
  inactives, df2, inactives
  inactives, df3, 30
  inactives, df4, C040

  beneficiaries, df1, totbenef
  beneficiaries, df2, beneficiaries
  beneficiaries, df3, 31
  beneficiaries, df4, C042

  payroll, df4, C044

  eec, df1, totempcontrb
  eec, df2, eec
  eec, df3, 2
  eec, df4, C003

  erc, df1, totgovcontrb
  erc, df2, erc
  erc, df3, 3
  erc, df4, C004

  erc.sg, df1, contrbstate
  erc.sg, df2, erc.sg
  erc.sg, df3, 4
  erc.sg, df4, C005

  erc.lg, df1, contrblocal
  erc.lg, df2, erc.lg
  erc.lg, df3, 5
  erc.lg, df4, C006

  invinc, df1, totalearns
  invinc, df2, invinc
  invinc, df3, 6
  invinc, df4, C008

  totexpend, df1, toterexp
  totexpend, df2, totpay
  totexpend, df3, 7
  totexpend, df4, C010

  benefits, df1, benefits
  benefits, df2, benefits
  benefits, df3, 8
  benefits, df4, C011

  assets, df1, totercashsec
  assets, df2, assets
  assets, df3, 11
  assets, df4, C015")
xwalk %<>% filter(!is.na(file))
xwalk

df1xw <- xwalk %>% filter(file=="df1") %>% select(uvname, fvalue)
df2xw <- xwalk %>% filter(file=="df2") %>% select(uvname, fvalue)
df3xw <- xwalk %>% filter(file=="df3") %>% select(uvname, fvalue)
# df4xw <- select(filter(xwalk, file=="df4"), uvname, fvalue)

# focus on getting adminf right for each file, and creating admin from it

glimpse(df1)
count(df1, variable)
count(df1, year)
count(df1, admin)
df1a <- df1 %>% filter(year<1993, admin %in% 1:3) %>%
  mutate(uvname=df1xw$uvname[match(variable, df1xw$fvalue)]) %>% # get the uniform variable name
  filter(!is.na(uvname)) %>%
  select(year, stabbr, admin, uvname, value)

#  mutate(adminf=as.character(factor(admin, levels=1:3, labels=c("State-local", "State", "Local")))) %>%

glimpse(df2)
count(df2, variable)
count(df2, year)
count(df2, admin)
df2a <- df2 %>% rename(adminf=admin) %>%
  filter(adminf %in% c("Total", "State", "Local")) %>%
  mutate(uvname=df2xw$uvname[match(variable, df2xw$fvalue)]) %>% # get the uniform variable name
  filter(!is.na(uvname)) %>%
  mutate(admin=as.integer(factor(adminf, levels=c("Total", "State", "Local"), labels=1:3))) %>%
  select(year, stabbr, admin, uvname, value)


glimpse(df3)
count(df3, year)
count(df3, type)
count(df3, ItemID, ItemName)
df3a <- df3 %>% rename(admin=type) %>%
  filter(admin %in% 1:3) %>%
  mutate(uvname=df3xw$uvname[match(ItemID, df3xw$fvalue)]) %>% # get the uniform variable name
  filter(!is.na(uvname)) %>%
  select(year, stabbr, admin, uvname, value)


# glimpse(df4)
# count(df4, admin)
# tmp <- count(df4, col, description)
# df4a <- df4 %>%
#   mutate(uvname=df4xw$uvname[match(col, df4xw$fvalue)]) %>% # get the uniform variable name
#   filter(!is.na(uvname)) %>%
#   mutate(adminf=ifelse(admin=="Total", "State-local", admin)) %>%
#   select(year, stabbr, adminf, uvname, value)


# Now put the 4 files together year, admin, stabbr, uvname, value
glimpse(df1a)
glimpse(df2a)
glimpse(df3a)
glimpse(df4a)
glimpse(df5)
count(df1a, year); count(df2a, year); count(df3a, year); count(df5, year)

dfall <- bind_rows(df1a, df2a, df3a, df5 %>% rename(uvname=vname)) %>%
  rename(variable=uvname) %>%
  mutate(year=as.integer(year),
         adminf=as.character(factor(admin, levels=1:3, labels=c("State-local", "State", "Local")))) %>%
  select(year, stabbr, admin, adminf, variable, value) %>%
  arrange(stabbr, admin, adminf, variable, year)
glimpse(dfall)
count(dfall, variable)
count(dfall, stabbr)
count(dfall, admin, adminf)
count(dfall, year)

qplot(year, value, data=filter(dfall, variable=="nsystems", stabbr=="US", admin==1), geom=c("point", "line"))
qplot(year, value, data=filter(dfall, variable=="erc", stabbr=="US", admin==1), geom=c("point", "line"))

cenretss <- dfall
devtools::use_data(cenretss, overwrite=TRUE)


# dfall %>% filter(stabbr=="US", variable=="assets", year>=2010)
donboyd5/pdata documentation built on Oct. 19, 2021, 9:31 p.m.