u35dir <- paste0(cendir, "Units-35char/")
u35web <- paste0(u35dir, "web-interface/") # files obtained from the Census web interface
u35nonweb <- paste0(u35dir, "nonweb/") # files obtained from the Census Bureau but NOT from the web interface
ssdbfn <- "1_Emp_Retire_Sys_Nat_State_Data.xls" # state summary data base file name
# TODO: Rewrite this info below
# 2b) 1993(?)-2001
# 1993-2013 unit-level data may be obtained, year-by-year, through drop-down menu at:
# http://www.census.gov/govs/retire/historical_data.html
# The actual files are located at:
# http://www2.census.gov/govs/retire/
# There also is an ftp site that appears to have identical info, but I am using the www2 site.
# Because of imputations, discussed below, I believe we can only use these data for 1993-2001;
# 1993 still has issues to resolve
# I have downloaded ALL of the unit files and all of the associated "id" files (names of systems) that appear through
# the web interface, as this is what Census make available officially.
# The downloads section below shows how to download all of the files through R.
# CENSUS BUREAU IMPUTATIONS ####
# From 2002 through 2011 the unit files are "no imputes" files.
# There is a good discussion of this in the 2003 methodology document: http://www2.census.gov/govs/retire/2003surveymeth.pdf
# It says that "Between years 2002 through 2007, individual government imputed data were not released to the public."
# From 2003-2005 unit files have "noimputes" appended to the name, as in: 2003ret35noimputes.txt
# The file for 2006 available through the web interface is called: 2006ret35imputes.txt
# I need to determine if the 2006 file has imputations or not.
# Beginning in 2007 filenames are labeled as "reported data" as in: 2007indiv_unit_reported_data.txt
# Ideally I would like to use files WITH imputations.
# Files NOT available through the web interface:
# The website http://www2.census.gov/govs/retire/ has all of the files available through the web interface, PLUS
# some files that appear to include imputations. For example, 2003Ret35.txt is much larger than 2003ret35noimputes.txt,
# suggesting it includes imputations. (Note, also, that there is a file 2003ret35.txt - lower-case "r" - that is the)
# same size as 2003Ret35.txt but has an earlier date.
# TENTATIVE CONCLUSION: We can use unit files for 1993-2001, and again from 2012+ (but not sure if we will)
# we need to use state summaries:
# 3) 2002-2003
# Again use the state summary data in Tables 2-5 at the Census download site discussed in 2a http://www.census.gov/govs/retire/
# 4) 2004-2011 the data are already summarized at the state level in files such as:
# 2011retest_data.xls in the directory
# E:\Data\CensusRetirementData\SummaryTables\SummaryExcelFiles
# 3) 2012+ I must summarize from the unit data
# retsysunits.rds in the directory
# cenretd "E:\\Data\\CensusRetirementData\\"
# Unit data
# http://www2.census.gov/govs/retire/93ret35.txt
# get files from the web THIS CODE IS OLD AND IS NOT VERIFIED YET!!!!
getfn<-function(year){
if(year>=2000) y2<-year-2000 else y2<-year-1900
y2<-sprintf("%02d",y2)
if(year<=2002) fn<-paste(y2,"ret35.txt",sep="") else
if(year<=2005) fn<-paste(year,"ret35noimputes.txt",sep="") else
if(year==2006) fn<-paste(year,"ret35imputes.txt",sep="") else
fn<-paste(year,"indiv_unit_reported_data.txt",sep="")
return(fn)
}
for(year in 1993:2009){
fn<-getfn(year)
# print(fn)
fget<-paste(web,fn,sep="")
fput<-paste(cendir,fn,sep="")
download.file(fget,fput,mode="wb")
}
#************************************************************************************************************************
#
# Create variable mappings ####
#
#************************************************************************************************************************
# get the recipe file
fn <- "CensusRetirementDataAnalysis(4).xlsx"
df <- read_excel(paste0(cenretd, fn), sheet="Recipe", col_names=FALSE)
# find last data column
lastdcol <- grep("beneficiaries", df[6, ])
df <- df[, 1:lastdcol]
# store the vname, SAS var, itemname, and itemnumber mappings, then move on to create the recipes
mappings <- as.data.frame(t(df[3:6, ]))
names(mappings) <- t(mappings[1, ])
mappings <- mappings[-1, ]
saveRDS(mappings, paste0(cenretd, "vnamemappings.rds"))
# create recipes
# icdf <- readRDS(paste0(cenretd, "retitemcodes.rds"))
# retdf <- readRDS(paste0(cenretd, "retsysunits.rds"))
df2 <- df
names(df2) <- df2[6, ]
datastart <- grep("codes", t(df2[, 1]))
recipe <- df2 %>% filter(row_number()>=datastart) %>%
select(-vname) %>%
gather(variable, ic) %>%
mutate(ic=trim(ic)) %>%
filter(!is.na(ic))
saveRDS(recipe, paste0(cenretd, "recipe.rds"))
#************************************************************************************************************************
#
# Summarise variables using recipes ####
#
#************************************************************************************************************************
ht(retdf)
count(retdf, stabbr)
# first, put Us on the file
retus <- retdf %>% group_by(year, ic, icf, type, typef) %>% summarise(value=sum(value, na.rm=TRUE), stabbr="US", id="US Total", pubname="US Total")
ht(retus)
ret2 <- bind_rows(select(retdf, year, stabbr, ic, icf, type, typef, pubname, value),
retus)
ret3 <- recipe %>% left_join(select(ret2, year, id, stabbr, type, typef, pubname, ic, value))
# system-level summaries of variables
ret4 <- ret3 %>% group_by(year, id, stabbr, type, typef, pubname, variable) %>%
summarise(value=sum(value, na.rm=TRUE))
saveRDS(ret3, paste0(cenretd, "sysyearvar.rds"))
#************************************************************************************************************************
#
# Create a time series of state-level summarized data ####
#
#************************************************************************************************************************
# sshistdb <- readRDS(paste0(cenretd, "statesummaryhist.rds")) # 1957:2007 selected years, state-level summary data, from historical database
# ssxl <- readRDS(paste0(cenretd, "statesummaryxl.rds")) # 2004-2011 state-level summary data from Excel state summary files
# retdf <- readRDS(paste0(cenretd, "retsysunits.rds")) # 2007-2013 system-level item code data, from most recent item-code files
# sysyearvar <- readRDS(paste0(cenretd, "sysyearvar.rds")) # 2007-2013 state-level summary data constructed from retsysunits.rds
# vmaps <- readRDS(paste0(cenretd, "vnamemappings.rds")) # mappings needed to create a time series from these files
# spop <- readRDS(file=paste0(rdat, "popst.rds")) # state population
sshistdb # year, type, typef, stabbr, variable (lower case sasvar), value
# count(sshistdb, variable)
ssxl # year, type, typef, stabbr, ItemID, ItemName
sysyearvar # year, id, type, typef, pubname, variable, value
vmaps # itemnumber, itemname, sasvarname, vname
# create conformed versions of the 3 files
# count(sshistdb, year, variable)
# count(sshistdb, variable)
df1 <- left_join(filter(mutate(sshistdb, type=as.numeric(type)), type %in% 1:3),
mutate(select(vmaps, variable=sasvarname, vname), variable=tolower(variable))) %>%
select(-variable) %>%
filter(!is.na(vname), !is.na(value)) %>%
mutate(source="sshistdb")
ht(df1)
count(df1, year)
count(df1, year, vname)
count(df1, type, typef)
df2 <- left_join(select(filter(ssxl, type %in% 1:3), year, stabbr, ItemID, type, typef, value),
mutate(select(vmaps, ItemID=itemnumber, vname), ItemID=as.numeric(ItemID))) %>%
select(-ItemID) %>%
filter(!is.na(vname)) %>%
mutate(source="ssxl")
count(df2, type, typef)
df3a <- sysyearvar %>% ungroup %>%
mutate(type=ifelse(type==0, 2, 3)) %>%
group_by(stabbr, year, type, vname=variable) %>%
summarise(value=sum(value, na.rm=TRUE))
# add state-local
df3sl <- df3a %>% group_by(stabbr, year, vname) %>% summarise(value=sum(value, na.rm=TRUE)) %>% mutate(type=1)
df3 <- bind_rows(df3a, df3sl) %>%
filter(!is.na(stabbr)) %>%
mutate(typef=factor(type, levels=1:3, labels=c("State-local total", "State", "Local")),
source="sysyearvar")
count(df3, type, typef)
stack <- bind_rows(df1, df2, df3)
str(df1)
str(df2)
str(df3)
ht(stack)
count(stack, year, source) %>% spread(source, n)
stack2 <- stack %>% filter((source=="sshistdb" & year<=2002) |
(source=="ssxl" & year %in% 2004:2006) |
(source=="sysyearvar" & year>=2007))
count(stack2, year, source) %>% spread(source, n)
ht(stack2)
count(stack2, year, type) %>% spread(type, n)
count(stack2, year, var=(stabbr=="US")) %>% spread(var, n) # no US for 2007+
count(filter(stack2, year>=2007), year, stabbr) %>% spread(year, n) # no US for 2007+
count(stack2, year, vname) %>% spread(vname, n)
glimpse(stack2)
saveRDS(stack2, paste0(cenretd, "state_type_year_var.rds"))
# now clean up the data and save
styv <- readRDS(paste0(cenretd, "state_type_year_var.rds"))
stackw <- styv %>% spread(vname, value)
count(stackw, year, is.na(erc))
count(stackw, year, is.na(contrib))
filter(stackw, is.na(eec))
# calc contrib from eec, erc; explore missing erc in 1982;
stackw2 <- stackw %>% mutate(contrib=ifelse(is.na(contrib), eec+erc, contrib),
xcf=contrib-expend,
contrib.pct=contrib/assets*100,
expend.pct=expend/assets*100,
xcf.pct=xcf/assets*100)
count(stackw2, year, is.na(contrib))
filter(stackw2, is.na(contrib)) %>% glimpse # WY 1982 all gov contrib missing - is so in xls file
stackw2
glimpse(stackw2 %>% tail)
dfl <- stackw2 %>% gather(variable, value, -c(year, type, typef, stabbr, source))
saveRDS(dfl, paste0(cenretd, "styvl.rds"))
#************************************************************************************************************************
#
# Read 35-character files ####
#
#************************************************************************************************************************
read35 <- function(fn) {
}
fn <- paste0(u35web, "93ret35.txt")
wids <- c(2, 1, 3, 3, 3, 2, 3, 12, 2, 2, 2)
cn <- c("stcode", "admin", "cocode", "id", "suppcode", "subcode", "ic", "value", "syear", "dyear", "source")
# wids <- c(14, 3, 12, 2, 2, 2)
# cn <- c("id", "ic", "value", "syear", "dyear", "source")
unitdf <- read_fwf(fn, fwf_widths(wids, cn)) # , col_types="ccdiic"
# compare imputes and no imputes
# 2003ret35noimputes.txt 2003Ret35.txt YES, this looks right
# 2008indiv_unit_reported_data.txt 2008unit file.txt unit file has more recs but smaller $!
# 2009indiv_unit_reported_data.txt 2009unit file.txt same here - suggests latter is local govt admin systems
df1 <- read_fwf(paste0(u35web, "2009indiv_unit_reported_data.txt"), fwf_widths(wids, cn)) # , col_types="ccdiic"
df2 <- read_fwf(paste0(u35nonweb, "2009unit file.txt"), fwf_widths(wids, cn)) # , col_types="ccdiic"
df3 <- bind_rows(mutate(df1, src="noimp"), mutate(df2, src="imp"))
df3 %>% group_by(ic, src) %>%
summarise(n=n(), value=sum(value, na.rm=TRUE)) %>%
select(-n) %>%
spread(src, value) %>%
mutate(pct=noimp/imp*100)
# look at assets:
df <- read_fwf(paste0(u35web, "95ret35.txt"), fwf_widths(wids, cn), col_types="ciiciicdiic")
# count(df, ic) %>% data.frame
df %>% filter(ic=="Z81") %>% # assets Z81 benefits X11
summarise(n=n(), value=sum(value, na.rm=TRUE) / 1e6)
# Columns:
# units add to universe? YES/NO
# year
# Total assets:
# unit file name and (total assets from units, sum of Z81, $ billions)
# summary file name and (total assets reported, $ billions, AFF=American Fact Finder)
# if name ends in .html, it is url portion after http://www.census.gov/govs/retire/
# ??? 2013 2013indiv_unit_reported_data.txt(3278.489) AFF(3287.365)
# YES 2012 2012indiv_unit_reported_data.txt(3024.571) AFF(3024.603)
# NO 2011 2011indiv_unit_reported_data.txt(3011.601) 2011retest_data.xls(3050.401)
# NO 2010 2010indiv_unit_reported_data.txt(2646.949) 2010retest_data.xls(2671.233)
# NO 2009 2009indiv_unit_reported_data.txt(2372.848) 2009retest_data.xls(2415.671)
# NO 2008 2008indiv_unit_reported_data.txt(3076.251) 2008retest_data.xls(3130.390)
# NO 2007 2007indiv_unit_reported_data.txt(3293.399) 2007retest_data.xls(3305.379)
# NO 2006 2006ret35imputes.txt(2855.109) 2006retest_data.xls(2912.494)
# NO 2004 2004ret35noimputes.txt(2474.235) 2004retest_data.xls(2495.352)
# NO 2003 2003ret35noimputes.txt(2135.883) ret03t4.xls(2172.002)
# NO 2002 02ret35.txt(2118.191) ret02t4.xls(2157.210)
# YES 2001 01ret35.txt(2157.629) ret01t4.html(2157.629)
# YES 2000 00ret35.txt(2168.643) ret00t4.html(2168.643)
# ...
# YES 1995 95ret35.txt(1118.353) ret95t4.html(1118.353)
# YES 1994 94ret35.txt(1006.411) ret94t4.html(1006.411)
# ??? 1993 93ret35.txt(920.572) ret93t4.html(909.850)
# 1993 93ret3.txt X11 benefits 48.32786
# E:\Dropbox (Personal)\RPrograms PC\Packages\pdata\data-raw\Census\Units-35char\web-interface
E:/Dropbox (Personal)/RPrograms PC/Packages/pdata/data-raw/Census/Units-35char/web-interface/93ret35.txt
list.files(u35web)
starts <- c(1, 3, 5, 9)
ends <- c(2, 3, 8, 20)
df <- read_fwf(paste0(d35, fn), fwf_positions(starts, ends, col_names=c("stcode", "level", "ic", "value")),
col_types="cicd")
# 35-char format; NOTE that type code is DIFFERENT from historical database
# Unit ID (14 character ID number)
# State code (states & DC in alpha sequence) 1-2 2
# Type of government code: 3 1
# 0 = State
# 1 = County
# 2 = City
# 3 = Municipalities and Townships
# 4 = Special Districts
# 5 = Schools
# County Code 4-6 3
# Unit Identification Number 7-9 3
# Supp Code 10-12 3
# Sub Code 13-14 2
# Line Item code 15-17 3
# Amount (thousands) 18-29 12
# Survey year 30-31 2
# Year of Data 32-33 2
# Source 34-35 2
read_fwf(fn, widths=c(14,3,12,-2,2,2), col.names=c("govid","ic","value","datayear","source"), colClasses="character", n=-1) # skip survyear
for(year in 1993:2009){
fn<-getfn(year)
df<-read.fwf(paste(cendir,fn,sep=""), widths=c(14,3,12,-2,2,2), col.names=c("govid","ic","value","datayear","source"), colClasses="character", n=-1) # skip survyear
df$year<-year
df$datayear<-ifelse(df$datayear<"70",as.numeric(df$datayear)+2000,as.numeric(df$datayear)+1900)
df$value<-ctov(df$value)
stateid<-substr(df$govid,1,2)
df$stabbr<-as.character(factor(stateid,levels=stcen,labels=stabbr))
df$type<-as.numeric(substr(df$govid,3,3))
rm(stateid)
# print(head(df)); print(tail(df))
print(paste("Processing: ",year,sep=""))
dfname<-paste("retsys",year,sep="")
assign(dfname,df)
save(list=dfname,file=paste(cendir,dfname,".RData",sep="")) # must use list= to use data frame name as character
}
#************************************************************************************************************************
#
# Summarise, play, etc ####
#
#************************************************************************************************************************
dfl <- readRDS(paste0(cenretd, "styvl.rds")) # state tax year variable long
glimpse(dfl)
glimpse(spop)
gdppi
dfl2 <- dfl %>% mutate(pop=spop$pop[match(paste(year, stabbr), paste(spop$year, spop$stabbr))],
valpc=value/pop,
rvalue=value*gdppi$value[gdppi$year==2014]/gdppi$value[match(year, gdppi$year)],
rvalpc=rvalue/pop)
glimpse(tail(dfl2, 30))
vars <- c("contrib", "expend", "xcf")
st <- "NJ"
dfl2 %>% filter(stabbr==st, type==1, variable %in% vars) %>%
filter(!year %in% 2004:2006) %>%
qplot(year, rvalpc, data=., colour=variable, geom=c("point", "line"), main=paste0(st, ": pension cash flow")) + geom_hline(y=0)
count(dfl, type, typef)
vars <- c("contrib.pct", "expend.pct", "xcf.pct")
vars <- c("contrib", "expend", "xcf")
dfl %>% filter(stabbr=="US", type==1, variable %in% vars) %>%
filter(!year %in% 2004:2006) %>%
qplot(year, value/1e6, data=., colour=variable, geom=c("point", "line")) + geom_hline(y=0)
vars <- c("contrib.pct", "expend.pct", "xcf.pct")
vars <- c("contrib", "expend", "xcf")
dfl2 %>% filter(year==2013, type==1, variable %in% vars) %>%
select(stabbr, year, type, variable, rvalpc) %>%
spread(variable, rvalpc) %>%
arrange(desc(xcf))
df <- readRDS(paste0(cenretd, "sysyearvar.rds"))
df %>% group_by(year, variable) %>%
summarise(value=sum(value, na.rm=TRUE)) %>%
filter(variable=="assets")
df %>% group_by(year, variable) %>%
summarise(value=sum(value, na.rm=TRUE)) %>%
spread(variable, value)
df %>% group_by(year, variable) %>%
summarise(value=sum(value, na.rm=TRUE)) %>%
spread(variable, value)
# get state per capita summaries
dfs <- df %>% group_by(year, variable, stabbr) %>%
summarise(value=sum(value, na.rm=TRUE)) %>%
left_join(select(spop, year, stabbr, pop)) %>%
mutate(valpc=value/pop)
dfs
count(dfs, variable)
var <- "eec" # benefits erc eec
dfs %>% filter(year==2012, variable==var) %>%
arrange(desc(valpc))
dfs %>% filter(year==2012, variable %in% c("eec", "erc")) %>%
select(stabbr, variable, value) %>%
spread(variable, value) %>%
mutate(eecpct=eec/(eec+erc)*100) %>%
arrange(desc(eecpct))
dfs %>% filter(stabbr=="NY", variable=="benefits") %>%
arrange(desc(valpc))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.