# Process data
# Jesse Tweedle
# September 14, 2017
# get both ASMs, and the other data. Hmm, it's .dta.
rm(list=ls(all=TRUE))
gc()
library(dplyr)
library(readr)
# could also use asm9010
asm7399 <- read_csv('data/asm-1973-1999.csv')
asm0012 <- read_csv('data/asm-2000-2012.csv') %>% filter(!grepl('T', SRSN7))
t2 <- read_csv('data/growth_rate_data.csv')
# put into common format
# which is: id, industry, year, sales
asm73 <- asm7399 %>% select(id = rsn7, year = yr4, industry = sic80, sales = vst)
asm00 <- asm0012 %>% select(id = SRSN7, year = YR4, industry = NAICS, sales = VST)
t2 <- t2 %>% select(id = firm_id, year = year, industry = naics, sales = sales_this_yr)
# functions.
aggregate <- . %>% group_by(year) %>%
summarize(sales = sum(as.numeric(sales), na.rm = TRUE)) %>%
transmute(year = year, g_t = sales / lag(sales, order_by = year) - 1)
clean <- . %>% filter(!is.na(sales) & sales > 0, !is.na(industry)) #%>%
# mutate(industry = industry %>% as.factor(),
# year = year %>% as.factor())
weights <- . %>% group_by(year) %>%
mutate(wit = sales / sum(as.numeric(sales))) %>%
group_by(id) %>%
arrange(id, year) %>%
mutate(l.wit = lag(wit, order_by = year)) %>%
select(-wit)
growth_rates <- . %>% group_by(id) %>%
arrange(id, year) %>%
# mutate(git = sales / lag(sales, order_by = year) - 1) %>%
mutate(git = log(sales) - log(lag(sales, order_by = year))) %>%
filter(!is.na(git))
alpha <- 0.025
quantiles <- . %>% group_by(year) %>% # should be by group_by(industry, year) in the future; not sure this is really grouping when I also use .$
mutate(low = quantile(git, probs = c(alpha), na.rm = TRUE),
high = quantile(git, probs = c(1-alpha), na.rm = TRUE))
# t2 %>% group_by(industry, year) %>% mutate(hi = quantile(git, probs = c(0.05), na.rm = TRUE))
# t2 %>% quantiles()
# well, I'm dropping them now, but could
winsorize <- . %>% mutate(git = ifelse(git < low, low, git),
git = ifelse(git > high, high, git)) %>% select(-low,-high)
drop_outliers <- . %>% filter(git > low & git < high) %>% select(-low,-high)
# put all the steps together
process <- . %>% clean() %>% weights() %>% growth_rates() %>% quantiles() %>% winsorize()
# asm73 <- left_join(asm73, asm73 %>% aggregate()) %>% clean() %>% weights() %>% growth_rates() %>% quantiles() %>% winsorize()
# asm00 <- left_join(asm00, asm00 %>% aggregate()) %>% clean() %>% weights() %>% growth_rates() %>% quantiles() %>% winsorize()
# t2 <- left_join(t2, t2 %>% aggregate()) %>% clean() %>% weights() %>% growth_rates() %>% quantiles() %>% winsorize()
asm73 <- left_join(asm73, asm73 %>% aggregate()) %>% process()
asm00 <- left_join(asm00, asm00 %>% aggregate()) %>% process()
t2x <- left_join(t2, t2 %>% aggregate()) %>% process()
# etc
# that's it. write these to csv.
# write_csv(asm73, "data/asm7399-clean.csv")
# write_csv(asm00, "data/asm0012-clean.csv")
# write_csv(t2, "data/t2-clean.csv")
# then more files:
# 2. FE function that returns common + idiosyncratic?
# 3. Common factor function that returns ""
# 4. LASSO function that returns ""
# 5. one that analyzes the stuff.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.