idiosyncratics:

# Process data
# Jesse Tweedle
# September 14, 2017

# get both ASMs, and the other data. Hmm, it's .dta.
rm(list=ls(all=TRUE))
gc()
library(dplyr)
library(readr)

# could also use asm9010
asm7399 <- read_csv('data/asm-1973-1999.csv')
asm0012 <- read_csv('data/asm-2000-2012.csv') %>% filter(!grepl('T', SRSN7))
t2 <- read_csv('data/growth_rate_data.csv')

# put into common format
# which is: id, industry, year, sales
asm73 <- asm7399 %>% select(id = rsn7, year = yr4, industry = sic80, sales = vst)
asm00 <- asm0012 %>% select(id = SRSN7, year = YR4, industry = NAICS, sales = VST) 
t2 <- t2 %>% select(id = firm_id, year = year, industry = naics, sales = sales_this_yr)

# functions.
aggregate <- . %>% group_by(year) %>% 
  summarize(sales = sum(as.numeric(sales), na.rm = TRUE)) %>% 
  transmute(year = year, g_t = sales / lag(sales, order_by = year) - 1) 

clean <- . %>%  filter(!is.na(sales) & sales > 0, !is.na(industry)) #%>%
  # mutate(industry = industry %>% as.factor(),
  #        year = year %>% as.factor())

weights <- . %>% group_by(year) %>%
  mutate(wit = sales / sum(as.numeric(sales))) %>% 
  group_by(id) %>% 
  arrange(id, year) %>% 
  mutate(l.wit = lag(wit, order_by = year)) %>%
  select(-wit)

growth_rates <- . %>% group_by(id) %>%
  arrange(id, year) %>% 
  # mutate(git = sales / lag(sales, order_by = year) - 1) %>% 
  mutate(git = log(sales) - log(lag(sales, order_by = year))) %>% 
  filter(!is.na(git))

alpha <- 0.025
quantiles <- . %>% group_by(year) %>% # should be by group_by(industry, year) in the future; not sure this is really grouping when I also use .$
  mutate(low = quantile(git, probs = c(alpha), na.rm = TRUE), 
         high = quantile(git, probs = c(1-alpha), na.rm = TRUE))

# t2 %>% group_by(industry, year) %>% mutate(hi = quantile(git, probs = c(0.05), na.rm = TRUE))
# t2 %>% quantiles()

# well, I'm dropping them now, but could 
winsorize <- . %>% mutate(git = ifelse(git < low, low, git),
                          git = ifelse(git > high, high, git)) %>% select(-low,-high)
drop_outliers <- . %>% filter(git > low & git < high) %>% select(-low,-high)

# put all the steps together
process <- . %>% clean() %>% weights() %>%  growth_rates() %>% quantiles() %>% winsorize()

# asm73 <- left_join(asm73, asm73 %>% aggregate()) %>% clean() %>% weights() %>%  growth_rates() %>% quantiles() %>% winsorize()
# asm00 <- left_join(asm00, asm00 %>% aggregate()) %>% clean() %>% weights() %>% growth_rates() %>% quantiles() %>% winsorize()
# t2 <- left_join(t2, t2 %>% aggregate()) %>% clean() %>% weights() %>% growth_rates() %>% quantiles() %>% winsorize()
asm73 <- left_join(asm73, asm73 %>% aggregate()) %>% process()
asm00 <- left_join(asm00, asm00 %>% aggregate()) %>% process()
t2x <- left_join(t2, t2 %>% aggregate()) %>% process()

# etc
# that's it. write these to csv.
# write_csv(asm73, "data/asm7399-clean.csv")
# write_csv(asm00, "data/asm0012-clean.csv")
# write_csv(t2, "data/t2-clean.csv")

# then more files:
# 2. FE function that returns common + idiosyncratic?
# 3. Common factor function that returns ""
# 4. LASSO function that returns ""
# 5. one that analyzes the stuff.