# idiosyncratics:

# Plan:
# 1. f <- compare the weighted average of growth rates to the aggregate growth
#    rate (sd of the difference? something like an SSR).
# 2. Loop over k from 1 to K (or whatever upper bound makes sense).
# 3. Graph the results.
library(ggplot2)

# get aggregate growth rates
# start with a73 only.
# Short working aliases for the three input panels used below. asm73, asm00
# and t2x are not defined in the visible part of this script -- presumably
# they are loaded by an earlier step (TODO confirm).
a73 <- asm73
# The trailing comment is a disabled filter that would drop 2004 and 2012.
a00 <- asm00 #%>% filter(!(year %in% c(2004,2012)))
# The trailing comment is a disabled per-year mean of g_t for a73.
t2 <- t2x # a73g <- a73 %>% group_by(year) %>% summarize(gt = mean(g_t, na.rm = TRUE))


# lms <- . %>% lm(g_t ~ git, data = .) %>% summary() %>% .$r.squared
#
# lms_lag <- . %>% arrange(year) %>% lm(g_t ~ git + lag(git), data = .) %>% summary() %>% .$r.squared

#' R-squared of yearly mean `g_t` on the top-k rows' combined `git`
#'
#' Within every year, keeps the `k` rows with the largest `l.wit`, collapses
#' their `git` into a single yearly value, and regresses the yearly mean of
#' `g_t` on that value.
#'
#' @param k Number of rows to keep per year, ranked by descending `l.wit`.
#' @param df Data frame with columns `id`, `year`, `l.wit`, `git` and `g_t`.
#' @param weight If `TRUE` (default), combine `git` as a sum weighted by
#'   `l.wit` renormalised over the kept rows of each year; if `FALSE`, use the
#'   plain mean of `git` over the kept rows.
#' @return The R-squared of `lm(g_t ~ git)` fitted on the one-row-per-year
#'   summary.
by_year <- function(k, df, weight = TRUE) {
  # Attached inside the function so the dplyr verbs and %>% are also available
  # when this function is run on parallel workers (as.df passes it to
  # parLapply).
  library(dplyr)

  # Choose the aggregation once, outside the pipeline. The original wrote
  # ifelse(weight == TRUE, <weighted sum>) with the parenthesis closed too
  # early: the unweighted mean became a stray unnamed column and
  # weight = FALSE failed because ifelse() had no `no` argument.
  collapse_git <- if (weight) {
    function(w, g) sum(w * g, na.rm = TRUE)
  } else {
    function(w, g) mean(g, na.rm = TRUE)
  }

  yearly <- df %>%
    group_by(year) %>%
    arrange(year, -l.wit) %>%
    slice(seq_len(k)) %>%
    select(id, year, l.wit, git, g_t) %>%
    # Renormalise so the kept rows' weights sum to one within each year.
    mutate(l.wit = l.wit / sum(l.wit, na.rm = TRUE)) %>%
    summarize(
      git = collapse_git(l.wit, git),
      g_t = mean(g_t, na.rm = TRUE)
    )

  summary(lm(g_t ~ git, data = yearly))$r.squared
}

#' R-squared of yearly mean `g_t` on the top-k-per-industry combined `git`
#'
#' Same idea as `by_year()`, but the `k` largest rows (by `l.wit`) are taken
#' within every year x two-character industry prefix, and the kept rows are
#' then pooled by year.
#'
#' @param k Number of rows to keep per year and industry prefix.
#' @param df Data frame with columns `id`, `year`, `industry`, `l.wit`, `git`
#'   and `g_t`.
#' @param weight If `TRUE` (default), combine `git` as a sum weighted by
#'   `l.wit` renormalised over the kept rows of each year; if `FALSE`, use the
#'   plain mean of `git` over the kept rows.
#' @return The R-squared of `lm(g_t ~ git)` fitted on the one-row-per-year
#'   summary.
by_year_industry <- function(k, df, weight = TRUE) {
  # Attached here, as in by_year(), so the function is self-contained.
  library(dplyr)

  # Same fix as in by_year(): the original ifelse() call was closed before its
  # `no` argument, so weight = FALSE could not work.
  collapse_git <- if (weight) {
    function(w, g) sum(w * g, na.rm = TRUE)
  } else {
    function(w, g) mean(g, na.rm = TRUE)
  }

  yearly <- df %>%
    # Base substr() replaces stringr::str_sub(); stringr is never loaded in
    # the visible script and the two agree for a fixed 1..2 character range.
    mutate(industry = substr(as.character(industry), 1, 2)) %>%
    group_by(year, industry) %>%
    arrange(year, industry, -l.wit) %>%
    slice(seq_len(k)) %>%
    ungroup() %>%
    # The original selected `gt`, but the summary below needs `g_t`.
    select(id, year, industry, l.wit, git, g_t) %>%
    group_by(year) %>%
    # Renormalise so the kept rows' weights sum to one within each year.
    mutate(l.wit = l.wit / sum(l.wit, na.rm = TRUE)) %>%
    summarize(
      git = collapse_git(l.wit, git),
      g_t = mean(g_t, na.rm = TRUE)
    )

  # Fitted inline: the original called lms(), which is not defined in this
  # function's scope.
  summary(lm(g_t ~ git, data = yearly))$r.squared
}


# now want to save the ks, and the resulting R^2, and graph them somehow.
#' Evaluate `FUN` for every k in 1..k in parallel and tabulate the results
#'
#' @param k Largest value to evaluate; `FUN` is called once for each of
#'   `1, ..., k`. Must be at least 1.
#' @param FUN Function called on the workers as `FUN(i, df = df,
#'   weight = weight)`; expected to return one value per call (e.g.
#'   `by_year`).
#' @param df Data frame forwarded to `FUN`.
#' @param weight Forwarded to `FUN`.
#' @param name Label stored in the `type` column of the result.
#' @param n_workers Number of worker processes to start (default 8, the value
#'   that was previously hard-coded).
#' @return A tibble with columns `k`, `type` and `r2`, one row per k.
as.df <- function(k, FUN, df, weight = TRUE, name, n_workers = 8) {
  stopifnot(length(k) == 1, k >= 1)
  ks <- seq_len(k)

  cl <- parallel::makeCluster(n_workers)
  # Registered immediately so the workers are shut down even if FUN fails on
  # one of them; the original leaked the cluster in that case.
  on.exit(parallel::stopCluster(cl), add = TRUE)

  per_k <- parallel::parLapply(cl, ks, FUN, df = df, weight = weight)

  tibble::tibble(k = ks, type = name, r2 = unlist(per_k))
}


library(parallel)
# Largest k to evaluate: every k in 1..N is scored for each panel.
N <- 100

# Unused alternative that parallelises across panels instead of across k:
# cl <- makeCluster(2)
# clusterMap(cl, as.df, df = list(a73, a00, t2), name = list("w a7", "w a0", "w t2"), MoreArgs = list(k = N, FUN = by_year), SIMPLIFY = FALSE)
# stopCluster(cl)

# Run the sweep once per panel; x ends up as a list of three result tables and
# system.time() reports how long the whole sweep took.
system.time(
  x <- mapply(
    as.df,
    df = list(a73, a00, t2),
    name = list("w a7", "w a0", "w t2"),
    MoreArgs = list(k = N, FUN = by_year),
    SIMPLIFY = FALSE
  )
)

# Stack the per-panel tables (bind_rows(x) would do the same) and plot R^2
# against k, one coloured line per panel.
ggplot(do.call(rbind, x), aes(x = k, y = r2, colour = type)) +
  geom_line() +
  geom_point() +
  ylim(0, 1)

# One-off check on t2: the weighted top-100 series from by_year(), but with
# last year's value added as a second regressor; prints the full lm summary.
t2 %>%
  group_by(year) %>%
  arrange(year, -l.wit) %>%
  slice(seq_len(100)) %>%
  select(id, year, l.wit, git, g_t) %>%
  # Renormalise so the kept rows' weights sum to one within each year.
  mutate(l.wit = l.wit / sum(l.wit, na.rm = TRUE)) %>%
  # The original wrapped the weighted sum in ifelse(TRUE == TRUE, ...) with
  # the parenthesis closed early, which left a stray unnamed mean(git) column;
  # only the weighted sum was ever used, so it is written directly.
  summarize(git = sum(l.wit * git, na.rm = TRUE),
            g_t = mean(g_t, na.rm = TRUE)) %>%
  # Rows must be in year order for the lag to mean "previous year".
  arrange(year) %>%
  # lag() must resolve to dplyr::lag here; stats::lag does not shift a plain
  # vector, so dplyr has to be attached.
  lm(g_t ~ git + lag(git), data = .) %>%
  summary()

# # Ni <- floor(N / 22)
# # dfi <- tibble(k = c(1:Ni)*22, type = "ind weighted")
# # dfi$r2 <- lapply(1:Ni, by_year_industry, df = a73, weight = TRUE) %>% unlist()

# ok, what next. try to use that info to predict? or use on bigger sample etc?