idiosyncratics:

# Can we predict the aggregate growth rate using a subsample of
# individual firms? Suppose we only sampled the top 10 largest firms
# in the economy, and used their weighted growth rate as a way to
# predict the aggregate growth rate?

library(idiosyncratics)
library(tidyverse)
library(MLmetrics)

l <- 3
tbl <- tibble(N = floor(seq(50, 5000, length.out = l)), #seq(10, 50, 10),
              T = rep_len(50, l))
tbl

a <- bind_cols(tbl, tbl %>% { map2(.$N, .$T, .f = fakedata::fake_panel, I = 2, mean_sales = 30, sd_sales = 1 * 30, model = list(order = c(1,0,1), ar = 0.01, ma = 0.01), missing = 0)} %>% enframe()) %>% select(-name) %>% rename(data = value)

add_aggregate <- . %>% group_by(year) %>% mutate(g_a = sum(w * g, na.rm = TRUE))

b <- a %>%
  mutate(data = data %>% map(rename, industry = I)) %>%
  mutate(data = map(data, add_weights, x_var = sales)) %>%
  mutate(data = map(data, lag_panel, x_var = w)) %>%
  mutate(data = map(data, add_aggregate)) %>%
  mutate(data = map(data, add_quantiles, g_var = 'g')) %>%
  mutate(data = map(data, drop_outliers, g_var = 'g', method = 'drop'))

slice_k <- function(k, tbl, weight = TRUE, ...) {
  library(dplyr)
  grouping <- quos(...)

  tbl <- tbl %>%
    group_by(!!!grouping) %>%
    filter(!is.na(w)) %>%
    arrange(year, -w) %>%
    slice(1:k) %>%
    group_by(year) %>% # also by year, annoying.
    mutate(w = w / sum(w, na.rm = TRUE)) %>% # re-weight
    summarize(g = ifelse(weight == TRUE, sum(w * g, na.rm = TRUE),
                                         mean(g, na.rm = TRUE)),
              g_a = mean(g_a, na.rm = TRUE))

  MLmetrics::RMSE(tbl$g, tbl$g_a) / MLmetrics::RMSE(lag(tbl$g_a)[-1], tbl$g_a[-1])
}

slice_1_k <- function(data) {
  l <- 1:10
  names(l) <- l
  map_dbl(l, slice_k, tbl = data, weight = FALSE, year) %>% enframe()
}
c <- b %>% mutate(rrmse = map(data, slice_1_k))
c

gr <- c[, c('N','T','rrmse')] %>% unnest() %>% mutate(k = as.numeric(name), rrmse = value, N = factor(N)) # id gives something.
# gr %>% mutate(rrmse = rmse / MLmetrics::RMSE(gg$g_a[-1], lag(gg$g_a)[-1]))
ggplot(gr, aes(x = k, y = rrmse, colour = N, group = N)) + geom_line() + geom_point() #+ ylim(0, 1)
xxx
# MLmetrics::RMSE(y_pred, y_true)
# ??

# ok.

# MLmetrics::RMSE(gg$g_a[-1], lag(gg$g_a)[-1])
# now do all the plots relative to this.
# y <- arima.sim(model = list(order = c(1,0,1), ar = 0.1, ma = 0.1), n = 25) / 100
# y <- y %>% enframe() %>% mutate(lv = lag(value))
# lm(value ~ lv, data = y) %>% summary()

data
# WHAT.
xxx
# ok.
# c %>% .[[3,'r2']]

# names? enframe? will need to double-check names. or just get list of names.

# q <- b[[1,'data']]
# slice_k(2, q, weight = TRUE, year)
# map_dbl(l, slice_k, tbl = q, weight = TRUE, year) %>% enframe()


# qq <- q %>%
#   group_by(year) %>%
#   filter(!is.na(w)) %>%
#   arrange(year, -w) %>%
#   slice(1:1) %>%
#   group_by(year) %>% # also by year, annoying.
#   mutate(w = w / sum(w, na.rm = TRUE)) %>% # re-weight
#   summarize(g = sum(w * g, na.rm = TRUE),
#             g_a = mean(g_a, na.rm = TRUE))
#
# SSR <- (qq$g_a - mean(qq$g_a, na.rm = TRUE) - qq$g + mean(qq$g, na.rm = TRUE))^2 %>% sum()
# SST <- (qq$g_a - mean(qq$g_a, na.rm = TRUE))^2 %>% sum()
# 1 - SSR / SST # r-squared ish thing?
# MLmetrics::RMSE(qq$g_a, qq$g) # ok.
# ????????

# lm(g_a ~ g , data = qq) %>% summary()
#
# xxxs

# as.df <- function(k, FUN, df, weight = TRUE, name) {
#   df_ret <- tibble::tibble(k = c(1:k), type = name)
#
#   cl <- makeCluster(8)
#   df_ret$r2 <- parLapply(cl, 1:k, FUN, df = df, weight = weight) %>% unlist()
#   stopCluster(cl)
#   return(df_ret)
# }


# library(parallel)
# N <- 100
# cl <- makeCluster(2)
# w <- parLapply(cl, z, g)
# clusterMap(cl, as.df, df = list(a73, a00, t2), name = list("w a7", "w a0", "w t2"), MoreArgs = list(k = N, FUN = by_year), SIMPLIFY = FALSE)
# stopCluster(cl)
# xxx

# now change this.


# system.time(x <- mapply(as.df, df = list(a73, a00, t2), name = list("w a7", "w a0", "w t2"), MoreArgs = list(k = N, FUN = by_year), SIMPLIFY = FALSE))
# System.time(x <- mapply(as.df, df = list(a73, a00, t2), name = list("w a7", "w a0", "w t2"), MoreArgs = list(k = N, FUN = by_year), SIMPLIFY = FALSE))

# bind_rows(x)
# Reduce(rbind, x) %>% ggplot(aes(x = k, y = r2, colour = type)) + geom_line() + geom_point() + ylim(0, 1)

# t2 %>%
#   group_by(year) %>%
#   arrange(year, -l.wit) %>%
#   slice(1:100) %>%
#   select(id, year, l.wit, git, g_t) %>%
#   mutate(l.wit = l.wit / sum(l.wit, na.rm = TRUE)) %>%
#   summarize(git = ifelse(TRUE == TRUE, sum(l.wit * git, na.rm = TRUE)),
#             mean(git, na.rm = TRUE),
#             g_t = mean(g_t, na.rm = TRUE)) %>%
#   arrange(year) %>%
#   lm(g_t ~ git + lag(git), data = .) %>%
#   summary()

# # Ni <- floor(N / 22)
# # dfi <- tibble(k = c(1:Ni)*22, type = "ind weighted")
# # dfi$r2 <- lapply(1:Ni, by_year_industry, df = a73, weight = TRUE) %>% unlist()

# ok, what next. try to use that info to predict? or use on bigger sample etc?