idiosyncratics:

library(tidyverse)
library(idiosyncratics)

set.seed(403)

N <- 25
T <- 5

# it has a growth rate, but the real data won't.
asm <- fakedata::fake_panel(N = N, T = T, I = 2, missing = 0.01)
a <- asm %>% rename(industry = I, g_i = g)

# it has a growth rate, but the real data won't.
a <- a %>% add_weights(x_var = sales) %>% lag_panel(x_var = w, l_var = w)
a <- a %>% add_growth_rates(x_var = sales, drop = TRUE)
a <- a %>% add_quantiles(g_var = 'g_i') %>% drop_outliers(g_var = g_i, method = 'winsorize')
g <- a %>% group_by(year) %>% summarize(g = sum(w * g, na.rm = TRUE)) %>% tibble::rownames_to_column(var = 'index')
a <- a %>% balance_panel(id, year)

# need balance
tbl <- a

# that's to drop the 0 column; might need to deal with that in a different manner, pre-this

# need to convert years indices to years?
X <- get_matrix(a, id, year, g_i)
W <- get_matrix(a, id, year, w)
# pass in X, a



# wrapper to calculate icpx_pca? icpx_pca
# map, keep names
# bind_rows(
# xz <- purrr::map(icp$lhatr, predict_using_factors, X = X, W = W, pca = icp$pca) #, .id = "type" )
xz <- icpx_pca(X, W)
# ok, call things, put it all together, with type and year and everything else I need.
xy <- list('sparse' = sparse_pca(X, W))
# components <- bind_rows(xz, xy, .id = 'type') %>%
#   left_join(g %>% select(index, year), by = c('year' = 'index')) %>%
#   select(-year) %>% rename(year = year.y) %>%
#   mutate(type = ifelse(type == 'sparse', "Sparse", type))

components <- bind_rows(xz, xy, .id = 'type') %>%
  left_join(g, by = c('year' = 'index')) %>%
  select(-year) %>%
  rename(year = year.y, e = value) %>%
  mutate(type = ifelse(type == 'sparse', "Sparse", type))


tidy_components <- bind_rows(components %>% select(-g),
          g %>% mutate(index = 'Aggregate') %>% rename(e = g, type = index))

tidy_components
# %>%
#   select(year = year.y, type, value)
# # and do a correct legend + scale
ggplot(tidy_components,
       aes(x = year, y = e, colour = type)) +
  geom_line() +
  labs(x = "Year", y = "Growth") +
  scale_colour_discrete(name="No. fctr")

# ,
#                         breaks=c("e", "g", "m"),
#                         labels=c("Idiosyncratic", "Aggregate", "Common"))

# maybe this., list-col strat.
# bind_rows(enframe(xz), enframe(xy))

lms <- . %>% lm(g ~ e, data = .) %>% summary() %>% .$r.squared
dlm <- function(tbl) {
  cov <- tbl %>% select(one_of(c('e', 'm', 'g'))) %>% var()
  (cov / cov[dim(cov)[1], dim(cov)[2]]) %>% diag() %>% sqrt()
}

# hmm.
tab <- left_join(split(components, components$type) %>% map_dbl(lms) %>% round(2) %>% enframe(),
                 split(components, components$type) %>% map(dlm) %>% map_dbl("e") %>% round(2) %>% enframe(), by = 'name')
names(tab) <- c('No. of factors', 'e_t, R^2', 'e_t, DLM')
tab %>% knitr::kable()


tab <- cbind(components %>% lms() %>% t(), components %>% dlm() %>% t()) %>% tbl_df()
names(tab) <- c("R^2", "e_t", "m_t", "g_t")
kable(tab %>% round(2), caption = 'Contribution to aggregate growth')


# do separate vignettes for PCA and the regular things?

# also do that for all the "datasets"

# ok, then what. get the other stuff, the printing things.

xxx
# fakedata::fake_panel(N = N, T = T, I = 2)

# imap!

set.seed(403)

z <- fakedata::fake_panel(N = 50, T = 50, I = 2, missing = 0.1)
z %>% rename(industry = I, g_i = g) %>% add_weights() %>% View()
# ugh, missing stuff. can't just drop things, can't just calculate weights,
# need to be sequential.



l <- 10
tbl <- tibble(N = floor(seq(50, 2500, length.out = l)), #seq(10, 50, 10),
       T = rep_len(50, l))
tbl

a <- bind_cols(tbl, tbl %>% { map2(.$N, .$T, .f = fakedata::fake_panel, I = 2, missing = 0.1)} %>% enframe()) %>% select(-name) %>% rename(data = value)

b <- a %>%
  mutate(data = data %>% map(rename, industry = I, g_i = g)) %>%
  mutate(data = map(data, add_weights)) %>%
  mutate(data = map(data, add_quantiles, g_var = 'g_i')) %>%
  mutate(data = map(data, drop_outliers, g_var = 'g_i', method = 'drop')) %>%
  mutate(data = map(data, demean, g_var = 'g_i'))
c <- b %>%
  mutate(comps = map(data, aggregate_components, g_var = 'g_i', w_var = 'lag_w'))

c
# ok, now do lm summary stuff I guess.

# then write the pca function.

# asm <- asm %>% rename(industry = I, g_i = g)
# asm <- asm %>%
#   add_weights() %>% # add lagged weights to data
#   add_quantiles(g_var = 'g_i') %>% # add quantiles to data
#   drop_outliers(g_var = 'g_i', method = 'drop') # drop outliers based on quantiles

# then what.
# asm <- asm %>% demean(g_var = 'g_i')
# components <- asm %>% aggregate_components(g_var = 'g_i', w_var = 'lag_w')

# what does the table look like. just need results from each thing.

lms <- . %>% lm(g2 ~ e, data = .) %>% summary() %>% .$r.squared
dlm <- function(df) {
  cov <- df %>% select(one_of(c('e', 'm', 'g2'))) %>% var()
  (cov / cov[dim(cov)[1], dim(cov)[2]]) %>% diag() %>% sqrt()
}

c %>% mutate(r2 = map_dbl(comps, lms),
             dlm = map(comps, dlm)) %>% .$dlm

c[[1,"comps"]]