# Can we predict the aggregate growth rate using a subsample of
# individual firms? Suppose we only sampled the top 10 largest firms
# in the economy, and used their weighted growth rate as a way to
# predict the aggregate growth rate?
library(idiosyncratics)
library(tidyverse)
library(MLmetrics)
l <- 3
tbl <- tibble(N = floor(seq(50, 5000, length.out = l)), #seq(10, 50, 10),
T = rep_len(50, l))
tbl
a <- bind_cols(tbl, tbl %>% { map2(.$N, .$T, .f = fakedata::fake_panel, I = 2, mean_sales = 30, sd_sales = 1 * 30, model = list(order = c(1,0,1), ar = 0.01, ma = 0.01), missing = 0)} %>% enframe()) %>% select(-name) %>% rename(data = value)
add_aggregate <- . %>% group_by(year) %>% mutate(g_a = sum(w * g, na.rm = TRUE))
b <- a %>%
mutate(data = data %>% map(rename, industry = I)) %>%
mutate(data = map(data, add_weights, x_var = sales)) %>%
mutate(data = map(data, lag_panel, x_var = w)) %>%
mutate(data = map(data, add_aggregate)) %>%
mutate(data = map(data, add_quantiles, g_var = 'g')) %>%
mutate(data = map(data, drop_outliers, g_var = 'g', method = 'drop'))
slice_k <- function(k, tbl, weight = TRUE, ...) {
library(dplyr)
grouping <- quos(...)
tbl <- tbl %>%
group_by(!!!grouping) %>%
filter(!is.na(w)) %>%
arrange(year, -w) %>%
slice(1:k) %>%
group_by(year) %>% # also by year, annoying.
mutate(w = w / sum(w, na.rm = TRUE)) %>% # re-weight
summarize(g = ifelse(weight == TRUE, sum(w * g, na.rm = TRUE),
mean(g, na.rm = TRUE)),
g_a = mean(g_a, na.rm = TRUE))
MLmetrics::RMSE(tbl$g, tbl$g_a) / MLmetrics::RMSE(lag(tbl$g_a)[-1], tbl$g_a[-1])
}
slice_1_k <- function(data) {
l <- 1:10
names(l) <- l
map_dbl(l, slice_k, tbl = data, weight = FALSE, year) %>% enframe()
}
c <- b %>% mutate(rrmse = map(data, slice_1_k))
c
gr <- c[, c('N','T','rrmse')] %>% unnest() %>% mutate(k = as.numeric(name), rrmse = value, N = factor(N)) # id gives something.
# gr %>% mutate(rrmse = rmse / MLmetrics::RMSE(gg$g_a[-1], lag(gg$g_a)[-1]))
ggplot(gr, aes(x = k, y = rrmse, colour = N, group = N)) + geom_line() + geom_point() #+ ylim(0, 1)
xxx
# MLmetrics::RMSE(y_pred, y_true)
# ??
# ok.
# MLmetrics::RMSE(gg$g_a[-1], lag(gg$g_a)[-1])
# now do all the plots relative to this.
# y <- arima.sim(model = list(order = c(1,0,1), ar = 0.1, ma = 0.1), n = 25) / 100
# y <- y %>% enframe() %>% mutate(lv = lag(value))
# lm(value ~ lv, data = y) %>% summary()
data
# WHAT.
xxx
# ok.
# c %>% .[[3,'r2']]
# names? enframe? will need to double-check names. or just get list of names.
# q <- b[[1,'data']]
# slice_k(2, q, weight = TRUE, year)
# map_dbl(l, slice_k, tbl = q, weight = TRUE, year) %>% enframe()
# qq <- q %>%
# group_by(year) %>%
# filter(!is.na(w)) %>%
# arrange(year, -w) %>%
# slice(1:1) %>%
# group_by(year) %>% # also by year, annoying.
# mutate(w = w / sum(w, na.rm = TRUE)) %>% # re-weight
# summarize(g = sum(w * g, na.rm = TRUE),
# g_a = mean(g_a, na.rm = TRUE))
#
# SSR <- (qq$g_a - mean(qq$g_a, na.rm = TRUE) - qq$g + mean(qq$g, na.rm = TRUE))^2 %>% sum()
# SST <- (qq$g_a - mean(qq$g_a, na.rm = TRUE))^2 %>% sum()
# 1 - SSR / SST # r-squared ish thing?
# MLmetrics::RMSE(qq$g_a, qq$g) # ok.
# ????????
# lm(g_a ~ g , data = qq) %>% summary()
#
# xxxs
# as.df <- function(k, FUN, df, weight = TRUE, name) {
# df_ret <- tibble::tibble(k = c(1:k), type = name)
#
# cl <- makeCluster(8)
# df_ret$r2 <- parLapply(cl, 1:k, FUN, df = df, weight = weight) %>% unlist()
# stopCluster(cl)
# return(df_ret)
# }
# library(parallel)
# N <- 100
# cl <- makeCluster(2)
# w <- parLapply(cl, z, g)
# clusterMap(cl, as.df, df = list(a73, a00, t2), name = list("w a7", "w a0", "w t2"), MoreArgs = list(k = N, FUN = by_year), SIMPLIFY = FALSE)
# stopCluster(cl)
# xxx
# now change this.
# system.time(x <- mapply(as.df, df = list(a73, a00, t2), name = list("w a7", "w a0", "w t2"), MoreArgs = list(k = N, FUN = by_year), SIMPLIFY = FALSE))
# System.time(x <- mapply(as.df, df = list(a73, a00, t2), name = list("w a7", "w a0", "w t2"), MoreArgs = list(k = N, FUN = by_year), SIMPLIFY = FALSE))
# bind_rows(x)
# Reduce(rbind, x) %>% ggplot(aes(x = k, y = r2, colour = type)) + geom_line() + geom_point() + ylim(0, 1)
# t2 %>%
# group_by(year) %>%
# arrange(year, -l.wit) %>%
# slice(1:100) %>%
# select(id, year, l.wit, git, g_t) %>%
# mutate(l.wit = l.wit / sum(l.wit, na.rm = TRUE)) %>%
# summarize(git = ifelse(TRUE == TRUE, sum(l.wit * git, na.rm = TRUE)),
# mean(git, na.rm = TRUE),
# g_t = mean(g_t, na.rm = TRUE)) %>%
# arrange(year) %>%
# lm(g_t ~ git + lag(git), data = .) %>%
# summary()
# # Ni <- floor(N / 22)
# # dfi <- tibble(k = c(1:Ni)*22, type = "ind weighted")
# # dfi$r2 <- lapply(1:Ni, by_year_industry, df = a73, weight = TRUE) %>% unlist()
# ok, what next. try to use that info to predict? or use on bigger sample etc?
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.