# decompose using PCA.
get_matrix <- function(df) {
X <- df %>%
ungroup() %>%
select(id, year, git) %>%
spread(key = year, value = git) %>%
arrange(id) %>%
select(-id) %>%
as.matrix() %>%
t()
return(X)
}
process_df_for_pca <- function(df) {
balanced <- df %>%
group_by(id, industry) %>%
count() %>%
ungroup() %>%
filter(n==max(n)) %>%
select(-n)
df_processed <- df %>%
inner_join(balanced) %>%
arrange(id, year)
# return matrix. but also need to return id_df and dfw.
return(df_processed)
}
decompose_pca <- function(df) {
# refactor this so it goes 1. process/convert to matrix, 2. pca and return df, 3. lasso and return df.
library(glmnet)
library(stringr)
# df <- a73 # for testing
df_p <- process_df_for_pca(df)
# get ids
id_df <- df_p %>% ungroup() %>% distinct(id) %>% tibble::rownames_to_column(var="n_id")
dfw <- df_p %>% left_join(id_df) %>% select(n_id, l.wit, year) #%>% # should I re-normalize the weights here?
# group_by(year) %>% mutate(l.wit = l.wit/sum(l.wit, na.rm = TRUE))
X <- get_matrix(df_p)
dfx <- icpx_pca(X, dfw)
e_sp <- sparse_pca(X, dfw)
# combine.
# and another function that rbinds all of them together.
pca_df_l <- dplyr::bind_rows(dfx, e_sp) #Reduce(rbind, list(dfx, e_sp))
return(pca_df_l)
}
# one that does pre PCA?
# then one to do all the factors. and return the dataset.
icpx_pca <- function(X, dfw) {
N <- dim(X)[2] # number of firms/plants
T <- dim(X)[1] # number of years
k <- min(N, T) # maximum number of factors.
# Perform PCA
myPCA <- prcomp(X) #, scale. = F, center = F)
res <- (X - myPCA$x[,1:k] %*% t(myPCA$rotation[,1:k])) %>% t() %>% tbl_df()
C2NT <- min(N,T) # = k
ICp1 <- (N * T)^(-1) * sum(res^2) + k * (N + T) * log(N * T) / ((N * T) * log(N + T))
ICp2 <- (N * T)^(-1) * sum(res^2) + k * (N + T) * log(C2NT) / (N * T)
ICp3 <- (N * T)^(-1) * sum(res^2) + k * log(C2NT) / C2NT
lhatr <- purrr::map(c(ICp1, ICp2, ICp3, min(N,T)), function(x) max(1, floor(x))) %>% unique()
# end of a function that returns the normal df of PCA results of different factors.
dfx <- dplyr::bind_rows(purrr::map(lhatr, predict_using_factors, Xm = X, pca = myPCA, dfw = dfw))
# dfx <- Reduce(rbind, lapply(X = lhatr, FUN = predict_using_factors, Xm = X, pca = myPCA, dfw = dfw))
dfx
}
# hatr stands for the estimated number of factors.
# pass the original data X, the pca myPCA, and weights dfw?
predict_using_factors <- function(hatr, Xm, pca, dfw) { # not sure if I have to pass data or not.
# hatr = the (estimated) number of factors.
res_icp <- (Xm - pca$x[,1:hatr] %*% t(pca$rotation[, 1:hatr])) %>% t() %>% tbl_df()
# res_icp <- (pca$x[, 1:hatr] %*% t(pca$rotation[, 1:hatr]))
# rownames(res_icp) <- rownames(pca$x)
# res_icp <- res_icp %>% t() %>% tbl_df()
e_icp <- res_icp %>%
tbl_df() %>%
tibble::rownames_to_column(var = "n_id") %>% # id will be different.
gather(year, v, -n_id) %>%
# mutate(year = year %>% as.factor()) %>%
left_join(dfw, by = c("n_id", "year")) %>% # merge on weights.
group_by(year) %>%
summarize(value = sum(l.wit * v, na.rm = TRUE)) %>%
mutate(type = hatr %>% as.character() %>% str_pad(width = 2))
return(e_icp)
}
# return factors; still don't know what to do with this. you still have to use the weights to get back to the original?
# return_factors <- function(hatr, Xm, pca) {
# res_icp <- (Xm - pca$x[,1:hatr] %*% t(pca$rotation[, 1:hatr])) %>% t() %>% tbl_df()
# }
sparse_pca <- function(X, dfw) {
N <- dim(X)[2] # number of firms/plants
T <- dim(X)[1] # number of years
myPCA <- prcomp(X) #, scale. = F, center = F)
# for glmnet, pass in original PCA?
pc <- myPCA$x # for glmnet
hatpci <- function(pci) {
cvc <- glmnet(x = X, y = pci, alpha = 0.1, intercept = FALSE)
hatcoefi <- coef(cvc, s=cvc$lambda[length(cvc$lambda)]) # ok cool. now normalize.
# cvc <- cv.glmnet(x = X, y = pci, alpha = 0.0, intercept = FALSE)
# hatcoefi <- coef(cvc, s="lambda.min")
hatcoefi <- hatcoefi / ifelse(sum(hatcoefi) == 0, 1, hatcoefi^2 %>% sum() %>% sqrt())
hatpci <- X %*% hatcoefi[-1,] # minus the intercept, always 0.
return(hatpci)
}
lx <- lapply(as.list(as.data.frame(pc)), hatpci)
hatpc <- Reduce(cbind, lx)
colnames(hatpc) = c(1:min(T, N))
res_sp <- (X - hatpc[,1:min(N, T)] %*% t(myPCA$rotation[,1:min(N, T)])) %>% t() %>% tbl_df()
e_sp <- res_sp %>%
tbl_df() %>%
tibble::rownames_to_column(var = "n_id") %>% # id will be different.
gather(year, v, -n_id) %>%
# mutate(year = year %>% as.factor()) %>%
left_join(dfw, by=c("n_id", "year")) %>% # merge on weights.
group_by(year) %>%
summarize(value = sum(l.wit * v, na.rm = TRUE)) %>%
mutate(type = "e_sp")
return(e_sp)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.