data-raw/decompose_pca.R

# decompose using PCA.

get_matrix <- function(df) {
  X <- df %>%
    ungroup() %>%
    select(id, year, git) %>%
    spread(key = year, value = git) %>%
    arrange(id) %>%
    select(-id) %>%
    as.matrix() %>%
    t()
  return(X)
}

process_df_for_pca <- function(df) {

  balanced <- df %>%
    group_by(id, industry) %>%
    count() %>%
    ungroup() %>%
    filter(n==max(n)) %>%
    select(-n)

  df_processed <- df %>%
    inner_join(balanced) %>%
    arrange(id, year)

  # return matrix. but also need to return id_df and dfw.
  return(df_processed)
}

decompose_pca <- function(df) {

  # refactor this so it goes 1. process/convert to matrix, 2. pca and return df, 3. lasso and return df.
  library(glmnet)
  library(stringr)

  # df <- a73 # for testing

  df_p <- process_df_for_pca(df)
  # get ids
  id_df <- df_p %>% ungroup() %>% distinct(id) %>% tibble::rownames_to_column(var="n_id")
  dfw <- df_p %>% left_join(id_df) %>% select(n_id, l.wit, year) #%>% # should I re-normalize the weights here?
        # group_by(year) %>% mutate(l.wit = l.wit/sum(l.wit, na.rm = TRUE))
  X <- get_matrix(df_p)

  dfx <- icpx_pca(X, dfw)
  e_sp <- sparse_pca(X, dfw)
  # combine.
  # and another function that rbinds all of them together.
  pca_df_l <- dplyr::bind_rows(dfx, e_sp) #Reduce(rbind, list(dfx, e_sp))
  return(pca_df_l)
}
# one that does pre PCA?
# then one to do all the factors. and return the dataset.

icpx_pca <- function(X, dfw) {
  N <- dim(X)[2] # number of firms/plants
  T <- dim(X)[1] # number of years
  k <- min(N, T) # maximum number of factors.

  # Perform PCA
  myPCA <- prcomp(X) #, scale. = F, center = F)

  res <- (X - myPCA$x[,1:k] %*% t(myPCA$rotation[,1:k])) %>% t() %>% tbl_df()

  C2NT <- min(N,T) # = k
  ICp1 <- (N * T)^(-1) * sum(res^2) + k * (N + T) * log(N * T) / ((N * T) * log(N + T))
  ICp2 <- (N * T)^(-1) * sum(res^2) + k * (N + T) * log(C2NT) / (N * T)
  ICp3 <- (N * T)^(-1) * sum(res^2) + k * log(C2NT) / C2NT

  lhatr <- purrr::map(c(ICp1, ICp2, ICp3, min(N,T)), function(x) max(1, floor(x))) %>% unique()

  # end of a function that returns the normal df of PCA results of different factors.
  dfx <- dplyr::bind_rows(purrr::map(lhatr, predict_using_factors, Xm = X, pca = myPCA, dfw = dfw))
  # dfx <- Reduce(rbind, lapply(X = lhatr, FUN = predict_using_factors, Xm = X, pca = myPCA, dfw = dfw))
  dfx
}

# hatr stands for the estimated number of factors.
# pass the original data X, the pca myPCA, and weights dfw?
predict_using_factors <- function(hatr, Xm, pca, dfw) { # not sure if I have to pass data or not.
  # hatr = the (estimated) number of factors.
  res_icp <- (Xm - pca$x[,1:hatr] %*% t(pca$rotation[, 1:hatr])) %>% t() %>% tbl_df()
  # res_icp <- (pca$x[, 1:hatr] %*% t(pca$rotation[, 1:hatr]))
  # rownames(res_icp) <- rownames(pca$x)
  # res_icp <- res_icp %>% t() %>% tbl_df()

  e_icp <- res_icp %>%
    tbl_df() %>%
    tibble::rownames_to_column(var = "n_id") %>% # id will be different.
    gather(year, v, -n_id) %>%
    # mutate(year = year %>% as.factor()) %>%
    left_join(dfw, by = c("n_id", "year")) %>% # merge on weights.
    group_by(year) %>%
    summarize(value = sum(l.wit * v, na.rm = TRUE)) %>%
    mutate(type = hatr %>% as.character() %>% str_pad(width = 2))
  return(e_icp)
}

# return factors; still don't know what to do with this. you still have to use the weights to get back to the original?
# return_factors <- function(hatr, Xm, pca)  {
#   res_icp <- (Xm - pca$x[,1:hatr] %*% t(pca$rotation[, 1:hatr])) %>% t() %>% tbl_df()
# }


sparse_pca <- function(X, dfw) {

  N <- dim(X)[2] # number of firms/plants
  T <- dim(X)[1] # number of years

  myPCA <- prcomp(X) #, scale. = F, center = F)
  # for glmnet, pass in original PCA?
  pc <- myPCA$x # for glmnet

  hatpci <- function(pci) {
    cvc <- glmnet(x = X, y = pci, alpha = 0.1, intercept = FALSE)
    hatcoefi <- coef(cvc, s=cvc$lambda[length(cvc$lambda)]) # ok cool. now normalize.
    # cvc <- cv.glmnet(x = X, y = pci, alpha = 0.0, intercept = FALSE)
    # hatcoefi <- coef(cvc, s="lambda.min")
    hatcoefi <- hatcoefi / ifelse(sum(hatcoefi) == 0, 1, hatcoefi^2 %>% sum() %>% sqrt())
    hatpci <- X %*% hatcoefi[-1,] # minus the intercept, always 0.
    return(hatpci)
  }

  lx <- lapply(as.list(as.data.frame(pc)), hatpci)
  hatpc <- Reduce(cbind, lx)
  colnames(hatpc) = c(1:min(T, N))

  res_sp <- (X - hatpc[,1:min(N, T)] %*% t(myPCA$rotation[,1:min(N, T)])) %>% t() %>% tbl_df()

  e_sp <- res_sp %>%
    tbl_df() %>%
    tibble::rownames_to_column(var = "n_id") %>% # id will be different.
    gather(year, v, -n_id) %>%
    # mutate(year = year %>% as.factor()) %>%
    left_join(dfw, by=c("n_id", "year")) %>% # merge on weights.
    group_by(year) %>%
    summarize(value = sum(l.wit * v, na.rm = TRUE)) %>%
    mutate(type = "e_sp")
  return(e_sp)
}
tweed1e/idiosyncratics documentation built on May 29, 2019, 10:51 a.m.