R/pca_functions.R

Defines functions get_matrix icpx predict_using_factors sparse_pca icpx_pca

Documented in get_matrix icpx icpx_pca predict_using_factors sparse_pca

#' Get a matrix version of a balanced panel
#'
#' Convert a balanced panel tbl into a matrix,
#' id_var as rows, t_var as columns
#'
#' @param tbl A tbl of panel data
#'
#' @param id_var The (quoted or unquoted) name of the id variable
#'
#' @param t_var The (quoted or unquoted) name of the time variable
#'
#' @param x_var The (quoted or unquoted) name of the variable to put in the matrix
#'
#' @return A matrix
#' @keywords panel matrix
#'
#'
#' @examples
#'
#' asm <- fakedata::fake_panel(N = 50, T = 10, missing = 0.1)
#' get_matrix(asm, id, year, g)
#'
#' @export
get_matrix <- function(tbl, id_var = id, t_var = year, x_var = g_i) {
  # process variable names to handle quoted or unquoted inputs
  id_var <- dplyr::quo_name(dplyr::enquo(id_var))
  t_var <- dplyr::quo_name(dplyr::enquo(t_var))
  x_var <- dplyr::quo_name(dplyr::enquo(x_var))

  # make sure the tbl is ordered
  tbl <- tbl %>% arrange(!!rlang::sym(t_var), !!rlang::sym(id_var))

  # convert it to matrix, row-wise
  matrix(tbl[[x_var]],
         nrow = length(unique(tbl[[t_var]])),
         ncol = length(unique(tbl[[id_var]])),
         byrow = TRUE)
}

#' Calculate ICP rules for common factors
#'
#' Given a matrix, calculate ICP rules for
#' the number of common factors
#'
#' @param X A matrix
#'
#' @return A list of factor numbers calculated using
#' icp rules.
#'
#' @keywords ICP, factor analysis, PCA
#'
#'
#' @examples
#'
#' asm <- fakedata::fake_panel(N = 50, T = 10, missing = 0.1)
#' asm <- asm %>% rename(industry = I, g_i = g) %>% add_weights() %>% balance_panel(id, year)
#' X <- get_matrix(asm)
#' icpx(X)
#'
#' @export
icpx <- function(X) {
  N <- dim(X)[2] # number of individuals...wait?
  T <- dim(X)[1] # number of years
  k <- min(N, T) # maximum number of factors.

  # Perform PCA
  pca <- prcomp(X) #, scale. = F, center = F)

  # residuals
  res <- (X - pca$x[,1:k] %*% t(pca$rotation[,1:k]))

  c2nt <- min(N,T) # = k, but renamed for the purposes of the formula
  icp1 <- (N * T)^(-1) * sum(res^2) + k * (N + T) * log(N * T) / ((N * T) * log(N + T))
  icp2 <- (N * T)^(-1) * sum(res^2) + k * (N + T) * log(c2nt) / (N * T)
  icp3 <- (N * T)^(-1) * sum(res^2) + k * log(c2nt) / c2nt

  # lhatr: list of \hat{r}, estimated number of common factors
  lhatr <- purrr::map(list(icp1, icp2, icp3, k), function(x) max(1, floor(x))) %>% unique()
  names(lhatr) <- lhatr

  list(lhatr = lhatr, pca = pca) # need to pass back `pca` too?
}


#' Predict aggregate growth based on PCA decomposition
#'
#' A function that predicts an annual aggregate growth rate
#' by using a PCA decomposition of individual growth rates,
#' using a given number of common factors (estimated elsewhere).
#'
#' @param hatr The number of common factors (estimated by some other means)
#'
#' @param X A matrix of data
#'
#' @param W A matrix of weights
#'
#' @param pca The PCA decomposition of X
#'
#' @return A tbl of predictions of aggregate growth by year,
#' using the number of common factors given by hatr
#'
#' @keywords ICP, factor analysis, PCA, predict
#'
#'
#' @export
predict_using_factors <- function(hatr, X, W, pca = prcomp(X)) { # not sure if I have to pass pca or not.
  # hatr = the (estimated) number of factors.
  # X is the growth matrix
  # W is the weight matrix
  # pca may or may not be the same as before; may or may not need to pass it in, or re-calculate it.
  res_icp <- (X - pca$x[,1:hatr] %*% t(pca$rotation[, 1:hatr]))

  # element-wise multiply the residuals by the weights, then aggregate back up.
  # residuals (e.g., errors) are common across individuals, hmm.
  rowSums(res_icp * W) %>%
    dplyr::tbl_df() %>%
    tibble::rownames_to_column(var = "year")
}


#' Predict aggregate growth based on sparse PCA decomposition
#'
#' A function that predicts an annual aggregate growth rate
#' by using a PCA decomposition of individual growth rates,
#' using a sparse PCA based on glmnet.
#'
#' @param X A matrix of data
#'
#' @param W A matrix of weights
#'
#' @param pca The PCA decomposition of X
#'
#' @return A tbl of predictions of aggregate growth by year,
#' using the number of common factors given by hatr
#'
#' @keywords lasso, factor analysis, sparse PCA, predict
#'
#'
#' @export
sparse_pca <- function(X, W, pca = prcomp(X)) {
  N <- dim(X)[2] # number of firms/plants
  T <- dim(X)[1] # number of years

  pca <- prcomp(X) #, scale. = F, center = F)

  hatpci <- function(pci) {
    cvc <- glmnet::glmnet(x = X, y = pci, alpha = 0.1, intercept = FALSE)
    hatcoefi <- coef(cvc, s=cvc$lambda[length(cvc$lambda)]) # ok cool. now normalize.
    hatcoefi <- hatcoefi / ifelse(sum(hatcoefi) == 0, 1, hatcoefi^2 %>% sum() %>% sqrt())
    hatpci <- X %*% hatcoefi[-1, ] # minus the intercept, always 0.
    return(hatpci)
  }

  lx <- map(as.list(as.data.frame(pca$x)), hatpci)
  hatpc <- Reduce(cbind, lx) # if you use bind_cols, you get a df, we need a matrix.
  colnames(hatpc) = c(1:min(T, N))

  # residuals
  res_sp <- (X - hatpc %*% t(pca$rotation))

  rowSums(res_sp * W) %>%
    tbl_df() %>%
    tibble::rownames_to_column(var = "year")
}


#' A function to return PCA predictions based on ICP
#' estimated common factors
#'
#' A wrapper function that calls icpx() to estimate the number
#' of common factors, and then calls predict_using_factors()
#' to return the aggregate predictions.
#'
#' @param X A matrix of data
#'
#' @param W A matrix of weights
#'
#' @return A tbl of predictions of aggregate growth by year,
#' using the number of common factors given by hatr
#'
#' @keywords ICP, factor analysis, PCA, predict
#'
#'
#' @export
icpx_pca <- function(X, W) {
  icp <- icpx(X)
  purrr::map(icp$lhatr, predict_using_factors, X = X, W = W, pca = icp$pca)
}
tweed1e/idiosyncratics documentation built on May 29, 2019, 10:51 a.m.