R/pca_tools.R

tidy_pca <- function(data, scale = TRUE, show_plot = TRUE) {
    
    if ("FALSE" %in% purrr::map_lgl(data, is.numeric))
        stop("All columns must be numeric for PCA")
    
    if (scale == TRUE) {
        pca <- scale(data) %>% princomp()
    } else {
        pca <- princomp(data)
    }
    
    eigs <- pca$sdev^2
    
    var_explained <- eigs / sum(eigs)
    
    pca_summary <- tibble::tibble(component = seq_along(eigs),
                              eigen_values = eigs,
                              var_explained,
                              total_explained = cumsum(var_explained))
    
    pca_loadings <- pca$loadings %>% 
        as.numeric() %>% 
        matrix(., ncol = ncol(data)) %>%
        tibble::as_tibble() %>%
        setNames(., nm = paste0("Comp.", 1:ncol(data))) %>%
        mutate(variable = colnames(data)) %>%
        select(variable, dplyr::everything())
    
    pca_output <- list(
        pca_summary = pca_summary,
        pca_loadings = pca_loadings,
        pca_scores = pca$scores %>% tibble::as_tibble()
    )
    
    if (show_plot == TRUE) {
        
        eigen_plot <- ggplot2::ggplot(pca_output$pca_summary, 
                                      aes(x = component, y = eigen_values)) +
            ggplot2::geom_point() +
            ggplot2::geom_line() +
            ggplot2::labs(title = "Eigenvalues",
                         x = "Components",
                         y = "") +
            ggplot2::scale_y_continuous(breaks = scales::pretty_breaks(n = 10))
        
        cumvar_plot <- ggplot2::ggplot(pca_output$pca_summary, 
                                       aes(x = component, y = total_explained)) +
            ggplot2::geom_point() +
            ggplot2::geom_line() +
            ggplot2::labs(title = "Cumulative Variance Explained",
                          x = "Components",
                          y = "") +
            ggplot2::scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) 
        
        gridExtra::grid.arrange(eigen_plot, cumvar_plot, ncol=2)
        
    }
    
    return(pca_output)
}
dannymorris/smltools documentation built on May 15, 2019, 10:49 a.m.