R/pca.R

library(ggplot2)

#' Compute Prcomp objects for a data frame with many feature columns.
#'
#' @param feats_df data.frame with cols img_id, chars/cats (discarded), and numeric feats
#' @param top_n integer number of rows to randomly subsample for faster training
#' @param ... additional args to pass to stats::prcomp.
#'
#' @return prcomp object \code{\link[stats]{prcomp}}
#' @family pca
#' @export
#' @examples 
#' fit_prcomp(replSicate(10, rnorm(20)), id_coln=NULL)
#' fit_prcomp(mtcars, id_coln="mpg")
fit_prcomp <- function(feats_df, n_samples=nrow(feats_df), id_coln="img_id", ...) {
    feats_df <- as.data.frame(feats_df)
    stopifnot(is.data.frame(feats_df),
              id_coln %in% names(feats_df),
              nrow(feats_df) >= 2)  # if not, svd will have a zero-dimension

    # Subset rows
    if (n_samples < nrow(feats_df)) {feats_df <- sample_n(feats_df, n_samples)}

    # Filter cols for just numerics; convert to mtx with img_id rownames
    img_ids <- feats_df$img_id  # Prepend later in prcomp retx product
    mtx <- feats_df %>%
        purrr::keep(.p=is.numeric) %>%  # numeric only
        as.matrix
    nzv <- caret::nearZeroVar(mtx)  # Remove any zero var predictors before PCA
    if (length(nzv) > 0) {
        warning("removing nero zero variance cols: ", names(mtx)[nzv])
        mtx <- mtx[, -nzv]
    }

    # Compute prcomp object ----------------------
    prcomp <- stats::prcomp(mtx, ...)

    rownames(prcomp$x) <- img_ids  # Reappend cols to train data copy .$x
    
    stopifnot(prcomp$x %>% dim %>% `[`(1) == n_samples)
    return(prcomp)
}


#' Apply a fit prcomp's rotation onto a new data frame
#'
#' @param prcomp prcomp object with fit rotation to apply
#' @param newdata_df df with 'img_id'
#'
#' @return data.frame with same number of rows as newdata, and ncol
#'  equal to the rank of the fit prcomp
#' @export
predict_pca <- function(prcomp, newdata_df, id_coln="img_id") {
    stopifnot(is.data.frame(newdata_df))
    mtx <- newdata_df %>% 
            tibble::remove_rownames() %>% 
            tibble::column_to_rownames(var=id_coln) %>% 
            as.matrix()
    pcs <- predict(prcomp, newdata = mtx) %>%
        as.data.frame %>%
        tibble::rownames_to_column(var=id_coln)
    return(pcs)
}


#' extract data for a cumulative variance explained figure
#'
#' @param prcompObjFit prcomp object
#'
#' @return ggplot object
#' @export
#' @family pca
#'
#' @examples
#' ggplot(gg_data_cve(fitPrcomp), aes(x=x, y=y)) + gg_cve_style
gg_data_cve <- function(prcompObjFit) {
    stopifnot(class(prcompObjFit == "prcomp"))
    pc.var <- prcompObjFit$sdev^2
    pve <- pc.var/sum(pc.var)
    pve %>%
        `[`(1:10) %>%
        cumsum() %>%
        enframe(name="x", value="y")
}

gg_style_cve <- list(geom_point(),
                     geom_line(),
                     labs(x = "Number of Principal Components",
                          y = "Cumulative Variance Explained"),
                     coord_cartesian(ylim=c(0, 1), expand=FALSE),
                     scale_color_manual("Inception Featurizer", values=darks),
                     theme(panel.grid.major=element_line(colour="gray", size=0.5)),
                     scale_x_continuous(breaks = seq(0, 10, 2))
)


gg_scatter <- function(prcompObjFit) {
    ids <- prcompObjFit$ids
    scat_dat <- prcompObjFit$x %>%
        as.data.frame %>%
        select(x=PC1, y=PC2) %>%
        add_column(ids=ids)
}

# GG Components
gg_scat_style <- list(labs(x="1st Principal Component",
                           y="2nd Principal Component"),
                      theme(axis.text = element_blank(),
                            axis.ticks = element_blank()))
mbadge/AnalysisToolkitR documentation built on May 27, 2019, 1:08 p.m.