library(ggplot2)
#' Compute Prcomp objects for a data frame with many feature columns.
#'
#' @param feats_df data.frame with cols img_id, chars/cats (discarded), and numeric feats
#' @param top_n integer number of rows to randomly subsample for faster training
#' @param ... additional args to pass to stats::prcomp.
#'
#' @return prcomp object \code{\link[stats]{prcomp}}
#' @family pca
#' @export
#' @examples
#' fit_prcomp(replSicate(10, rnorm(20)), id_coln=NULL)
#' fit_prcomp(mtcars, id_coln="mpg")
fit_prcomp <- function(feats_df, n_samples=nrow(feats_df), id_coln="img_id", ...) {
feats_df <- as.data.frame(feats_df)
stopifnot(is.data.frame(feats_df),
id_coln %in% names(feats_df),
nrow(feats_df) >= 2) # if not, svd will have a zero-dimension
# Subset rows
if (n_samples < nrow(feats_df)) {feats_df <- sample_n(feats_df, n_samples)}
# Filter cols for just numerics; convert to mtx with img_id rownames
img_ids <- feats_df$img_id # Prepend later in prcomp retx product
mtx <- feats_df %>%
purrr::keep(.p=is.numeric) %>% # numeric only
as.matrix
nzv <- caret::nearZeroVar(mtx) # Remove any zero var predictors before PCA
if (length(nzv) > 0) {
warning("removing nero zero variance cols: ", names(mtx)[nzv])
mtx <- mtx[, -nzv]
}
# Compute prcomp object ----------------------
prcomp <- stats::prcomp(mtx, ...)
rownames(prcomp$x) <- img_ids # Reappend cols to train data copy .$x
stopifnot(prcomp$x %>% dim %>% `[`(1) == n_samples)
return(prcomp)
}
#' Apply a fit prcomp's rotation onto a new data frame
#'
#' @param prcomp prcomp object with fit rotation to apply
#' @param newdata_df df with 'img_id'
#'
#' @return data.frame with same number of rows as newdata, and ncol
#' equal to the rank of the fit prcomp
#' @export
predict_pca <- function(prcomp, newdata_df, id_coln="img_id") {
stopifnot(is.data.frame(newdata_df))
mtx <- newdata_df %>%
tibble::remove_rownames() %>%
tibble::column_to_rownames(var=id_coln) %>%
as.matrix()
pcs <- predict(prcomp, newdata = mtx) %>%
as.data.frame %>%
tibble::rownames_to_column(var=id_coln)
return(pcs)
}
#' extract data for a cumulative variance explained figure
#'
#' @param prcompObjFit prcomp object
#'
#' @return ggplot object
#' @export
#' @family pca
#'
#' @examples
#' ggplot(gg_data_cve(fitPrcomp), aes(x=x, y=y)) + gg_cve_style
gg_data_cve <- function(prcompObjFit) {
stopifnot(class(prcompObjFit == "prcomp"))
pc.var <- prcompObjFit$sdev^2
pve <- pc.var/sum(pc.var)
pve %>%
`[`(1:10) %>%
cumsum() %>%
enframe(name="x", value="y")
}
gg_style_cve <- list(geom_point(),
geom_line(),
labs(x = "Number of Principal Components",
y = "Cumulative Variance Explained"),
coord_cartesian(ylim=c(0, 1), expand=FALSE),
scale_color_manual("Inception Featurizer", values=darks),
theme(panel.grid.major=element_line(colour="gray", size=0.5)),
scale_x_continuous(breaks = seq(0, 10, 2))
)
gg_scatter <- function(prcompObjFit) {
ids <- prcompObjFit$ids
scat_dat <- prcompObjFit$x %>%
as.data.frame %>%
select(x=PC1, y=PC2) %>%
add_column(ids=ids)
}
# GG Components
gg_scat_style <- list(labs(x="1st Principal Component",
y="2nd Principal Component"),
theme(axis.text = element_blank(),
axis.ticks = element_blank()))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.