R/pca.R
In h2o: R Interface for the 'H2O' Scalable Machine Learning Platform

Documented in h2o.prcomp h2o.screeplot

# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_R.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details) 
#'
# -------------------------- Principal Components Analysis -------------------------- #
#'
#' Principal component analysis of an H2O data frame
#' 
#' Principal components analysis of an H2O data frame. By default the
#' decomposition is obtained from a distributed computation of the Gram matrix
#' followed by a local SVD; see \code{pca_method} for the alternative algorithms.
#'
#' @param training_frame Id of the training data frame.
#' @param x A vector containing the \code{character} names of the predictors in the model.
#' @param model_id Destination id for this model; auto-generated if not specified.
#' @param validation_frame Id of the validation data frame.
#' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE.
#' @param score_each_iteration \code{Logical}. Whether to score during each iteration of model training. Defaults to FALSE.
#' @param transform Transformation of training data. Must be one of: "NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE".
#'        Defaults to NONE.
#' @param pca_method Specify the algorithm to use for computing the principal components: GramSVD - uses a distributed computation
#'        of the Gram matrix, followed by a local SVD; Power - computes the SVD using the power iteration method
#'        (experimental); Randomized - uses randomized subspace iteration method; GLRM - fits a generalized low-rank
#'        model with L2 loss function and no regularization and solves for the SVD using local matrix algebra
#'        (experimental). Must be one of: "GramSVD", "Power", "Randomized", "GLRM". Defaults to GramSVD.
#' @param pca_impl Specify the implementation to use for computing PCA (via SVD or EVD): MTJ_EVD_DENSEMATRIX - eigenvalue
#'        decompositions for dense matrix using MTJ; MTJ_EVD_SYMMMATRIX - eigenvalue decompositions for symmetric matrix
#'        using MTJ; MTJ_SVD_DENSEMATRIX - singular-value decompositions for dense matrix using MTJ; JAMA - eigenvalue
#'        decompositions for dense matrix using JAMA. References: JAMA - \url{http://math.nist.gov/javanumerics/jama/}; MTJ -
#'        \url{https://github.com/fommil/matrix-toolkits-java/}. Must be one of: "MTJ_EVD_DENSEMATRIX", "MTJ_EVD_SYMMMATRIX",
#'        "MTJ_SVD_DENSEMATRIX", "JAMA".
#' @param k Rank of matrix approximation. Defaults to 1.
#' @param max_iterations Maximum training iterations. Defaults to 1000.
#' @param use_all_factor_levels \code{Logical}. Whether first factor level is included in each categorical expansion. Defaults to FALSE.
#' @param compute_metrics \code{Logical}. Whether to compute metrics on the training data. Defaults to TRUE.
#' @param impute_missing \code{Logical}. Whether to impute missing entries with the column mean. Defaults to FALSE.
#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
#'        Defaults to -1 (time-based random number).
#' @param max_runtime_secs Maximum allowed runtime in seconds for model training. Use 0 to disable. Defaults to 0.
#' @param export_checkpoints_dir Automatically export generated models to this directory.
#' @return an object of class \linkS4class{H2ODimReductionModel}.
#' @seealso \code{\link{h2o.svd}}, \code{\link{h2o.glrm}}
#' @references N. Halko, P.G. Martinsson, J.A. Tropp. \href{http://arxiv.org/abs/0909.4061}{Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions}. SIAM Rev., Survey and Review section, Vol. 53, num. 2, pp. 217-288, June 2011.
#' @examples
#' \dontrun{
#' library(h2o)
#' h2o.init()
#' australia_path <- system.file("extdata", "australia.csv", package = "h2o")
#' australia <- h2o.uploadFile(path = australia_path)
#' h2o.prcomp(training_frame = australia, k = 8, transform = "STANDARDIZE")
#' }
#' @export
h2o.prcomp <- function(training_frame,
                       x,
                       model_id = NULL,
                       validation_frame = NULL,
                       ignore_const_cols = TRUE,
                       score_each_iteration = FALSE,
                       transform = c("NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"),
                       pca_method = c("GramSVD", "Power", "Randomized", "GLRM"),
                       pca_impl = c("MTJ_EVD_DENSEMATRIX", "MTJ_EVD_SYMMMATRIX", "MTJ_SVD_DENSEMATRIX", "JAMA"),
                       k = 1,
                       max_iterations = 1000,
                       use_all_factor_levels = FALSE,
                       compute_metrics = TRUE,
                       impute_missing = FALSE,
                       seed = -1,
                       max_runtime_secs = 0,
                       export_checkpoints_dir = NULL)
{
  # Resolve the frame arguments first. Each may be a key or an H2OFrame
  # object; only the training frame is required.
  training_frame <- .validate.H2OFrame(training_frame, required = TRUE)
  validation_frame <- .validate.H2OFrame(validation_frame, required = FALSE)

  # Collect the parameters for the model builder. Apart from the training
  # frame, an argument is added only when the caller supplied it; everything
  # else is left out of the parameter list.
  parms <- list()
  parms[["training_frame"]] <- training_frame

  if (!missing(x)) {
    # Forward the `cols_ignore` element that .verify_datacols computes for `x`.
    column_check <- .verify_datacols(training_frame, x)
    parms[["ignored_columns"]] <- column_check$cols_ignore
  }

  if (!missing(model_id)) {
    parms[["model_id"]] <- model_id
  }
  # NOTE: validation_frame was reassigned above, so this guard no longer
  # reflects whether the caller passed it; assigning a NULL value leaves
  # the list unchanged.
  if (!missing(validation_frame)) {
    parms[["validation_frame"]] <- validation_frame
  }
  if (!missing(ignore_const_cols)) {
    parms[["ignore_const_cols"]] <- ignore_const_cols
  }
  if (!missing(score_each_iteration)) {
    parms[["score_each_iteration"]] <- score_each_iteration
  }
  if (!missing(transform)) {
    parms[["transform"]] <- transform
  }
  if (!missing(pca_method)) {
    parms[["pca_method"]] <- pca_method
  }
  if (!missing(pca_impl)) {
    parms[["pca_impl"]] <- pca_impl
  }
  if (!missing(k)) {
    parms[["k"]] <- k
  }
  if (!missing(max_iterations)) {
    parms[["max_iterations"]] <- max_iterations
  }
  if (!missing(use_all_factor_levels)) {
    parms[["use_all_factor_levels"]] <- use_all_factor_levels
  }
  if (!missing(compute_metrics)) {
    parms[["compute_metrics"]] <- compute_metrics
  }
  if (!missing(impute_missing)) {
    parms[["impute_missing"]] <- impute_missing
  }
  if (!missing(seed)) {
    parms[["seed"]] <- seed
  }
  if (!missing(max_runtime_secs)) {
    parms[["max_runtime_secs"]] <- max_runtime_secs
  }
  if (!missing(export_checkpoints_dir)) {
    parms[["export_checkpoints_dir"]] <- export_checkpoints_dir
  }

  # Hand the parameter list to the 'pca' model job and return its result.
  pca_model <- .h2o.modelJob("pca", parms, h2oRestApiVersion = 3, verbose = FALSE)
  pca_model
}
.h2o.train_segments_prcomp <- function(training_frame,
                                       x,
                                       validation_frame = NULL,
                                       ignore_const_cols = TRUE,
                                       score_each_iteration = FALSE,
                                       transform = c("NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"),
                                       pca_method = c("GramSVD", "Power", "Randomized", "GLRM"),
                                       pca_impl = c("MTJ_EVD_DENSEMATRIX", "MTJ_EVD_SYMMMATRIX", "MTJ_SVD_DENSEMATRIX", "JAMA"),
                                       k = 1,
                                       max_iterations = 1000,
                                       use_all_factor_levels = FALSE,
                                       compute_metrics = TRUE,
                                       impute_missing = FALSE,
                                       seed = -1,
                                       max_runtime_secs = 0,
                                       export_checkpoints_dir = NULL,
                                       segment_columns = NULL,
                                       segment_models_id = NULL,
                                       parallelism = 1)
{
  # Placeholders for names that are not parameters of this function; they are
  # defined here as NULL and not otherwise used in this body.
  model_id <- NULL
  verbose <- NULL
  destination_key <- NULL

  # Resolve the frame arguments first. Each may be a key or an H2OFrame
  # object; only the training frame is required.
  training_frame <- .validate.H2OFrame(training_frame, required = TRUE)
  validation_frame <- .validate.H2OFrame(validation_frame, required = FALSE)

  # Collect the model parameters. Apart from the training frame, an argument
  # is added only when the caller supplied it.
  parms <- list()
  parms[["training_frame"]] <- training_frame

  if (!missing(x)) {
    # Forward the `cols_ignore` element that .verify_datacols computes for `x`.
    column_check <- .verify_datacols(training_frame, x)
    parms[["ignored_columns"]] <- column_check$cols_ignore
  }

  # NOTE: validation_frame was reassigned above, so this guard no longer
  # reflects whether the caller passed it; assigning a NULL value leaves
  # the list unchanged.
  if (!missing(validation_frame)) {
    parms[["validation_frame"]] <- validation_frame
  }
  if (!missing(ignore_const_cols)) {
    parms[["ignore_const_cols"]] <- ignore_const_cols
  }
  if (!missing(score_each_iteration)) {
    parms[["score_each_iteration"]] <- score_each_iteration
  }
  if (!missing(transform)) {
    parms[["transform"]] <- transform
  }
  if (!missing(pca_method)) {
    parms[["pca_method"]] <- pca_method
  }
  if (!missing(pca_impl)) {
    parms[["pca_impl"]] <- pca_impl
  }
  if (!missing(k)) {
    parms[["k"]] <- k
  }
  if (!missing(max_iterations)) {
    parms[["max_iterations"]] <- max_iterations
  }
  if (!missing(use_all_factor_levels)) {
    parms[["use_all_factor_levels"]] <- use_all_factor_levels
  }
  if (!missing(compute_metrics)) {
    parms[["compute_metrics"]] <- compute_metrics
  }
  if (!missing(impute_missing)) {
    parms[["impute_missing"]] <- impute_missing
  }
  if (!missing(seed)) {
    parms[["seed"]] <- seed
  }
  if (!missing(max_runtime_secs)) {
    parms[["max_runtime_secs"]] <- max_runtime_secs
  }
  if (!missing(export_checkpoints_dir)) {
    parms[["export_checkpoints_dir"]] <- export_checkpoints_dir
  }

  # Segment-specific settings travel in their own list. `parallelism` is
  # always included; the other two only when supplied by the caller.
  segment_parms <- list()
  if (!missing(segment_columns)) {
    segment_parms[["segment_columns"]] <- segment_columns
  }
  if (!missing(segment_models_id)) {
    segment_parms[["segment_models_id"]] <- segment_models_id
  }
  segment_parms[["parallelism"]] <- parallelism

  # Hand both lists to the 'pca' segment-models job and return its result.
  trained_segments <- .h2o.segmentModelsJob("pca", segment_parms, parms, h2oRestApiVersion = 3)
  trained_segments
}


.h2o.fill_pca <- function(model, parameters, allparams) {
    # Expose the model's `importance` entry under the additional name
    # `variable_importances`. `parameters` and `allparams` are accepted but
    # not used in this body.
    importance_entry <- model$importance
    model$variable_importances <- importance_entry
    model
}

#' Scree Plot
#'
#' Draws the first column of the transposed \code{importance} table stored in
#' the model, either as a bar chart or as a dashed line.
#'
#' @param model  A PCA model
#' @param type  Type of the plot. Either "barplot" or "lines".
#' @export
h2o.screeplot <- function(model, type=c("barplot", "lines")) {
    type <- match.arg(type)
    # Both plot variants draw the same values: column 1 of the transposed
    # importance table.
    scree_values <- t(model@model$importance)[,1]
    switch(type,
        barplot = graphics::barplot(scree_values, xlab = "Components", ylab = "Variances",
                                    main = "Scree Plot"),
        lines = graphics::plot(scree_values, xlab = "Components", ylab = "Variances",
                               main = "Scree Plot", type = "l", lty = "dashed",
                               col = "blue", lwd = 2)
    )
}