R/glrm.R
In h2o: R Interface for the 'H2O' Scalable Machine Learning Platform

Documented in h2o.glrm h2o.proj_archetypes h2o.reconstruct

# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_R.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details) 
#'
# -------------------------- Generalized Low Rank Model -------------------------- #
#'
#' Generalized low rank decomposition of an H2O data frame
#' 
#' Builds a generalized low rank decomposition of an H2O data frame
#'
#' @param training_frame Id of the training data frame.
#' @param cols (Optional) A vector containing the data columns on which k-means operates.
#' @param model_id Destination id for this model; auto-generated if not specified.
#' @param validation_frame Id of the validation data frame.
#' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE.
#' @param score_each_iteration \code{Logical}. Whether to score during each iteration of model training. Defaults to FALSE.
#' @param representation_name Frame key to save resulting X
#' @param loading_name [Deprecated] Use representation_name instead.  Frame key to save resulting X.
#' @param transform Transformation of training data Must be one of: "NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE".
#'        Defaults to NONE.
#' @param k Rank of matrix approximation Defaults to 1.
#' @param loss Numeric loss function Must be one of: "Quadratic", "Absolute", "Huber", "Poisson", "Hinge", "Logistic",
#'        "Periodic". Defaults to Quadratic.
#' @param loss_by_col Loss function by column (override) Must be one of: "Quadratic", "Absolute", "Huber", "Poisson", "Hinge",
#'        "Logistic", "Periodic", "Categorical", "Ordinal".
#' @param loss_by_col_idx Loss function by column index (override)
#' @param multi_loss Categorical loss function Must be one of: "Categorical", "Ordinal". Defaults to Categorical.
#' @param period Length of period (only used with periodic loss function) Defaults to 1.
#' @param regularization_x Regularization function for X matrix Must be one of: "None", "Quadratic", "L2", "L1", "NonNegative",
#'        "OneSparse", "UnitOneSparse", "Simplex". Defaults to None.
#' @param regularization_y Regularization function for Y matrix Must be one of: "None", "Quadratic", "L2", "L1", "NonNegative",
#'        "OneSparse", "UnitOneSparse", "Simplex". Defaults to None.
#' @param gamma_x Regularization weight on X matrix Defaults to 0.
#' @param gamma_y Regularization weight on Y matrix Defaults to 0.
#' @param max_iterations Maximum number of iterations Defaults to 1000.
#' @param max_updates Maximum number of updates, defaults to 2*max_iterations Defaults to 2000.
#' @param init_step_size Initial step size Defaults to 1.
#' @param min_step_size Minimum step size Defaults to 0.0001.
#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
#'        Defaults to -1 (time-based random number).
#' @param init Initialization mode Must be one of: "Random", "SVD", "PlusPlus", "User". Defaults to PlusPlus.
#' @param svd_method Method for computing SVD during initialization (Caution: Randomized is currently experimental and unstable)
#'        Must be one of: "GramSVD", "Power", "Randomized". Defaults to Randomized.
#' @param user_y User-specified initial Y
#' @param user_x User-specified initial X
#' @param expand_user_y \code{Logical}. Expand categorical columns in user-specified initial Y Defaults to TRUE.
#' @param impute_original \code{Logical}. Reconstruct original training data by reversing transform Defaults to FALSE.
#' @param recover_svd \code{Logical}. Recover singular values and eigenvectors of XY Defaults to FALSE.
#' @param max_runtime_secs Maximum allowed runtime in seconds for model training. Use 0 to disable. Defaults to 0.
#' @param export_checkpoints_dir Automatically export generated models to this directory.
#' @return an object of class \linkS4class{H2ODimReductionModel}.
#' @seealso \code{\link{h2o.kmeans}, \link{h2o.svd}}, \code{\link{h2o.prcomp}}
#' @references M. Udell, C. Horn, R. Zadeh, S. Boyd (2014). {Generalized Low Rank Models}[https://arxiv.org/abs/1410.0342]. Unpublished manuscript, Stanford Electrical Engineering Department.
#'             N. Halko, P.G. Martinsson, J.A. Tropp. {Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions}[https://arxiv.org/abs/0909.4061]. SIAM Rev., Survey and Review section, Vol. 53, num. 2, pp. 217-288, June 2011.
#' @examples
#' \dontrun{
#' library(h2o)
#' h2o.init()
#' australia_path <- system.file("extdata", "australia.csv", package = "h2o")
#' australia <- h2o.uploadFile(path = australia_path)
#' h2o.glrm(training_frame = australia, k = 5, loss = "Quadratic", regularization_x = "L1",
#'          gamma_x = 0.5, gamma_y = 0, max_iterations = 1000)
#' }
#' @export
h2o.glrm <- function(training_frame,
                     cols = NULL,
                     model_id = NULL,
                     validation_frame = NULL,
                     ignore_const_cols = TRUE,
                     score_each_iteration = FALSE,
                     representation_name = NULL,
                     loading_name = NULL,
                     transform = c("NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"),
                     k = 1,
                     loss = c("Quadratic", "Absolute", "Huber", "Poisson", "Hinge", "Logistic", "Periodic"),
                     loss_by_col = c("Quadratic", "Absolute", "Huber", "Poisson", "Hinge", "Logistic", "Periodic", "Categorical", "Ordinal"),
                     loss_by_col_idx = NULL,
                     multi_loss = c("Categorical", "Ordinal"),
                     period = 1,
                     regularization_x = c("None", "Quadratic", "L2", "L1", "NonNegative", "OneSparse", "UnitOneSparse", "Simplex"),
                     regularization_y = c("None", "Quadratic", "L2", "L1", "NonNegative", "OneSparse", "UnitOneSparse", "Simplex"),
                     gamma_x = 0,
                     gamma_y = 0,
                     max_iterations = 1000,
                     max_updates = 2000,
                     init_step_size = 1,
                     min_step_size = 0.0001,
                     seed = -1,
                     init = c("Random", "SVD", "PlusPlus", "User"),
                     svd_method = c("GramSVD", "Power", "Randomized"),
                     user_y = NULL,
                     user_x = NULL,
                     expand_user_y = TRUE,
                     impute_original = FALSE,
                     recover_svd = FALSE,
                     max_runtime_secs = 0,
                     export_checkpoints_dir = NULL)
{
  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)
  validation_frame <- .validate.H2OFrame(validation_frame, required=FALSE)

  # Build parameter list to send to model builder
  parms <- list()
  parms$training_frame <- training_frame
  if(!missing(cols))
    parms$ignored_columns <- .verify_datacols(training_frame, cols)$cols_ignore  

  if (!missing(model_id))
    parms$model_id <- model_id
  if (!missing(validation_frame))
    parms$validation_frame <- validation_frame
  if (!missing(ignore_const_cols))
    parms$ignore_const_cols <- ignore_const_cols
  if (!missing(score_each_iteration))
    parms$score_each_iteration <- score_each_iteration
  if (!missing(representation_name))
    parms$representation_name <- representation_name
  if (!missing(loading_name))
    parms$loading_name <- loading_name
  if (!missing(transform))
    parms$transform <- transform
  if (!missing(k))
    parms$k <- k
  if(!missing(loss)) {
    if(loss == "MeanSquare") {
      warning("Loss name 'MeanSquare' is deprecated; please use 'Quadratic' instead.")
      parms$loss <- "Quadratic"
    } else 
      parms$loss <- loss
  }
  if (!missing(loss_by_col))
    parms$loss_by_col <- loss_by_col
  if (!missing(loss_by_col_idx))
    parms$loss_by_col_idx <- loss_by_col_idx
  if (!missing(multi_loss))
    parms$multi_loss <- multi_loss
  if (!missing(period))
    parms$period <- period
  if (!missing(regularization_x))
    parms$regularization_x <- regularization_x
  if (!missing(regularization_y))
    parms$regularization_y <- regularization_y
  if (!missing(gamma_x))
    parms$gamma_x <- gamma_x
  if (!missing(gamma_y))
    parms$gamma_y <- gamma_y
  if (!missing(max_iterations))
    parms$max_iterations <- max_iterations
  if (!missing(max_updates))
    parms$max_updates <- max_updates
  if (!missing(init_step_size))
    parms$init_step_size <- init_step_size
  if (!missing(min_step_size))
    parms$min_step_size <- min_step_size
  if (!missing(seed))
    parms$seed <- seed
  if (!missing(init))
    parms$init <- init
  if (!missing(svd_method))
    parms$svd_method <- svd_method
  if (!missing(user_y))
    parms$user_y <- user_y
  if (!missing(user_x))
    parms$user_x <- user_x
  if (!missing(expand_user_y))
    parms$expand_user_y <- expand_user_y
  if (!missing(impute_original))
    parms$impute_original <- impute_original
  if (!missing(recover_svd))
    parms$recover_svd <- recover_svd
  if (!missing(max_runtime_secs))
    parms$max_runtime_secs <- max_runtime_secs
  if (!missing(export_checkpoints_dir))
    parms$export_checkpoints_dir <- export_checkpoints_dir

  # Check if user_y is an acceptable set of user-specified starting points
  if( is.data.frame(user_y) || is.matrix(user_y) || is.list(user_y) || is.H2OFrame(user_y) ) {
    # Convert user-specified starting points to H2OFrame
    if( is.data.frame(user_y) || is.matrix(user_y) || is.list(user_y) ) {
      if( !is.data.frame(user_y) && !is.matrix(user_y) ) user_y <- t(as.data.frame(user_y))
      user_y <- as.h2o(user_y)
    }
    parms[["user_y"]] <- user_y

    # Set k
    if( !(missing(k)) && k!=as.integer(nrow(user_y)) ) {
      warning("Argument k is not equal to the number of rows in user-specified Y. Ignoring k. Using specified Y.")
    }
    if ( !missing(loading_name)) {
      warning("Argument loading_name is deprecated.  Use representation_name instead.")
    }
    parms[["k"]] <- as.numeric(nrow(user_y))
  # } else if( is.null(user_y) ) {
  #  if(!missing(init) && parms[["init"]] == "User")
  #    warning("Initializing Y to a standard Gaussian random matrix.")
  # } else
  } else if( !is.null(user_y) )
    stop("Argument user_y must either be null or a valid user-defined starting Y matrix.")

  # Check if user_x is an acceptable set of user-specified starting points
  if( is.data.frame(user_x) || is.matrix(user_x) || is.list(user_x) || is.H2OFrame(user_x) ) {
    # Convert user-specified starting points to H2OFrame
    if( is.data.frame(user_x) || is.matrix(user_x) || is.list(user_x) ) {
      if( !is.data.frame(user_x) && !is.matrix(user_x) ) user_x <- t(as.data.frame(user_x))
      user_x <- as.h2o(user_x)
    }
    parms[["user_x"]] <- user_x
  # } else if( is.null(user_x) ) {
  #  if(!missing(init) && parms[["init"]] == "User")
  #    warning("Initializing X to a standard Gaussian random matrix.")
  # } else
  } else if( !is.null(user_x) )
    stop("Argument user_x must either be null or a valid user-defined starting X matrix.")

  # Error check and build model
  model <- .h2o.modelJob('glrm', parms, h2oRestApiVersion=3, verbose=FALSE)
  return(model)
}
.h2o.train_segments_glrm <- function(training_frame,
                                     cols = NULL,
                                     validation_frame = NULL,
                                     ignore_const_cols = TRUE,
                                     score_each_iteration = FALSE,
                                     representation_name = NULL,
                                     loading_name = NULL,
                                     transform = c("NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"),
                                     k = 1,
                                     loss = c("Quadratic", "Absolute", "Huber", "Poisson", "Hinge", "Logistic", "Periodic"),
                                     loss_by_col = c("Quadratic", "Absolute", "Huber", "Poisson", "Hinge", "Logistic", "Periodic", "Categorical", "Ordinal"),
                                     loss_by_col_idx = NULL,
                                     multi_loss = c("Categorical", "Ordinal"),
                                     period = 1,
                                     regularization_x = c("None", "Quadratic", "L2", "L1", "NonNegative", "OneSparse", "UnitOneSparse", "Simplex"),
                                     regularization_y = c("None", "Quadratic", "L2", "L1", "NonNegative", "OneSparse", "UnitOneSparse", "Simplex"),
                                     gamma_x = 0,
                                     gamma_y = 0,
                                     max_iterations = 1000,
                                     max_updates = 2000,
                                     init_step_size = 1,
                                     min_step_size = 0.0001,
                                     seed = -1,
                                     init = c("Random", "SVD", "PlusPlus", "User"),
                                     svd_method = c("GramSVD", "Power", "Randomized"),
                                     user_y = NULL,
                                     user_x = NULL,
                                     expand_user_y = TRUE,
                                     impute_original = FALSE,
                                     recover_svd = FALSE,
                                     max_runtime_secs = 0,
                                     export_checkpoints_dir = NULL,
                                     segment_columns = NULL,
                                     segment_models_id = NULL,
                                     parallelism = 1)
{
  # formally define variables that were excluded from function parameters
  model_id <- NULL
  verbose <- NULL
  destination_key <- NULL
  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)
  validation_frame <- .validate.H2OFrame(validation_frame, required=FALSE)

  # Build parameter list to send to model builder
  parms <- list()
  parms$training_frame <- training_frame
  if(!missing(cols))
    parms$ignored_columns <- .verify_datacols(training_frame, cols)$cols_ignore  

  if (!missing(validation_frame))
    parms$validation_frame <- validation_frame
  if (!missing(ignore_const_cols))
    parms$ignore_const_cols <- ignore_const_cols
  if (!missing(score_each_iteration))
    parms$score_each_iteration <- score_each_iteration
  if (!missing(representation_name))
    parms$representation_name <- representation_name
  if (!missing(loading_name))
    parms$loading_name <- loading_name
  if (!missing(transform))
    parms$transform <- transform
  if (!missing(k))
    parms$k <- k
  if(!missing(loss)) {
    if(loss == "MeanSquare") {
      warning("Loss name 'MeanSquare' is deprecated; please use 'Quadratic' instead.")
      parms$loss <- "Quadratic"
    } else 
      parms$loss <- loss
  }
  if (!missing(loss_by_col))
    parms$loss_by_col <- loss_by_col
  if (!missing(loss_by_col_idx))
    parms$loss_by_col_idx <- loss_by_col_idx
  if (!missing(multi_loss))
    parms$multi_loss <- multi_loss
  if (!missing(period))
    parms$period <- period
  if (!missing(regularization_x))
    parms$regularization_x <- regularization_x
  if (!missing(regularization_y))
    parms$regularization_y <- regularization_y
  if (!missing(gamma_x))
    parms$gamma_x <- gamma_x
  if (!missing(gamma_y))
    parms$gamma_y <- gamma_y
  if (!missing(max_iterations))
    parms$max_iterations <- max_iterations
  if (!missing(max_updates))
    parms$max_updates <- max_updates
  if (!missing(init_step_size))
    parms$init_step_size <- init_step_size
  if (!missing(min_step_size))
    parms$min_step_size <- min_step_size
  if (!missing(seed))
    parms$seed <- seed
  if (!missing(init))
    parms$init <- init
  if (!missing(svd_method))
    parms$svd_method <- svd_method
  if (!missing(user_y))
    parms$user_y <- user_y
  if (!missing(user_x))
    parms$user_x <- user_x
  if (!missing(expand_user_y))
    parms$expand_user_y <- expand_user_y
  if (!missing(impute_original))
    parms$impute_original <- impute_original
  if (!missing(recover_svd))
    parms$recover_svd <- recover_svd
  if (!missing(max_runtime_secs))
    parms$max_runtime_secs <- max_runtime_secs
  if (!missing(export_checkpoints_dir))
    parms$export_checkpoints_dir <- export_checkpoints_dir

  # Check if user_y is an acceptable set of user-specified starting points
  if( is.data.frame(user_y) || is.matrix(user_y) || is.list(user_y) || is.H2OFrame(user_y) ) {
    # Convert user-specified starting points to H2OFrame
    if( is.data.frame(user_y) || is.matrix(user_y) || is.list(user_y) ) {
      if( !is.data.frame(user_y) && !is.matrix(user_y) ) user_y <- t(as.data.frame(user_y))
      user_y <- as.h2o(user_y)
    }
    parms[["user_y"]] <- user_y

    # Set k
    if( !(missing(k)) && k!=as.integer(nrow(user_y)) ) {
      warning("Argument k is not equal to the number of rows in user-specified Y. Ignoring k. Using specified Y.")
    }
    if ( !missing(loading_name)) {
      warning("Argument loading_name is deprecated.  Use representation_name instead.")
    }
    parms[["k"]] <- as.numeric(nrow(user_y))
  # } else if( is.null(user_y) ) {
  #  if(!missing(init) && parms[["init"]] == "User")
  #    warning("Initializing Y to a standard Gaussian random matrix.")
  # } else
  } else if( !is.null(user_y) )
    stop("Argument user_y must either be null or a valid user-defined starting Y matrix.")

  # Check if user_x is an acceptable set of user-specified starting points
  if( is.data.frame(user_x) || is.matrix(user_x) || is.list(user_x) || is.H2OFrame(user_x) ) {
    # Convert user-specified starting points to H2OFrame
    if( is.data.frame(user_x) || is.matrix(user_x) || is.list(user_x) ) {
      if( !is.data.frame(user_x) && !is.matrix(user_x) ) user_x <- t(as.data.frame(user_x))
      user_x <- as.h2o(user_x)
    }
    parms[["user_x"]] <- user_x
  # } else if( is.null(user_x) ) {
  #  if(!missing(init) && parms[["init"]] == "User")
  #    warning("Initializing X to a standard Gaussian random matrix.")
  # } else
  } else if( !is.null(user_x) )
    stop("Argument user_x must either be null or a valid user-defined starting X matrix.")

  # Build segment-models specific parameters
  segment_parms <- list()
  if (!missing(segment_columns))
    segment_parms$segment_columns <- segment_columns
  if (!missing(segment_models_id))
    segment_parms$segment_models_id <- segment_models_id
  segment_parms$parallelism <- parallelism

  # Error check and build segment models
  segment_models <- .h2o.segmentModelsJob('glrm', segment_parms, parms, h2oRestApiVersion=3)
  return(segment_models)
}


#' Reconstruct Training Data via H2O GLRM Model
#'
#' Reconstruct the training data and impute missing values from the H2O GLRM model
#' by computing the matrix product of X and Y, and transforming back to the original
#' feature space by minimizing each column's loss function.
#'
#' @param object An \linkS4class{H2ODimReductionModel} object that represents the
#'        model to be used for reconstruction.
#' @param data An H2OFrame object representing the training data for the H2O GLRM model.
#'        Used to set the domain of each column in the reconstructed frame.
#' @param reverse_transform (Optional) A logical value indicating whether to reverse the
#'        transformation from model-building by re-scaling columns and adding back the
#'        offset to each column of the reconstructed frame.
#' @return Returns an H2OFrame object containing the approximate reconstruction of the
#'         training data;
#' @seealso \code{\link{h2o.glrm}} for making an H2ODimReductionModel.
#' @examples
#' \dontrun{
#' library(h2o)
#' h2o.init()
#' iris_hf <- as.h2o(iris)
#' iris_glrm <- h2o.glrm(training_frame = iris_hf, k = 4, transform = "STANDARDIZE",
#'                       loss = "Quadratic", multi_loss = "Categorical", max_iterations = 1000)
#' iris_rec <- h2o.reconstruct(iris_glrm, iris_hf, reverse_transform = TRUE)
#' head(iris_rec)
#' }
#' @export
h2o.reconstruct <- function(object, data, reverse_transform=FALSE) {
  url <- paste0('Predictions/models/', object@model_id, '/frames/',h2o.getId(data))
  res <- .h2o.__remoteSend(url, method = "POST", reconstruct_train=TRUE, reverse_transform=reverse_transform)
  key <- res$model_metrics[[1L]]$predictions$frame_id$name
  h2o.getFrame(key)
}

#' Convert Archetypes to Features from H2O GLRM Model
#'
#' Project each archetype in an H2O GLRM model into the corresponding feature
#' space from the H2O training frame.
#'
#' @param object An \linkS4class{H2ODimReductionModel} object that represents the
#'        model containing archetypes to be projected.
#' @param data An H2OFrame object representing the training data for the H2O GLRM model.
#' @param reverse_transform (Optional) A logical value indicating whether to reverse the
#'        transformation from model-building by re-scaling columns and adding back the
#'        offset to each column of the projected archetypes.
#' @return Returns an H2OFrame object containing the projection of the archetypes
#'         down into the original feature space, where each row is one archetype.
#' @seealso \code{\link{h2o.glrm}} for making an H2ODimReductionModel.
#' @examples
#' \dontrun{
#' library(h2o)
#' h2o.init()
#' iris_hf <- as.h2o(iris)
#' iris_glrm <- h2o.glrm(training_frame = iris_hf, k = 4, loss = "Quadratic",
#'                       multi_loss = "Categorical", max_iterations = 1000)
#' iris_parch <- h2o.proj_archetypes(iris_glrm, iris_hf)
#' head(iris_parch)
#' }
#' @export
h2o.proj_archetypes <- function(object, data, reverse_transform=FALSE) {
  url <- paste0('Predictions/models/', object@model_id, '/frames/',h2o.getId(data))
  res <- .h2o.__remoteSend(url, method = "POST", project_archetypes=TRUE, reverse_transform=reverse_transform)
  key <- res$model_metrics[[1L]]$predictions$frame_id$name
  h2o.getFrame(key)
}