inst/doc/getting_started.R

## ---- eval = FALSE------------------------------------------------------------
#  library(devtools)
#  devtools::install_github("h2oai/h2o4gpu", subdir = "src/interface_r")

## ---- eval = FALSE------------------------------------------------------------
#  library(reticulate)
#  use_virtualenv("/home/username/venv/h2o4gpu")  # set this to the path of your venv
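
## ---- eval = FALSE------------------------------------------------------------
#  # A quick sanity check (a sketch, not from the original vignette): confirm
#  # that reticulate is bound to the intended Python and can see the h2o4gpu
#  # Python module before loading the R package.
#  py_config()
#  py_module_available("h2o4gpu")  # should return TRUE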

## ---- eval = FALSE------------------------------------------------------------
#  library(h2o4gpu)
#  library(reticulate)  # only needed if using a virtual Python environment
#  use_virtualenv("/home/username/venv/h2o4gpu")  # set this to the path of your venv
#  
#  # Prepare data
#  x <- iris[1:4]
#  y <- as.integer(iris$Species) # all columns, including the response, must be numeric
#  
#  # Initialize and train the classifier
#  model <- h2o4gpu.random_forest_classifier() %>% fit(x, y)
#  
#  # Make predictions
#  pred <- model %>% predict(x)
#  
#  # Compute classification error using the Metrics package (note this is training error)
#  library(Metrics)
#  ce(actual = y, predicted = pred)
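
## ---- eval = FALSE------------------------------------------------------------
#  # An optional follow-up (a sketch, not from the original vignette): a
#  # confusion matrix gives a per-class view of the same training predictions.
#  table(actual = y, predicted = pred)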

## ---- eval = FALSE------------------------------------------------------------
#  # Load a sample dataset for binary classification
#  # Source: https://archive.ics.uci.edu/ml/datasets/HIGGS
#  train <- read.csv("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
#  test <- read.csv("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
#  
#  # Create train & test sets (column 1 is the response)
#  x_train <- train[, -1]
#  y_train <- train[, 1]
#  x_test <- test[, -1]
#  y_test <- test[, 1]
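
## ---- eval = FALSE------------------------------------------------------------
#  # An optional sanity check (a sketch, not from the original vignette):
#  # verify the dimensions of the split and the class balance of the response.
#  dim(x_train)
#  table(y_train)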

## ---- eval = FALSE------------------------------------------------------------
#  # Train three different binary classification models
#  model_gbc <- h2o4gpu.gradient_boosting_classifier() %>% fit(x_train, y_train)
#  model_rfc <- h2o4gpu.random_forest_classifier() %>% fit(x_train, y_train)
#  model_enc <- h2o4gpu.elastic_net_classifier() %>% fit(x_train, y_train)

## ---- eval = FALSE------------------------------------------------------------
#  # Generate predictions (type = "prob" returns class probabilities instead of predicted labels)
#  pred_gbc <- model_gbc %>% predict(x_test, type = "prob")
#  pred_rfc <- model_rfc %>% predict(x_test, type = "prob")
#  pred_enc <- model_enc %>% predict(x_test, type = "prob")

## ---- eval = FALSE------------------------------------------------------------
#  head(pred_rfc)

## ---- eval = FALSE------------------------------------------------------------
#  # Compare test set performance using AUC
#  auc(actual = y_test, predicted = pred_gbc[, 2])
#  auc(actual = y_test, predicted = pred_rfc[, 2])
#  auc(actual = y_test, predicted = pred_enc[, 2])
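
## ---- eval = FALSE------------------------------------------------------------
#  # A compact way to view the same comparison (a sketch, not from the
#  # original vignette): collect the three test-set AUCs in one data frame.
#  data.frame(
#    model = c("gbc", "rfc", "enc"),
#    auc = c(auc(y_test, pred_gbc[, 2]),
#            auc(y_test, pred_rfc[, 2]),
#            auc(y_test, pred_enc[, 2]))
#  )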

## ---- eval = FALSE------------------------------------------------------------
#  # Load a sample dataset for regression
#  # Source: https://archive.ics.uci.edu/ml/datasets/Abalone
#  df <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", header = FALSE)
#  str(df)
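
## ---- eval = FALSE------------------------------------------------------------
#  # Optional (a sketch, not from the original vignette): the UCI page lists
#  # the attribute names, so you can label the columns for readability.
#  # Column 9 (rings) remains the response either way.
#  colnames(df) <- c("sex", "length", "diameter", "height", "whole_weight",
#                    "shucked_weight", "viscera_weight", "shell_weight", "rings")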

## ---- eval = FALSE------------------------------------------------------------
#  df[, 1] <- as.integer(as.factor(df[, 1]))  # label encode the one non-numeric column (as.factor guards against character input on R >= 4.0)

## ---- eval = FALSE------------------------------------------------------------
#  # Randomly sample 80% of the rows for the training set
#  set.seed(1)
#  train_idx <- sample(1:nrow(df), 0.8*nrow(df))
#  
#  # Create train & test sets (column 9 is the response)
#  x_train <- df[train_idx, -9]
#  y_train <- df[train_idx, 9]
#  x_test <- df[-train_idx, -9]
#  y_test <- df[-train_idx, 9]

## ---- eval = FALSE------------------------------------------------------------
#  # Train two different regression models
#  model_gbr <- h2o4gpu.gradient_boosting_regressor() %>% fit(x_train, y_train)
#  model_enr <- h2o4gpu.elastic_net_regressor() %>% fit(x_train, y_train)
#  
#  # Generate predictions
#  pred_gbr <- model_gbr %>% predict(x_test)
#  pred_enr <- model_enr %>% predict(x_test)

## ---- eval = FALSE------------------------------------------------------------
#  head(pred_gbr)

## ---- eval = FALSE------------------------------------------------------------
#  # Compare test set performance using MSE
#  mse(actual = y_test, predicted = pred_gbr)
#  mse(actual = y_test, predicted = pred_enr)
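
## ---- eval = FALSE------------------------------------------------------------
#  # RMSE (a sketch, not from the original vignette) reports the same
#  # comparison in the units of the response, which can be easier to read.
#  rmse(actual = y_test, predicted = pred_gbr)
#  rmse(actual = y_test, predicted = pred_enr)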

## ---- eval = FALSE------------------------------------------------------------
#  # Prepare data
#  iris$Species <- as.integer(iris$Species) # convert to numeric data
#  
#  # Randomly sample 80% of the rows for the training set
#  set.seed(1)
#  train_idx <- sample(1:nrow(iris), 0.8*nrow(iris))
#  train <- iris[train_idx, ]
#  test <- iris[-train_idx, ]

## ---- eval = FALSE------------------------------------------------------------
#  model_km <- h2o4gpu.kmeans(n_clusters = 3L) %>% fit(train)

## ---- eval = FALSE------------------------------------------------------------
#  test_dist <- model_km %>% transform(test)
#  head(test_dist)
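
## ---- eval = FALSE------------------------------------------------------------
#  # A sketch (not from the original vignette): transform() returns each row's
#  # distance to the 3 cluster centers, so the nearest center gives the
#  # cluster assignment.
#  cluster_assignment <- apply(test_dist, 1, which.min)
#  table(cluster_assignment, test$Species)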

## ---- eval = FALSE------------------------------------------------------------
#  # Load a sample dataset for binary classification
#  # Source: https://archive.ics.uci.edu/ml/datasets/HIGGS
#  train <- read.csv("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
#  test <- read.csv("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

## ---- eval = FALSE------------------------------------------------------------
#  model_pca <- h2o4gpu.pca(n_components = 4L) %>% fit(train)
#  test_transformed <- model_pca %>% transform(test)
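
## ---- eval = FALSE------------------------------------------------------------
#  # An optional check (a sketch, not from the original vignette): the
#  # transformed data should have 4 columns, one per principal component.
#  dim(test_transformed)
#  head(test_transformed)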

## ---- eval = FALSE------------------------------------------------------------
#  model_tsvd <- h2o4gpu.truncated_svd(n_components = 4L) %>% fit(train)
#  test_transformed <- model_tsvd %>% transform(test)
