# https://github.com/h2oai/h2o-tutorials/blob/master/h2o-open-tour-2016/chicago/intro-to-h2o.R
# Introductory H2O Machine Learning Tutorial
# Prepared for H2O Open Chicago 2016: http://open.h2o.ai/chicago.html

# First step is to download & install the h2o R library
# The latest version is available by clicking on the R tab here:
# http://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Load the H2O library and start up the H2O cluster locally on your machine
library(h2o)
h2o.init(nthreads = -1,        #number of threads; -1 means use all cores on your machine
         max_mem_size = "8G")  #max_mem_size is the maximum memory to allocate to H2O
# Next we will import a cleaned up version of the Lending Club "Bad Loans" dataset
# The purpose here is to predict whether a loan will be bad (not repaid to the lender)
# The response column, bad_loan, is 1 if the loan was bad, and 0 otherwise

# Import the data
# loan_csv <- "/Volumes/H2OTOUR/loan.csv"  # modify this for your machine
# Alternatively, you can import the data directly from a URL
loan_csv <- "https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv"
data <- h2o.importFile(loan_csv)  # 163,987 rows x 15 columns
dim(data)
# [1] 163987     15
# Since we want to train a binary classification model,
# we must ensure that the response is coded as a factor
# If the response is 0/1, H2O will assume it's numeric,
# which means that H2O will train a regression model instead
data$bad_loan <- as.factor(data$bad_loan)  #encode the binary response as a factor
h2o.levels(data$bad_loan)  #optional: after encoding, this shows the two factor levels, '0' and '1'
# [1] "0" "1"
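
# Optionally, you can also inspect the class balance of the response before modeling;
# bad loans are typically the minority class in this dataset, so it's worth knowing the
# 0/1 counts up front (this check is illustrative, not required for the rest of the tutorial)
h2o.table(data$bad_loan)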
# Partition the data into training, validation and test sets
splits <- h2o.splitFrame(data = data,
                         ratios = c(0.7, 0.15),  #partition data into 70%, 15%, 15% chunks
                         seed = 1)  #setting a seed will guarantee reproducibility
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]

# Take a look at the size of each partition
# Notice that h2o.splitFrame uses approximate splitting, not exact splitting (for efficiency),
# so these are not exactly 70%, 15% and 15% of the total rows
nrow(train)  # 114908
nrow(valid)  # 24498
nrow(test)   # 24581
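
# If you want to see how close the approximate split came to the requested ratios, you can
# compute the actual fractions (the exact values will depend on the seed you used)
nrow(train) / nrow(data)
nrow(valid) / nrow(data)
nrow(test)  / nrow(data)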
# Identify response and predictor variables
y <- "bad_loan"
x <- setdiff(names(data), c(y, "int_rate"))  #remove the interest rate column because it's correlated with the outcome
print(x)
# [1] "loan_amnt"             "term"
# [3] "emp_length"            "home_ownership"
# [5] "annual_inc"            "verification_status"
# [7] "purpose"               "addr_state"
# [9] "dti"                   "delinq_2yrs"
# [11] "revol_util"            "total_acc"
# [13] "longest_credit_length"
# Now that we have prepared the data, we can train some models
# We will start by training a single model from each of the H2O supervised algos:
# 1. Generalized Linear Model (GLM)
# 2. Random Forest (RF)
# 3. Gradient Boosting Machine (GBM)
# 4. Deep Learning (DL)
# 5. Naive Bayes (NB)


# 1. Let's start with a basic binomial Generalized Linear Model
# By default, h2o.glm uses a regularized, elastic net model
glm_fit1 <- h2o.glm(x = x,
                    y = y,
                    training_frame = train,
                    model_id = "glm_fit1",
                    family = "binomial")  #similar to R's glm, h2o.glm has the family argument
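
# If you'd like to look inside the fitted GLM, recent h2o versions expose the coefficients
# directly; if h2o.coef() is not available in your version, the coefficients table stored
# on the model object is an alternative (both calls below are optional)
h2o.coef(glm_fit1)
glm_fit1@model$coefficients_table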
# Next we will do some automatic tuning by passing in a validation frame and setting
# `lambda_search = TRUE`.  Since we are training a GLM with regularization, we should
# try to find the right amount of regularization (to avoid overfitting).  The model
# parameter, `lambda`, controls the amount of regularization in a GLM model, and we can
# find the optimal value for `lambda` automatically by setting `lambda_search = TRUE`
# and passing in a validation frame (which is used to evaluate model performance using a
# particular value of lambda).
glm_fit2 <- h2o.glm(x = x,
                    y = y,
                    training_frame = train,
                    model_id = "glm_fit2",
                    validation_frame = valid,
                    family = "binomial",
                    lambda_search = TRUE)
# Let's compare the performance of the two GLMs
glm_perf1 <- h2o.performance(model = glm_fit1,
                             newdata = test)
glm_perf2 <- h2o.performance(model = glm_fit2,
                             newdata = test)

# Print model performance
glm_perf1
glm_perf2
# Instead of printing the entire model performance metrics object,
# it is probably easier to print just the metric that you are interested in comparing

# Retrieve test set AUC
h2o.auc(glm_perf1)  #0.677449084114
h2o.auc(glm_perf2)  #0.677675858276
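
# Other metrics can be pulled from the same performance object, for example the logloss
# or the confusion matrix (reported at the max-F1 threshold); the exact numbers will
# depend on your split and h2o version
h2o.logloss(glm_perf2)
h2o.confusionMatrix(glm_perf2)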
# Compare test AUC to the training AUC and validation AUC
h2o.auc(glm_fit2, train = TRUE)  #0.674306164325
h2o.auc(glm_fit2, valid = TRUE)  #0.675512216705
glm_fit2@model$validation_metrics  #0.675512216705
# 2. Random Forest
# H2O's Random Forest (RF) implements a distributed version of the standard
# Random Forest algorithm and variable importance measures.

# First we will train a basic Random Forest model with default parameters.
# The Random Forest model will infer the response distribution from the response encoding.
# A seed is required for reproducibility.
rf_fit1 <- h2o.randomForest(x = x,
                            y = y,
                            training_frame = train,
                            model_id = "rf_fit1",
                            seed = 1)
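
# As mentioned above, the RF also computes variable importances; once the model is trained
# you can retrieve or plot them (optional, and the exact rankings may vary slightly between
# runs and h2o versions)
h2o.varimp(rf_fit1)
h2o.varimp_plot(rf_fit1)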
# Next we will increase the number of trees used in the forest by setting `ntrees = 100`.
# The default number of trees in an H2O Random Forest is 50, so this RF will be twice as
# big as the default.  Usually increasing the number of trees in a RF will increase
# performance as well.  Unlike Gradient Boosting Machines (GBMs), Random Forests are fairly
# resistant to (although not free from) overfitting.
# See the GBM example below for additional guidance on preventing overfitting using H2O's
# early stopping functionality.
rf_fit2 <- h2o.randomForest(x = x,
                            y = y,
                            training_frame = train,
                            model_id = "rf_fit2",
                            #validation_frame = valid,  #only used if stopping_rounds > 0
                            ntrees = 100,
                            seed = 1)
# Let's compare the performance of the two RFs
rf_perf1 <- h2o.performance(model = rf_fit1,
                            newdata = test)
rf_perf2 <- h2o.performance(model = rf_fit2,
                            newdata = test)

# Print model performance
rf_perf1
rf_perf2

# Retrieve test set AUC
h2o.auc(rf_perf1)  # 0.662266990734
h2o.auc(rf_perf2)  # 0.66525468051
# Cross-validate performance
# Rather than using a held-out test set to evaluate model performance, a user may wish
# to estimate model performance using cross-validation.  Using the RF algorithm
# (with default model parameters) as an example, we demonstrate how to perform k-fold
# cross-validation using H2O.  No custom code or loops are required; you simply specify
# the number of desired folds in the nfolds argument.
# Since we are not using a separate test set here, we could use the original (full) dataset,
# which we called `data`, rather than the subsampled `train` dataset (the code below keeps
# `train` so the results stay comparable to the other RF models).  Note that this will
# take approximately k (nfolds) times longer than training a single RF model, since it
# will train k models in the cross-validation process (each trained on n(k-1)/k rows), in
# addition to the final model trained on the full training_frame dataset with n rows.
rf_fit3 <- h2o.randomForest(x = x,
                            y = y,
                            training_frame = train,
                            model_id = "rf_fit3",
                            seed = 1,
                            nfolds = 5)
# To evaluate the cross-validated AUC, do the following:
h2o.auc(rf_fit3, xval = TRUE)  # 0.661201482614
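
# A more detailed view of the cross-validation results (mean and standard deviation of each
# metric across the 5 folds) is stored on the model object; the slot name below is what
# current h2o versions use and may differ in older releases
rf_fit3@model$cross_validation_metrics_summary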
# 3. Gradient Boosting Machine
# H2O's Gradient Boosting Machine (GBM) offers a Stochastic GBM, which can
# increase performance quite a bit compared to the original GBM implementation.

# Now we will train a basic GBM model
# The GBM model will infer the response distribution from the response encoding if not specified
# explicitly through the `distribution` argument.  A seed is required for reproducibility.
gbm_fit1 <- h2o.gbm(x = x,
                    y = y,
                    training_frame = train,
                    model_id = "gbm_fit1",
                    seed = 1)
# Next we will increase the number of trees used in the GBM by setting `ntrees = 500`.
# The default number of trees in an H2O GBM is 50, so this GBM will be trained using ten times
# the default.  Increasing the number of trees in a GBM is one way to increase performance
# of the model; however, you have to be careful not to overfit your model to the training data
# by using too many trees.  To automatically find the optimal number of trees, you must use
# H2O's early stopping functionality.  This example will not do that; however, the following
# example will.
gbm_fit2 <- h2o.gbm(x = x,
                    y = y,
                    training_frame = train,
                    model_id = "gbm_fit2",
                    #validation_frame = valid,  #only used if stopping_rounds > 0
                    ntrees = 500,
                    seed = 1)
# We will again set `ntrees = 500`; however, this time we will use early stopping in order to
# prevent overfitting (from too many trees).  All of H2O's algorithms have early stopping available,
# however early stopping is not enabled by default (with the exception of Deep Learning).
# There are several parameters that should be used to control early stopping.  The three that are
# common to all the algorithms are: `stopping_rounds`, `stopping_metric` and `stopping_tolerance`.
# The stopping metric is the metric by which you'd like to measure performance, and so we will choose
# AUC here.  The `score_tree_interval` is a parameter specific to the Random Forest model and the GBM.
# Setting `score_tree_interval = 5` will score the model after every five trees.  The parameters we
# have set below specify that the model will stop training after there have been three scoring intervals
# where the AUC has not increased by more than 0.0005.  Since we have specified a validation frame,
# the stopping tolerance will be computed on validation AUC rather than training AUC.
gbm_fit3 <- h2o.gbm(x = x,
                    y = y,
                    training_frame = train,
                    model_id = "gbm_fit3",
                    validation_frame = valid,  #only used if stopping_rounds > 0
                    ntrees = 500,
                    score_tree_interval = 5,      #used for early stopping
                    stopping_rounds = 3,          #used for early stopping
                    stopping_metric = "AUC",      #used for early stopping
                    stopping_tolerance = 0.0005,  #used for early stopping
                    seed = 1)

# Let's compare the performance of the three GBMs
gbm_perf1 <- h2o.performance(model = gbm_fit1,
                             newdata = test)
gbm_perf2 <- h2o.performance(model = gbm_fit2,
                             newdata = test)
gbm_perf3 <- h2o.performance(model = gbm_fit3,
                             newdata = test)

# Print model performance
gbm_perf1
gbm_perf2
gbm_perf3

# Retrieve test set AUC
h2o.auc(gbm_perf1)  # 0.682765594191
h2o.auc(gbm_perf2)  # 0.671854616713
h2o.auc(gbm_perf3)  # 0.68309902855
# To examine the scoring history, use the `scoring_history` method on a trained model.
# If `score_tree_interval` is not specified, the model is scored at various intervals, as we can
# see for `gbm_fit2` below.  Regular five-tree intervals are used when `score_tree_interval = 5`
# is specified, as in `gbm_fit3`.
# The `gbm_fit2` model was trained only using a training set (no validation set), so the scoring
# history is calculated for training set performance metrics only.
h2o.scoreHistory(gbm_fit2)
# When early stopping is used, we see that training stopped at 105 trees instead of the full 500.
# Since we used a validation set in `gbm_fit3`, both training and validation performance metrics
# are stored in the scoring history object.  Take a look at the validation AUC to observe that the
# correct stopping tolerance was enforced.
h2o.scoreHistory(gbm_fit3)
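
# If you just want the number of trees the early-stopped model actually kept, the model
# summary (also shown when the model is printed) records it; the slot access below assumes
# the standard H2O model object layout
gbm_fit3@model$model_summary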
# Look at scoring history for third GBM model
plot(gbm_fit3,
     timestep = "number_of_trees",
     metric = "AUC")
plot(gbm_fit3,
     timestep = "number_of_trees",
     metric = "logloss")
# 4. Deep Learning
# H2O's Deep Learning algorithm is a multilayer feed-forward artificial neural network.
# It can also be used to train an autoencoder.  In this example we will train
# a standard supervised prediction model.

# Train a default DL
# First we will train a basic DL model with default parameters.  The DL model will infer the response
# distribution from the response encoding if it is not specified explicitly through the `distribution`
# argument.  H2O's DL will not be reproducible if it is run on more than a single core, so in this example,
# the performance metrics below may vary slightly from what you see on your machine.
# In H2O's DL, early stopping is enabled by default, so below, it will use the training set and
# default stopping parameters to perform early stopping.
dl_fit1 <- h2o.deeplearning(x = x,
                            y = y,
                            training_frame = train,
                            model_id = "dl_fit1",
                            seed = 1)

# Train a DL with new architecture and more epochs
# Next we will increase the number of epochs used in the DL by setting `epochs = 20` (the default is 10).
# Increasing the number of epochs in a deep neural net may increase performance of the model; however,
# you have to be careful not to overfit your model to your training data.  To automatically find the
# optimal number of epochs, you must use H2O's early stopping functionality.  Unlike the rest of the H2O
# algorithms, H2O's DL uses early stopping by default, so for comparison we will first turn off early
# stopping.  We do this in the next example by setting `stopping_rounds = 0`.
dl_fit2 <- h2o.deeplearning(x = x,
                            y = y,
                            training_frame = train,
                            model_id = "dl_fit2",
                            #validation_frame = valid,  #only used if stopping_rounds > 0
                            epochs = 20,
                            hidden = c(10,10),
                            stopping_rounds = 0,  # disable early stopping
                            seed = 1)

# Train a DL with early stopping
# This example will use the same model parameters as `dl_fit2`.  This time, we will turn on
# early stopping and specify the stopping criterion.  We will also pass a validation set, as is
# recommended for early stopping.
dl_fit3 <- h2o.deeplearning(x = x,
                            y = y,
                            training_frame = train,
                            model_id = "dl_fit3",
                            validation_frame = valid,  #in DL, early stopping is on by default
                            epochs = 20,
                            hidden = c(10,10),
                            score_interval = 1,           #used for early stopping
                            stopping_rounds = 3,          #used for early stopping
                            stopping_metric = "AUC",      #used for early stopping
                            stopping_tolerance = 0.0005,  #used for early stopping
                            seed = 1)

# Let's compare the performance of the three DL models
dl_perf1 <- h2o.performance(model = dl_fit1,
                            newdata = test)
dl_perf2 <- h2o.performance(model = dl_fit2,
                            newdata = test)
dl_perf3 <- h2o.performance(model = dl_fit3,
                            newdata = test)

# Print model performance
dl_perf1
dl_perf2
dl_perf3

# Retrieve test set AUC
h2o.auc(dl_perf1)  # 0.6774335
h2o.auc(dl_perf2)  # 0.678446
h2o.auc(dl_perf3)  # 0.6770498

# Scoring history
h2o.scoreHistory(dl_fit3)
# Scoring History:
#             timestamp   duration  training_speed   epochs
# 1 2016-05-03 10:33:29  0.000 sec                  0.00000
# 2 2016-05-03 10:33:29  0.347 sec 424697 rows/sec  0.86851
# 3 2016-05-03 10:33:30  1.356 sec 601925 rows/sec  6.09185
# 4 2016-05-03 10:33:31  2.348 sec 717617 rows/sec 13.05168
# 5 2016-05-03 10:33:32  3.281 sec 777538 rows/sec 20.00783
# 6 2016-05-03 10:33:32  3.345 sec 777275 rows/sec 20.00783
#   iterations        samples training_MSE training_r2
# 1          0       0.000000
# 2          1   99804.000000      0.14402     0.03691
# 3          7  700039.000000      0.14157     0.05333
# 4         15 1499821.000000      0.14033     0.06159
# 5         23 2299180.000000      0.14079     0.05853
# 6         23 2299180.000000      0.14157     0.05333
#   training_logloss training_AUC training_lift
# 1
# 2          0.45930      0.66685       2.20727
# 3          0.45220      0.68133       2.59354
# 4          0.44710      0.67993       2.70390
# 5          0.45100      0.68192       2.81426
# 6          0.45220      0.68133       2.59354
#   training_classification_error validation_MSE validation_r2
# 1
# 2                        0.36145        0.14682       0.03426
# 3                        0.33647        0.14500       0.04619
# 4                        0.37126        0.14411       0.05204
# 5                        0.32868        0.14474       0.04793
# 6                        0.33647        0.14500       0.04619
#   validation_logloss validation_AUC validation_lift
# 1
# 2            0.46692        0.66582         2.53209
# 3            0.46256        0.67354         2.64124
# 4            0.45789        0.66986         2.44478
# 5            0.46292        0.67117         2.70672
# 6            0.46256        0.67354         2.64124
#   validation_classification_error
# 1
# 2                         0.37197
# 3                         0.34716
# 4                         0.34385
# 5                         0.36544
# 6                         0.34716

# Look at scoring history for third DL model
plot(dl_fit3,
     timestep = "epochs",
     metric = "AUC")
# 5. Naive Bayes model
# The Naive Bayes (NB) algorithm does not usually beat an algorithm like a Random Forest
# or GBM; however, it is still a popular algorithm, especially in the text domain (when your
# input is text encoded as "Bag of Words", for example).  The Naive Bayes algorithm is for
# binary or multiclass classification problems only, not regression.  Therefore, your response
# must be a factor instead of numeric.

# First we will train a basic NB model with default parameters
nb_fit1 <- h2o.naiveBayes(x = x,
                          y = y,
                          training_frame = train,
                          model_id = "nb_fit1")

# Train a NB model with Laplace smoothing
# One of the few tunable model parameters for the Naive Bayes algorithm is the amount of Laplace
# smoothing.  The H2O Naive Bayes model will not use any Laplace smoothing by default.
nb_fit2 <- h2o.naiveBayes(x = x,
                          y = y,
                          training_frame = train,
                          model_id = "nb_fit2",
                          laplace = 6)

# Let's compare the performance of the two NB models
nb_perf1 <- h2o.performance(model = nb_fit1,
                            newdata = test)
nb_perf2 <- h2o.performance(model = nb_fit2,
                            newdata = test)

# Print model performance
nb_perf1
nb_perf2

# Retrieve test set AUC
h2o.auc(nb_perf1)  # 0.6488014
h2o.auc(nb_perf2)  # 0.6490678
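
# Once you have settled on a model, generating predictions on new data is the usual final step.
# As an illustration, the example below scores the test set with gbm_fit3 (the model with the
# highest test AUC above); h2o.predict returns the predicted class plus the class probabilities.
pred <- h2o.predict(gbm_fit3, newdata = test)
head(pred)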