R/h2o_forecast.R

Defines functions forecast_h2o h2o_fitting

# ========================================== #
# Project automatic forecast ML
# Jared Chung
# 23/08/2017
# file: machine_learning_forecast
# ========================================== #

#' Forecast using h2o Machine learning techniques
#'
#' @param train Takes a TS object as an argument
#' @param test
#' @param
#' @param
#' @param
#' @param
#'
#' @example

#' @export


#https://github.com/h2oai/h2o-tutorials/blob/master/h2o-open-tour-2016/chicago/grid-search-model-selection.R
#http://www.business-science.io/code-tools/2017/10/28/demo_week_h2o.html

###################################
# H2o
####################################


forecast_h2o <- function(train,
                         test,
                         seed = 42,
                         run_automl = FALSE) {

  # Convert into dataframe if object is TS

  if(class(train) == "ts") {
    train <- data.frame(list(date = as.Date(lubridate::date_decimal(as.numeric(time(train)))),
                            value = as.numeric(train)))
    print("Object is Time Series")
  }
  if(class(test) == "ts") {
    test <- data.frame(list(date = as.Date(lubridate::date_decimal(as.numeric(time(test)))),
                            value = as.numeric(test)))
    print("Object is Time Series")
  }


  # h2o initilization
  init <- h2o::h2o.init(strict_version_check = FALSE, nthreads = -1)

  # Convert into h2o data frame
  train_h2o <- h2o::as.h2o(train)
  test_h2o <- h2o::as.h2o(test)

  # assign x and y index
  y_index <- 'value'
  x_index <- setdiff(names(train_h2o), y_index)

  # Hyperparamter search function

  h2o_gridsearch <- function(grid_id,
                             sort_by = "rmse",
                             decreasing = FALSE) {

    gridperf <- h2o::h2o.getGrid(grid_id = grid_id,
                                     sort_by = sort_by,
                                     decreasing = decreasing)

    h2o_model_id <- gridperf@model_ids[[1]]
    h2o_model <- h2o::h2o.getModel(h2o_model_id)

    model_param <- as.data.frame(h2o_model@parameters)

    return(list(h2o_model = h2o_model,
                model_param = model_param))
  }

  #========================================
  # ELASTIC NET
  #========================================

  # Set range of alpha to be searched on (deciding the balance between ridge(L2) and lasso(l1))

  print("running Generalised Linear Model")

  # solvers <- c("IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE", "COORDINATE_DESCENT")
  #
  # families <- c("gaussian", "poisson", "gamma")
  #
  # gaussianLinks <- c("identity", "log", "inverse")
  #
  # poissonLinks <- c("log")
  #
  # gammaLinks <- c("identity", "log", "inverse")
  # gammaLinks_CD <- c("identity", "log")

  # if(family == "gaussian")
  #     theLinks <- gaussianLinks
  #   else if(family == "poisson")theLinks <- poissonLinks
  #   else{
  #     if(solver == "COORDINATE_DESCENT")theLinks <- gammaLinks_CD
  #     else theLinks = gammaLinks
  # }

  # allGrids <- lapply(solvers, function(solver){
  #   lapply(families, function(family){
  #
  #     if(family == "gaussian")theLinks <- gaussianLinks
  #     else if(family == "poisson")theLinks <- poissonLinks
  #     else{
  #       if(solver == "COORDINATE_DESCENT")theLinks <- gammaLinks_CD
  #       else theLinks = gammaLinks
  #     }
  #
  #     lapply(theLinks, function(link){
  #       grid_id = paste("GLM", solver, family, link, sep="_")
  #       h2o.grid("glm", grid_id = grid_id,
  #                hyper_params = list(
  #                  alpha = c(0, 0.1, 0.5, 0.99)
  #                ),
  #                x = x, y = y, training_frame = train,
  #                nfolds = 10,
  #                lambda_search = TRUE,
  #
  #                solver = solver,
  #                family = family,
  #                link = link,
  #
  #                max_iterations = 100
  #       )
  #     })
  #   })
  # })

  glm_params = list(alpha = seq(0, 1, length = 10))

  glm_grid <- h2o::h2o.grid("glm",
                           x = x_index,
                           y = y_index,
                           grid_id = "glm_grid",
                           training_frame = train_h2o,
                           validation_frame = test_h2o,
                           family = "gaussian",
                           lambda_search = TRUE,
                           standardize = TRUE,
                           seed = seed,
                           hyper_params = glm_params)

  # Get the grid results, sorted by AUC

  glm_grid_details <- h2o_gridsearch("glm_grid")

  glm_h2o <- glm_grid_details$h2o_model

  glm_model_param <- glm_grid_details$model_param

  #========================================
  # RANDOM FOREST
  #========================================

  print("running Random Forest")

  rf_params = list(ntrees = c(100,300,500),
                   max_depth = c(3, 6, 9),
                   min_rows = c(1, 2),
                   mtries = c(2, 3, 4, 5),
                   sample_rate = c(0.5, 0.632, 0.8, 0.95),
                   col_sample_rate_per_tree = c(0.5, 0.9, 1.0))

  rf_grid <- h2o::h2o.grid("randomForest",
                            x = x_index,
                            y = y_index,
                            grid_id = "rf_grid",
                            training_frame = train_h2o,
                            validation_frame = test_h2o,
                            seed = seed,
                            hyper_params = rf_params)

  # Get the grid results, sorted by AUC

  rf_grid_details <- h2o_gridsearch("rf_grid")

  rf_h2o <- rf_grid_details$h2o_model

  rf_model_param <- rf_grid_details$model_param


  #========================================
  # GRADIENT BOOSTED MACHINE
  #========================================


  print("running Gradient Boosted Machine")

  gbm_params <- list(learn_rate = c(0.01, 0.05, 0.1),
                      max_depth = c(3, 6, 9),
                      sample_rate = c(0.6, 0.8, 1.0),
                      col_sample_rate = c(0.2, 0.5, 1.0),
                      ntrees = c(100,300,500),
                     min_rows = c(2, 5, 10),
                     col_sample_rate_per_tree = c(0.8, 0.99, 1.0))

  # Train and validate a grid of GBMs
  gbm_grid <- h2o::h2o.grid("gbm",
                        x = x_index,
                        y = y_index,
                        grid_id = "gbm_grid",
                        training_frame = train_h2o,
                        validation_frame = test_h2o,
                        seed = seed,
                        hyper_params = gbm_params)


  # Get the grid results, sorted by rmse
  gbm_grid_details <- h2o_gridsearch("gbm_grid")

  gbm_h2o <- gbm_grid_details$h2o_model

  gbm_model_param <- gbm_grid_details$model_param


  #========================================
  # MULTILAYER PERCEPTRON
  #========================================

  print("running Multilayer Perceptron")

  activation_opt <- c("Rectifier", "RectifierWithDropout", "Maxout", "MaxoutWithDropout")

  l1_opt <- c(0, 0.00001, 0.0001, 0.001, 0.01, 0.1)
  l2_opt <- c(0, 0.00001, 0.0001, 0.001, 0.01, 0.1)

  hyper_params <- list(activation = activation_opt,
                       l1 = l1_opt,
                       l2 = l2_opt)

  mlp_h2o <- h2o::h2o.deeplearning(
                        model_id="mlp_model",
                        x = x_index,
                        y = y_index,
                        training_frame = train_h2o,
                        validation_frame = test_h2o,   ## validation dataset: used for scoring and early stopping
                        activation = "Rectifier",  ## default
                        hidden = c(200,200),       ## default: 2 hidden layers with 200 neurons each
                        epochs = 100,
                        variable_importances = T    ## not enabled by default
                        )

  #========================================
  # AUTOMATIC MACHINE LEARNING
  #========================================

  if(run_automl) {
    automl_h2o <- h2o::h2o.automl(x = x_index,
                                y = y_index,
                                training_frame = train_h2o,
                                validation_frame = test_h2o,
                                #leaderboard_frame = test_h2o,
                                max_runtime_secs = 3300,
                                stopping_metric = "AUTO")
  } else {
    automl_h2o <- NA
  }


  # Model Results

  # To store data

  results <- matrix(nrow = 1,ncol = 5)

  results[1, 1] <- h2o::h2o.rmse(glm_h2o, valid=T)
  results[1, 2] <- h2o::h2o.rmse(rf_h2o, valid=T)
  results[1, 3] <- h2o::h2o.rmse(gbm_h2o, valid=T)
  results[1, 4] <- h2o::h2o.rmse(mlp_h2o, valid=T)

  if(run_automl) {
    results[1,5] <- h2o::h2o.rmse(automl_h2o, valid=T)
  } else {
    results[1,5] <- NA
  }


  results <- as.data.frame(results)
  colnames(results) <- c("glm","rf","gbm","mlp","auto_ml")

  models <- list(glm = glm_h2o,
                 rf = rf_h2o,
                 gbm = gbm_h2o,
                 mlp = mlp_h2o,
                 auto_ml = automl_h2o)

  #predictions <- as.h2o.predict(glm_h2o , newdata = )

  model_param <- list(glm = glm_model_param,
                      gbm = gbm_model_param,
                      rf = rf_model_param)

  #h2o::h2o.shutdown(prompt=FALSE)


  return(list(results = results,
              models = models,
              model_param = model_param))

}


h2o_fitting <- function() {

}


#
# Testing

# Run
#
# library(fpp2)
#
# data <- a10
#
# x_reg <- fit_feature_extracter(data, num_lag = 2, num_roll = 3)
#
# cv_horizon <- 1
# intitial_window <- 0.7
#
# trainslices <- cross_validation_data(data,
#                                      initialwindow = intitial_window,
#                                      horizon = cv_horizon)$train
# testslices <- cross_validation_data(data,
#                                     initialwindow = intitial_window,
#                                     horizon = cv_horizon)$test
#
# trainslices_xreg <- cross_validation_data(x_reg,
#                                           initialwindow = intitial_window,
#                                           horizon = cv_horizon)$train
# testslices_xreg <- cross_validation_data(x_reg,
#                                          initialwindow = intitial_window,
#                                          horizon = cv_horizon)$test
#
#
# # with external regressors
# result <- forecast_h2o(train = x_reg[trainslices_xreg[[1]],],
#                        test = x_reg[testslices_xreg[[1]],])



# # without external regressors
# result <- forecast_h2o(train = data[trainslices[[1]]],
#                        test = data[testslices[[1]]])


#h2o::h2o.shutdown(prompt=FALSE)



#TESTING h2o

# data <- a10
#
# cv_horizon <- 1
# intitial_window <- 0.7
#
# trainslices <- cross_validation_data(data,
#                                      initialwindow = intitial_window,
#                                      horizon = cv_horizon)$train
# testslices <- cross_validation_data(data,
#                                     initialwindow = intitial_window,
#                                     horizon = cv_horizon)$test
#
# trainslices_xreg <- cross_validation_data(external_regressor,
#                                           initialwindow = intitial_window,
#                                           horizon = cv_horizon)$train
# testslices_xreg <- cross_validation_data(external_regressor,
#                                          initialwindow = intitial_window,
#                                          horizon = cv_horizon)$test
#
#
# h2o.init()
#
# data <- data.frame(list(date = as.Date(time(data)),
#                         value = as.numeric(data)))
#
# data <- as.h2o(data)
#
# # GBM hyperparamters
# gbm_params1 <- list(learn_rate = c(0.01, 0.1),
#                     max_depth = c(3, 5, 9),
#                     sample_rate = c(0.8, 1.0),
#                     col_sample_rate = c(0.2, 0.5, 1.0))
#
# # Train and validate a grid of GBMs
# gbm_grid1 <- h2o.grid("gbm", x = "date", y = "value",
#                       grid_id = "gbm_grid1",
#                       training_frame = data[trainslices[[1]],],
#                       validation_frame = data[testslices[[1]],],
#                       ntrees = 100,
#                       seed = 1,
#                       hyper_params = gbm_params1)
#
#
# # Get the grid results, sorted by AUC
# gbm_gridperf1 <- h2o.getGrid(grid_id = "gbm_grid1",
#                              sort_by = "rmse",
#                              decreasing = FALSE)
#
#
# best_gbm_model_id <- gbm_gridperf1@model_ids[[1]]
# best_gbm <- h2o.getModel(best_gbm_model_id)
#
# model_param <- as.data.frame(best_gbm@parameters)
#
#
# alpha_opts = list(list(0), list(.25), list(.5), list(.75), list(1))
# hyper_parameters = list(alpha = alpha_opts)
#
# glm_h2o <- h2o::h2o.glm(x = x_index,
#                         y = y_index,
#                         training_frame = train_h2o,
#                         validation_frame = test_h2o,
#                         seed = seed,
#                         family = "gaussian",
#                         lambda_search = TRUE,
#                         standardize = TRUE,
#                         #hyper_params = hyper_parameters,
#                         nfolds = 5)
#
# #
# rf_h2o <- h2o::h2o.randomForest(x = x_index,
#                                 y = y_index,
#                                 training_frame = train_h2o,
#                                 validation_frame = test_h2o,
#                                 seed = seed,
#
# h2o.shutdown()





###################################
# Mxnet
####################################




###################################
# Lightgbm
####################################





###################################
# Catboost
####################################




###################################
# Fitting of models with cross validation
####################################
JaredChung/forecastml documentation built on May 21, 2019, 2:31 a.m.