R/binaryClassificationHelper.R

# Helper functions

# Default number of top features reported by h2o.varimp()
TOP_FEATURES <- 5

#' h2o.get_auc
#' @description Official H2O helper function (binaryClassificationHelper.R).
#' @export
h2o.get_auc <- function(model, data, response) {
  # Score `data` and keep the predicted probability of the positive class
  # (column 3 of the legacy prediction frame: label, p0, p1).
  pred <- h2o.predict(model, data)[, 3]
  perf <- h2o.performance(pred, data[, response])
  return(perf@model$auc)
}
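
# A hedged usage sketch (not run): assumes a live legacy H2O 2.x cluster and
# hypothetical parsed frames `train_hex` / `valid_hex` with a binary response
# column "label"; none of these objects are created in this file.
if (FALSE) {
  model <- h2o.gbm(x = setdiff(colnames(train_hex), "label"),
                   y = "label", data = train_hex)
  h2o.get_auc(model, valid_hex, "label")  # AUC of p1 against the true labels
}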

#' h2o.varimp
#' @description Official H2O helper function (binaryClassificationHelper.R).
#' @export
h2o.varimp <- function(algo, model) {
  # Thin wrapper around h2o.get_varimp() using the package-level default.
  h2o.get_varimp(algo, model, top_features = TOP_FEATURES)
}

#' h2o.get_varimp
#' @description Official H2O helper function (binaryClassificationHelper.R).
#' @export
h2o.get_varimp <- function(algo, model, top_features = 5) {
  if (identical(algo, h2o.glm)) {
    # GLM: rank features by absolute normalized coefficient.
    varimp <- paste(names(sort(abs(model@model$normalized_coefficients),
                               decreasing = TRUE))[1:top_features], collapse = ",")
  } else if (identical(algo, h2o.randomForest) || identical(algo, h2o.deeplearning)) {
    # Random Forest / Deep Learning: importances live in the first row of the
    # varimp table.
    varimp <- paste(names(sort(model@model$varimp[1, ],
                               decreasing = TRUE))[1:top_features], collapse = ",")
  } else if (identical(algo, h2o.gbm)) {
    # GBM: the varimp table is already sorted; take the top row names.
    varimp <- paste(rownames(model@model$varimp)[1:top_features], collapse = ",")
  } else {
    # Unknown algorithm: return NA instead of failing with an undefined object.
    varimp <- NA_character_
  }
  return(varimp)
}
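
# Sketch (not run): extracting the top-3 features as a comma-separated string.
# `model` is a hypothetical fit from h2o.randomForest; the algorithm function
# itself is passed so the matching importance slot is read.
if (FALSE) {
  h2o.get_varimp(h2o.randomForest, model, top_features = 3)
  # e.g. "f12,f3,f7"
}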

#' h2o.validate
#' @description Official H2O helper function (binaryClassificationHelper.R).
#' @export
h2o.validate <- function(t0, model, modeltype, validation, response, varimp) {
  # Wall-clock training time since t0, plus AUC on the validation frame.
  elapsed_seconds <- as.numeric(Sys.time()) - as.numeric(t0)
  auc <- h2o.get_auc(model, validation, response)
  # One-element list keyed by the model's H2O key:
  # (model, type, response, seconds, validation AUC, top features).
  result <- list(list(model, modeltype, response, elapsed_seconds, auc, varimp))
  names(result) <- model@key
  return(result)
}

#' h2o.fit
#' @description Official H2O helper function (binaryClassificationHelper.R).
#' @export
h2o.fit <- function(algo, data, args) {
  t0 <- Sys.time()
  # `data` is a named list bundling the modeling spec:
  # x (predictors), y (response), train, valid (H2O frames), nfolds.
  predictors <- data$x
  response <- data$y
  train <- data$train
  valid <- data$valid
  nfolds <- data$nfolds
  if (nfolds >= 0) {
    # Forward nfolds only when non-negative; a negative value skips it.
    model <- do.call(algo, modifyList(list(x = predictors, y = response,
                                           data = train, nfolds = nfolds), args))
  } else {
    model <- do.call(algo, modifyList(list(x = predictors, y = response,
                                           data = train), args))
  }
  if (.hasSlot(model, "sumtable")) {
    # Grid searches return a container with a summary table; keep the best
    # (first) model.
    model <- model@model[[1]]
  }
  return(h2o.validate(t0, model, as.character(substitute(algo)), valid,
                      response, h2o.varimp(algo, model)))
}
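
# Sketch (not run) of the `data` spec h2o.fit() expects. Frame names and the
# extra GBM arguments (legacy 2.x names `n.trees`, `interaction.depth`) are
# illustrative assumptions.
if (FALSE) {
  spec <- list(x = setdiff(colnames(train_hex), "label"),
               y = "label",
               train = train_hex,
               valid = valid_hex,
               nfolds = -1)  # negative: fit without cross-validation
  res_gbm <- h2o.fit(h2o.gbm, spec, list(n.trees = 50, interaction.depth = 5))
}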

#' h2o.selectModel
#' @description Official H2O helper function (binaryClassificationHelper.R).
#' @export
h2o.selectModel <- function(x) {
  # Flatten one h2o.validate() result into a leaderboard row.
  # x: [[1]] model, [[2]] type, [[3]] response, [[4]] seconds,
  #    [[5]] validation AUC, [[6]] top features.
  c(model_key = x[[1]]@key,
    model_type = x[[2]],
    train_auc = as.numeric(x[[1]]@model$auc),
    validation_auc = as.numeric(x[[5]]),
    important_feat = x[[6]],
    tuning_time_s = as.numeric(as.character(x[[4]])))
}

#' h2o.leaderBoard
#' @description Official H2O helper function (binaryClassificationHelper.R).
#' @export
h2o.leaderBoard <- function(models, test_hex, response, top_features = 5) {
  #### one row per model: flatten each h2o.validate() result
  model.list <- as.data.frame(t(as.data.frame(lapply(models, h2o.selectModel))))

  #### convert the `auc` and `tuning_time` columns into numerics
  model.list$train_auc      <- as.numeric(as.character(model.list$train_auc))
  model.list$validation_auc <- as.numeric(as.character(model.list$validation_auc))
  model.list$tuning_time_s  <- as.numeric(as.character(model.list$tuning_time_s))

  #### sort the models by validation AUC, best first, and drop the key column
  models.sort.by.auc <- model.list[order(model.list$validation_auc,
                                         decreasing = TRUE), -1]

  #### display the frame
  print(models.sort.by.auc)

  #### score the best model on the test data
  #### (h2oServer is the legacy H2O connection object from the calling session)
  best_model <- h2o.getModel(h2oServer, rownames(models.sort.by.auc)[1])
  test_auc <- h2o.get_auc(best_model, test_hex, response)

  cat(paste(" -------------------------------\n",
            "Best Model Performance On Final Testing Data:", "\n",
            "AUC = ", round(test_auc, 6), "\n",
            "--------------------------------\n"))

  cat(paste(" =---------Summary------------=\n",
            "Best model type: ", models.sort.by.auc[1, ]$model_type, "\n",
            "Best model AUC on test: ", round(test_auc, 6), "\n",
            "Top", top_features, "important features: ",
            models.sort.by.auc[1, ]$important_feat, "\n",
            "Model training time (incl. tuning, grid search): ",
            round(models.sort.by.auc[1, ]$tuning_time_s, 6), "seconds \n",
            "=----------------------------=\n"))
  best_model
}
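
# End-to-end sketch (not run): collect several h2o.fit() results and rank them.
# `spec`, `test_hex` and the legacy connection object `h2oServer` are assumed
# to exist in the calling session; argument names follow the H2O 2.x API.
if (FALSE) {
  models <- c(h2o.fit(h2o.gbm, spec, list(n.trees = 50)),
              h2o.fit(h2o.randomForest, spec, list(ntree = 100)))
  best <- h2o.leaderBoard(models, test_hex, response = "label")
}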