R/H2OHelpers.R

Defines functions H2OVariableImportance H2OValidationData H2OSaveModel H2ODataPrep FeatureStore H2OCleanData H2OArgsCheck

# AutoQuant is a package for quickly creating high quality visualizations under a common and easy api.
# Copyright (C) <year>  <name of author>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

#' @title H2OArgsCheck
#'
#' @description Ensure arguments are defined correctly
#'
#' @author Adrian
#' @family H2O Helpers
#'
#' @param ModelType "drf",
#' @param TargetType "classification",
#' @param model_path. Passthrough
#' @param metadata_path. Passthrough
#' @param eval_metric. Passthrough
#' @param MaxModelsInGrid. Passthrough
#' @param ModelID. Passthrough
#' @param NumOfParDepPlots. Passthrough
#' @param ReturnModelObjects. Passthrough
#' @param SaveModelObjects. Passthrough
#' @param GridTune. Passthrough
#' @param GridStrategy. Passthrough
#' @param CostMatrixWeights. Passthrough
#' @param IfSaveModel. Passthrough
#' @param Trees. Passthrough
#' @param MaxDepth. Passthrough
#' @param SampleRate. Passthrough
#' @param MTries. Passthrough
#' @param ColSampleRatePerTree. Passthrough
#' @param ColSampleRatePerTreeLevel. Passthrough
#' @param MinRows. Passthrough
#' @param NBins. Passthrough
#' @param NBinsCats. Passthrough
#' @param NBinsTopLevel. Passthrough
#' @param HistogramType. Passthrough
#' @param CategoricalEncoding. Passthrough
#'
#' @noRd
H2OArgsCheck <- function(ModelType = "drf",
                         TargetType = "classification",
                         model_path. = model_path,
                         metadata_path. = metadata_path,
                         eval_metric. = eval_metric,
                         MaxModelsInGrid. = MaxModelsInGrid,
                         ModelID. = ModelID,
                         NumOfParDepPlots. = NumOfParDepPlots,
                         ReturnModelObjects. = ReturnModelObjects,
                         SaveModelObjects. = SaveModelObjects,
                         GridTune. = GridTune,
                         GridStrategy. = GridStrategy,
                         CostMatrixWeights. = CostMatrixWeights,
                         IfSaveModel. = IfSaveModel,
                         Trees. = Trees,
                         MaxDepth. = MaxDepth,
                         SampleRate. = SampleRate,
                         MTries. = MTries,
                         ColSampleRatePerTree. = ColSampleRatePerTree,
                         ColSampleRatePerTreeLevel. = ColSampleRatePerTreeLevel,
                         MinRows. = MinRows,
                         NBins. = NBins,
                         NBinsCats. = NBinsCats,
                         NBinsTopLevel. = NBinsTopLevel,
                         HistogramType. = HistogramType,
                         CategoricalEncoding. = CategoricalEncoding) {

  if(tolower(ModelType) %chin% c("drf","gbm")) {
    if(!is.null(model_path.)) if(!dir.exists(file.path(model_path.))) dir.create(model_path.)
    if(!is.null(metadata_path.)) if(!is.null(metadata_path.)) if(!dir.exists(file.path(metadata_path.))) dir.create(metadata_path.)
    if(TargetType == "classification") {
      if(!(tolower(eval_metric.) %chin% c("auc", "logloss"))) stop("eval_metric not in AUC, logloss")
    } else if(TargetType == "regression") {
      if(!(toupper(eval_metric.) %chin% c("MSE", "RMSE", "MAE", "RMSLE"))) stop("eval_metric not in 'MSE', 'RMSE', 'MAE', 'RMSLE'")
    }
    if(MaxModelsInGrid. < 1 && GridTune.) stop("MaxModelsInGrid needs to be at least 1")
    if(!is.null(model_path.)) if(!is.character(model_path.)) stop("model_path. needs to be a character type")
    if(!is.null(metadata_path.)) if(!is.character(metadata_path.)) stop("metadata_path. needs to be a character type")
    if(!is.character(ModelID.) && !is.null(ModelID.)) stop("ModelID needs to be a character type")
    if(NumOfParDepPlots. < 0) stop("NumOfParDepPlots needs to be a positive number")
    if(!(ReturnModelObjects. %in% c(TRUE, FALSE))) stop("ReturnModelObjects needs to be TRUE or FALSE")
    if(!(SaveModelObjects. %in% c(TRUE, FALSE))) stop("SaveModelObjects needs to be TRUE or FALSE")
    if(!(tolower(eval_metric.) == "auc")) eval_metric. <- tolower(eval_metric.) else eval_metric. <- toupper(eval_metric.)
    if(tolower(eval_metric.) %chin% c("auc")) Decreasing <- TRUE else Decreasing <- FALSE
    if(GridTune. && !GridStrategy. %chin% c("Cartesian","RandomDiscrete")) stop("GridStrategy must be either 'Random' or 'Cartesian'")
    if(TargetType == "classification") if(length(CostMatrixWeights.) != 4) stop("CostMatrixWeights needs to be a 4 element numeric vector")
    if(SaveModelObjects. && !tolower(IfSaveModel.) %chin% c("standard", "mojo")) stop("IfSaveModel needs to be either 'standard' or 'mojo'")
    if(!GridTune. && length(Trees.) > 1L) stop("Trees needs to be a single value unless you're grid tuning")
    if(!GridTune. && length(MaxDepth.) > 1L) stop("MaxDepth needs to be a single value unless you're grid tuning")
    if(!GridTune. && length(SampleRate.) > 1L) stop("SampleRate needs to be a single value unless you're grid tuning")
    if(!GridTune. && tolower(ModelType) == "drf" && length(MTries.) > 1L) stop("MTries needs to be a single value unless you're grid tuning")
    if(!GridTune. && length(ColSampleRatePerTree.) > 1L) stop("ColSampleRatePerTree needs to be a single value unless you're grid tuning")
    if(!GridTune. && length(ColSampleRatePerTreeLevel.) > 1L) stop("ColSampleRatePerTreeLevel needs to be a single value unless you're grid tuning")
    if(!GridTune. && length(MinRows.) > 1L) stop("MinRows needs to be a single value unless you're grid tuning")
    if(!GridTune. && length(NBins.) > 1L) stop("NBins needs to be a single value unless you're grid tuning")
    if(!GridTune. && length(NBinsCats.) > 1L) stop("NBinsCats needs to be a single value unless you're grid tuning")
    if(!GridTune. && length(NBinsTopLevel.) > 1L) stop("NBinsTopLevel needs to be a single value unless you're grid tuning")
    if(!GridTune. && length(HistogramType.) > 1L) stop("HistogramType needs to be a single value unless you're grid tuning")
    if(!GridTune. && length(CategoricalEncoding.) > 1L) stop("CategoricalEncoding needs to be a single value unless you're grid tuning")
    return(Decreasing)
  }
}

#' @title H2OCleanData
#'
#' @author Adrian Antico
#' @family H2O Helpers
#'
#' @param dataTrain. data
#' @param dataTest. data
#' @param TestData. data
#' @param TrainOnFull. TrainOnFull
#'
#' @noRd
H2OCleanData <- function(dataTrain. = NULL,
                         dataTest. = NULL,
                         TestData. = NULL,
                         TrainOnFull. = TrainOnFull) {
  dataTrain. <- Rodeo::ModelDataPrep(data = dataTrain., Impute = FALSE, CharToFactor = TRUE, FactorToChar = FALSE, IntToNumeric = TRUE, LogicalToBinary = TRUE, DateToChar = TRUE, RemoveDates = FALSE, MissFactor = "0", MissNum = -1, IgnoreCols = NULL)
  dataTrain. <- Rodeo::ModelDataPrep(data = dataTrain., Impute = FALSE, CharToFactor = TRUE, FactorToChar = FALSE, IntToNumeric = FALSE, LogicalToBinary = FALSE, DateToChar = FALSE, RemoveDates = FALSE, MissFactor = "0", MissNum = -1, IgnoreCols = NULL)
  if(!TrainOnFull.) {
    dataTest. <- Rodeo::ModelDataPrep(data = dataTest., Impute = FALSE, CharToFactor = TRUE, FactorToChar = FALSE, IntToNumeric = TRUE, LogicalToBinary = TRUE, DateToChar = TRUE, RemoveDates = FALSE, MissFactor = "0", MissNum = -1, IgnoreCols = NULL)
    dataTest. <- Rodeo::ModelDataPrep(data = dataTest., Impute = FALSE, CharToFactor = TRUE, FactorToChar = FALSE, IntToNumeric = FALSE, LogicalToBinary = FALSE, DateToChar = FALSE, RemoveDates = FALSE, MissFactor = "0", MissNum = -1, IgnoreCols = NULL)
  }
  if(!is.null(TestData.)) {
    TestData. <- Rodeo::ModelDataPrep(data = TestData., Impute = FALSE, CharToFactor = TRUE, FactorToChar = FALSE, IntToNumeric = TRUE, LogicalToBinary = TRUE, DateToChar = TRUE, RemoveDates = FALSE, MissFactor = "0", MissNum = -1, IgnoreCols = NULL)
    TestData. <- Rodeo::ModelDataPrep(data = TestData., Impute = FALSE, CharToFactor = TRUE, FactorToChar = FALSE, IntToNumeric = FALSE, LogicalToBinary = FALSE, DateToChar = FALSE, RemoveDates = FALSE, MissFactor = "0", MissNum = -1, IgnoreCols = NULL)
  }
  return(list(dataTrain = dataTrain., dataTest = dataTest., TestData = TestData.))
}

#' @param dataTrain. Passthrough
#' @param FeatureColNames. Passthrough
#' @param SaveModelObjects. Passthrough
#' @param model_path. Passthrough
#' @param ModelID. Passthrough
#'
#' @noRd
FeatureStore <- function(dataTrain. = NULL,
                         FeatureColNames. = FeatureColNames,
                         SaveModelObjects. = SaveModelObjects,
                         model_path. = model_path,
                         ModelID. = ModelID) {
  if(is.numeric(FeatureColNames.)) {
    Names <- data.table::as.data.table(names(dataTrain.)[FeatureColNames.])
    data.table::setnames(Names, "V1", "ColNames")
  } else {
    Names <- data.table::as.data.table(FeatureColNames.)
    if(!"V1" %chin% names(Names)) {
      data.table::setnames(Names, "FeatureColNames.", "ColNames")
    } else {
      data.table::setnames(Names, "V1", "ColNames")
    }
  }
  if(SaveModelObjects. && length(model_path.) > 0L) data.table::fwrite(Names, file = file.path(model_path., paste0(ModelID., "_ColNames.csv")))
  return(Names)
}

#' @title H2ODataPrep
#'
#' @param TargetType. 'classifier', 'multiclass', 'regression'
#' @param TargetColumnName. Passthrough
#' @param data. Passthrough
#' @param ValidationData. Passthrough
#' @param TestData. Passthrough
#' @param TrainOnFull. Passthrough
#' @param FeatureColNames. Passthrough
#' @param SaveModelObjects. Passthrough
#' @param ModelID. Passthrough
#' @param model_path. Passthrough
#' @param TransformNumericColumns. Regression Passthrough
#' @param Methods. Regression Passthrough
#'
#' @noRd
H2ODataPrep <- function(TargetType. = "classifier",
                        TargetColumnName. = TargetColumnName,
                        data. = NULL,
                        ValidationData. = NULL,
                        TestData. = NULL,
                        TrainOnFull. = TrainOnFull,
                        FeatureColNames. = FeatureColNames,
                        SaveModelObjects. = SaveModelObjects,
                        model_path. = model_path,
                        ModelID. = ModelID,
                        TransformNumericColumns. = NULL,
                        Methods. = NULL) {

  if(TargetType. %chin% c("classifier","multiclass","regression")) {

    # Target Name Storage
    if(!is.character(TargetColumnName.)) TargetColumnName. <- names(data.)[TargetColumnName.]

    # Ensure data. is a data.table
    if(!data.table::is.data.table(data.)) data.table::setDT(data.)
    if(!is.null(ValidationData.)) if(!data.table::is.data.table(ValidationData.)) data.table::setDT(ValidationData.)
    if(!is.null(TestData.)) if(!data.table::is.data.table(TestData.)) data.table::setDT(TestData.)

    # Ensure Target is a factor
    if(TargetType. %chin% c("classifier","multiclass")) {
      if(!is.factor(data.[[eval(TargetColumnName.)]])) {
        data.[, eval(TargetColumnName.) := as.factor(get(TargetColumnName.))]
        if(!is.null(ValidationData.)) ValidationData.[, eval(TargetColumnName.) := as.factor(get(TargetColumnName.))]
        if(!is.null(TestData.)) TestData.[, eval(TargetColumnName.) := as.factor(get(TargetColumnName.))]
      }
    }

    # Target levels
    if(TargetType. == "multiclass") {
      if(!is.null(TestData.)) {
        TargetLevels <- unique(
          as.character(data.[[eval(TargetColumnName.)]]),
          as.character(ValidationData.[[eval(TargetColumnName.)]]),
          as.character(TestData.[[eval(TargetColumnName.)]]))
      } else {
        TargetLevels <- unique(as.character(data.[[eval(TargetColumnName.)]]))
      }
    } else {
      TargetLevels <- NULL
    }

    # Data Partition
    if(TargetType. != "regression") {
      if(is.null(ValidationData.) && is.null(TestData.) && !TrainOnFull.) {
        dataSets <- Rodeo::AutoDataPartition(
          data = data.,
          NumDataSets = 3L,
          Ratios = c(0.70, 0.20, 0.10),
          PartitionType = "random",
          StratifyColumnNames = NULL,
          TimeColumnName = NULL)
        dataTrain <- dataSets$TrainData
        dataTest <- dataSets$ValidationData
        TestData. <- dataSets$TestData
      }
    } else {

      # Convert TransformNumericColumns to Names if not character
      if(!is.null(TransformNumericColumns.) && !is.character(TransformNumericColumns.)) TransformNumericColumns. <- names(data)[TransformNumericColumns.]

      # Transform data, ValidationData, and TestData ----
      if(!is.null(ValidationData.) && !is.null(TransformNumericColumns.)) {
        Output <- Rodeo::AutoTransformationCreate(
          data = data.,
          ColumnNames = TransformNumericColumns.,
          Methods = Methods.,
          Path = model_path.,
          TransID = ModelID.,
          SaveOutput = SaveModelObjects.)
        data. <- Output$Data
        TransformationResults <- Output$FinalResults

        # Transform ValidationData
        ValidationData. <- Rodeo::AutoTransformationScore(
          ScoringData = ValidationData.,
          Type = "Apply",
          FinalResults = TransformationResults,
          TransID = NULL,
          Path = NULL)

        # Transform TestData
        if(!is.null(TestData.)) {
          TestData. <- Rodeo::AutoTransformationScore(
            ScoringData = TestData.,
            Type = "Apply",
            FinalResults = TransformationResults,
            TransID = NULL,
            Path = NULL)
        }
      }

      # Regression Data Partition
      if(is.null(ValidationData.) && is.null(TestData.) && !TrainOnFull.) {
        if(!is.null(TransformNumericColumns.)) {
          dataSets <- Rodeo::AutoDataPartition(
            data = data.,
            NumDataSets = 3L,
            Ratios = c(0.70, 0.20, 0.10),
            PartitionType = "random",
            StratifyColumnNames = NULL,
            TimeColumnName = NULL)
          dataTrain <- dataSets$TrainData
          dataTest <- dataSets$ValidationData
          TestData <- dataSets$TestData
          rm(dataSets)

          # Transform data sets
          Output <- Rodeo::AutoTransformationCreate(
            dataTrain,
            ColumnNames = TransformNumericColumns.,
            Methods = Methods.,
            Path = model_path.,
            TransID = ModelID.,
            SaveOutput = SaveModelObjects.)
          dataTrain <- Output$Data
          TransformationResults <- Output$FinalResults

          # Transform ValidationData
          dataTest <- Rodeo::AutoTransformationScore(
            ScoringData = dataTest,
            Type = "Apply",
            FinalResults = TransformationResults,
            TransID = NULL,
            Path = NULL)

          # Transform TestData
          if(!is.null(TestData.)) {
            TestData. <- Rodeo::AutoTransformationScore(
              ScoringData = TestData.,
              Type = "Apply",
              FinalResults = TransformationResults,
              TransID = NULL,
              Path = NULL)
          }
        } else {
          dataSets <- Rodeo::AutoDataPartition(
            data = data.,
            NumDataSets = 3,
            Ratios = c(0.70, 0.20, 0.10),
            PartitionType = "random",
            StratifyColumnNames = NULL,
            TimeColumnName = NULL)
          dataTrain <- dataSets$TrainData
          dataTest <- dataSets$ValidationData
          TestData <- dataSets$TestData
          rm(dataSets)
        }
      }

      # Get Min Value of TargetColumnName Data
      MinVal <- min(data.[[eval(TargetColumnName.)]], na.rm = TRUE)
    }

    # Extras
    if(!exists("TransformationResults")) TransformationResults <- NULL
    if(!exists("MinVal")) MinVal <- NULL

    # Create dataTrain if not exists
    if(!exists("dataTrain")) dataTrain <- data.
    if(!exists("dataTest") && !TrainOnFull.) {
      dataTest <- ValidationData.
    } else if(!exists("dataTest")) {
      dataTest <- NULL
    }

    # Data prep ----
    Output <- H2OCleanData(dataTrain. = dataTrain, dataTest. = dataTest, TestData. = TestData., TrainOnFull. = TrainOnFull.)
    dataTrain <- Output$dataTrain; Output$dataTrain <- NULL
    dataTest <- Output$dataTest; Output$dataTest <- NULL
    TestData <- Output$TestData; Output$TestData <- NULL

    # Save Names of data.
    Names <- FeatureStore(dataTrain.=dataTrain, FeatureColNames.=FeatureColNames., SaveModelObjects.=SaveModelObjects., model_path.=model_path., ModelID.=ModelID.)

    # Return
    return(list(
      dataTrain = dataTrain,
      dataTest = dataTest,
      TestData = TestData.,
      TargetColumnName = TargetColumnName.,
      Names = Names,
      TargetLevels = TargetLevels,
      TransformationResults = TransformationResults,
      MinVal = MinVal))
  }
}

#' @param SaveModelObjects. Passthrough
#' @param IfSaveModel. Passthrough
#' @param base_model. Passthrough
#' @param model_path. Passthrough
#' @param ModelID. Passthrough
#'
#' @noRd
H2OSaveModel <- function(SaveModelObjects. = SaveModelObjects,
                         IfSaveModel. = IfSaveModel,
                         base_model. = base_model,
                         model_path. = model_path,
                         ModelID. = ModelID) {
  if(SaveModelObjects.) {
    for(i in 1:10) print(IfSaveModel.)
    if(tolower(IfSaveModel.) == "mojo") {
      h2o::h2o.save_mojo(object = base_model., path = model_path., force = TRUE)
      h2o::h2o.download_mojo(model = base_model., path = model_path., get_genmodel_jar = TRUE, genmodel_path = model_path., genmodel_name = ModelID.)
    } else {
      h2o::h2o.saveModel(object = base_model., path = model_path., force = TRUE)
    }
  }
}

#' @param Predict. Passthrough
#' @param TestData. Passthrough
#' @param dataTest. Passthrough
#' @param dataTrain. Passthrough
#' @param TrainOnFull. Passthrough
#' @param SaveModelObjects. Passthrough
#' @param metadata_path. Passthrough
#' @param model_path. Passthrough
#' @param ModelID. Passthrough
#' @param TransformNumericColumns. Regression Passthrough
#' @param TransformationResults. Regression Passthrough
#' @param TargetColumnName. Regression Passthrough
#' @param data. Regression Passthrough
#'
#' @noRd
H2OValidationData <- function(Predict. = Predict,
                              TestData. = NULL,
                              dataTest. = NULL,
                              dataTrain. = NULL,
                              TrainOnFull. = TrainOnFull,
                              SaveModelObjects. = SaveModelObjects,
                              metadata_path. = metadata_path,
                              model_path. = model_path,
                              ModelID. = ModelID,
                              TransformNumericColumns. = NULL,
                              TransformationResults. = NULL,
                              TargetColumnName. = NULL,
                              data. = NULL) {

  # Create validation data
  ValidationData <- data.table::as.data.table(cbind(if(!is.null(TestData.)) TestData. else if(!TrainOnFull.) dataTest. else dataTrain., Predict.))
  data.table::setnames(ValidationData, "predict", "Predict")

  # Inverse Transform
  if(!is.null(TransformNumericColumns.)) {
    if(GridTune) TransformationResults. <- TransformationResults.[ColumnName != "Predict"]
    TransformationResults. <- data.table::rbindlist(list(
      TransformationResults.,
      data.table::data.table(
        ColumnName = "Predict",
        MethodName = rep(TransformationResults.[ColumnName == eval(TargetColumnName.), MethodName], 1L),
        Lambda = rep(TransformationResults.[ColumnName == eval(TargetColumnName.), Lambda], 1L),
        NormalizedStatistics = rep(0, 1L))))

    # If Actual target columnname == "Target" remove the duplicate version
    if(length(unique(TransformationResults.[["ColumnName"]])) != nrow(TransformationResults.)) {
      temp <- TransformationResults.[, .N, by = "ColumnName"][N != 1L][[1L]]
      temp1 <- which(names(ValidationData) == temp)[1L]
      data.table::set(ValidationData, j = eval(names(data.)[temp1]), value = NULL)
      TransformationResults. <- TransformationResults.[, ID := seq_len(.N)][ID != which(TransformationResults.[["ID"]] == temp1)][, ID := NULL]
    }

    # Transform Target and Predicted Value
    ValidationData <- Rodeo::AutoTransformationScore(
      ScoringData = ValidationData,
      Type = "Inverse",
      FinalResults = TransformationResults.,
      TransID = NULL,
      Path = NULL)
  }

  # Save validation data
  if(SaveModelObjects.) {
    for(aaa in 1:10) print(metadata_path.)
    if(!is.null(metadata_path.)) {
      data.table::fwrite(ValidationData, file = file.path(metadata_path., paste0(ModelID., "_ValidationData.csv")))
    }
  }

  # Return output
  return(list(
    ValidationData = ValidationData,
    TransformationResults = TransformationResults.))
}

#' @param TrainOnFull. Passthrough
#' @param base_model. Passthrough
#' @param SaveModelObjects. Passthrough
#' @param metadata_path Passthrough
#' @param model_path. Passthrough
#' @param ModelID. Passthrough
#'
#' @noRd
H2OVariableImportance <- function(TrainOnFull. = TrainOnFull,
                                  base_model. = base_model,
                                  SaveModelObjects. = SaveModelObjects,
                                  metadata_path. = metadata_path,
                                  model_path. = model_path,
                                  ModelID. = ModelID) {
  # Variable Importance
  if(!TrainOnFull.) {
    VariableImportance <- data.table::as.data.table(h2o::h2o.varimp(object = base_model.))
    data.table::setnames(VariableImportance, c("variable","relative_importance","scaled_importance","percentage"), c("Variable","RelativeImportance","ScaledImportance","Percentage"))
    VariableImportance[, ':=' (RelativeImportance = round(RelativeImportance, 4L), ScaledImportance = round(ScaledImportance, 4L), Percentage = round(Percentage, 4L))]
    if(SaveModelObjects.) {
      if(!is.null(metadata_path.)) {
        data.table::fwrite(VariableImportance, file = file.path(metadata_path., paste0(ModelID., "_VariableImportance.csv")))
      }
    }
  } else {
    VariableImportance <- NULL
  }

  # return
  return(VariableImportance)
}
AdrianAntico/RemixAutoML documentation built on Feb. 3, 2024, 3:32 a.m.