# AutoQuant is a package for quickly creating high quality visualizations under a common and easy api.
# Copyright (C) <year>  <name of author>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

#' @title XGBoostArgsCheck
#'
#' @author Adrian Antico
#' @family XGBoost Helpers
#'
#' @param GridTune. Passthrough
#' @param model_path. Passthrough
#' @param metadata_path. Passthrough
#' @param Trees. Passthrough
#' @param max_depth. Passthrough
#' @param eta. Passthrough
#' @param min_child_weight. Passthrough
#' @param subsample. Passthrough
#' @param colsample_bytree. Passthrough
#' @return No return value; called for its side effects: creates model_path. / metadata_path. if missing and stops with an informative error when multiple hyperparameter values are supplied without grid tuning
#' @noRd
XGBoostArgsCheck <- function(GridTune.=GridTune,
                             model_path.=model_path,
                             metadata_path.=metadata_path,
                             Trees.=NTrees,
                             max_depth.=max_depth,
                             eta.=eta,
                             min_child_weight.=min_child_weight,
                             subsample.=subsample,
                             colsample_bytree.=colsample_bytree) {

  # Ensure model_path. and metadata_path. exist
  if(!is.null(model_path.) && !dir.exists(file.path(model_path.))) dir.create(model_path.)
  if(!is.null(metadata_path.) && !dir.exists(file.path(metadata_path.))) dir.create(metadata_path.)

  # Ensure only one value if not grid tuning
  if(!GridTune. && length(Trees.) > 1) stop('Trees cannot have more than one value supplied')
  if(!GridTune. && length(max_depth.) > 1) stop('Depth cannot have more than one value supplied')
  if(!GridTune. && length(eta.) > 1) stop('LearningRate cannot have more than one value supplied')
  if(!GridTune. && length(min_child_weight.) > 1) stop('min_child_weight cannot have more than one value supplied')
  if(!GridTune. && length(subsample.) > 1) stop('subsample cannot have more than one value supplied')
  if(!GridTune. && length(colsample_bytree.) > 1) stop('colsample_bytree cannot have more than one value supplied')
}
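
# Usage sketch (not run; a minimal illustration with made-up values). With GridTune. = FALSE,
# any hyperparameter supplied as a vector triggers a stop(); otherwise the function only
# creates the model_path. / metadata_path. directories if they are missing.
if(FALSE) {
  XGBoostArgsCheck(
    GridTune. = FALSE,
    model_path. = file.path(getwd(), 'Models'),
    metadata_path. = NULL,
    Trees. = 1000L,
    max_depth. = 6L,
    eta. = 0.3,
    min_child_weight. = 1,
    subsample. = 1,
    colsample_bytree. = 1)
}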

#' @title XGBoostDataPrep
#'
#' @description Prepare data for XGBoost modeling
#'
#' @family XGBoost Helpers
#' @author Adrian Antico
#'
#' @param Algo 'xgboost', 'lightgbm'
#' @param OutputSelection. Passthrough
#' @param ModelType 'regression', 'classification', or 'multiclass'
#' @param data. Passthrough
#' @param ValidationData. Passthrough
#' @param TestData. Passthrough
#' @param TargetColumnName. Passthrough
#' @param FeatureColNames. Passthrough
#' @param WeightsColumnName. Passthrough
#' @param IDcols. Passthrough
#' @param TransformNumericColumns. Passthrough. Regression only
#' @param Methods. Passthrough. Regression only
#' @param ModelID. Passthrough
#' @param model_path. Passthrough
#' @param TrainOnFull. Passthrough
#' @param SaveModelObjects. Passthrough
#' @param ReturnFactorLevels. Passthrough
#' @param EncodingMethod. Passthrough
#' @param DebugMode. Passthrough
#'
#' @noRd
XGBoostDataPrep <- function(Algo = 'xgboost',
                            OutputSelection. = NULL,
                            ModelType = 'regression',
                            data. = data,
                            ValidationData. = ValidationData,
                            TestData. = TestData,
                            TargetColumnName. = TargetColumnName,
                            FeatureColNames. = FeatureColNames,
                            WeightsColumnName. = NULL,
                            IDcols. = NULL,
                            TransformNumericColumns. = TransformNumericColumns,
                            Methods. = Methods,
                            ModelID. = ModelID,
                            model_path. = model_path,
                            TrainOnFull. = TrainOnFull,
                            SaveModelObjects. = SaveModelObjects,
                            ReturnFactorLevels.=ReturnFactorLevels,
                            EncodingMethod. = EncodingMethod,
                            DebugMode. = FALSE) {

  # Ensure data. is a data.table ----
  if(DebugMode.) print("Ensure data. is a data.table")
  if(!data.table::is.data.table(data.)) data.table::setDT(data.)
  if(!is.null(ValidationData.) && !data.table::is.data.table(ValidationData.)) data.table::setDT(ValidationData.)
  if(!is.null(TestData.) && !data.table::is.data.table(TestData.)) data.table::setDT(TestData.)

  # Target Name Storage ----
  if(DebugMode.) print("Target Name Storage")
  if(!is.character(TargetColumnName.)) TargetColumnName. <- names(data.)[TargetColumnName.]
  if(!is.character(FeatureColNames.)) FeatureColNames. <- names(data.)[FeatureColNames.]
  if(!is.null(IDcols.) && !is.character(IDcols.)) IDcols. <- names(data.)[IDcols.]

  # Identify column numbers for factor variables ----
  if(DebugMode.) print("Identify column numbers for factor variables")
  CatFeatures <- sort(c(as.numeric(which(sapply(data., is.factor))), as.numeric(which(sapply(data., is.character)))))
  if(length(CatFeatures) > 0L) {
    CatFeatures <- names(data.)[CatFeatures]
    CatFeatures <- CatFeatures[CatFeatures %in% FeatureColNames.]
    if(!is.null(IDcols.)) CatFeatures <- CatFeatures[!CatFeatures %chin% IDcols.]
  } else {
    CatFeatures <- NULL
  }

  # Store WeightsVector ----
  if(DebugMode.) print("Store WeightsVector")
  if(!is.null(WeightsColumnName.)) {
    if(Algo == 'xgboost') {
      WeightsVector <- data.[[eval(WeightsColumnName.)]]
      if(WeightsColumnName. %in% names(data.)) data.table::set(data., j = WeightsColumnName., value = NULL)
      if(!is.null(ValidationData.) && WeightsColumnName. %chin% names(ValidationData.)) data.table::set(ValidationData., j = WeightsColumnName., value = NULL)
      if(!is.null(TestData.) && WeightsColumnName. %chin% names(TestData.)) data.table::set(TestData., j = WeightsColumnName., value = NULL)
      FeatureColNames. <- FeatureColNames.[!FeatureColNames. %chin% WeightsColumnName.]
    } else {
      if(!WeightsColumnName. %chin% FeatureColNames.) FeatureColNames. <- c(FeatureColNames., WeightsColumnName.)
    }
  }

  # Target var management ----
  if(DebugMode.) print("Target var management")
  if(ModelType == 'multiclass' && !is.null(CatFeatures)) CatFeatures <- setdiff(CatFeatures, TargetColumnName.)

  # Classification ----
  if(ModelType == 'classification') {

    # Debug
    if(DebugMode.) print("Classification")

    # Data Partition
    if(DebugMode.) print("Data Partition")
    if(is.null(ValidationData.) && is.null(TestData.) && !TrainOnFull.) {
      dataSets <- Rodeo::AutoDataPartition(
        data = data.,
        NumDataSets = 3L,
        Ratios = c(0.80, 0.10, 0.10),
        PartitionType = 'random',
        StratifyColumnNames = TargetColumnName.,
        TimeColumnName = NULL)
      data. <- dataSets$TrainData
      ValidationData. <- dataSets$ValidationData
      TestData. <- dataSets$TestData
    }

    # Dummify dataTrain Categorical Features ----
    if(DebugMode.) print("Dummify dataTrain Categorical Features")
    if(length(CatFeatures) > 0L) {
      x <- names(data.table::copy(data.))
      if(DebugMode.) print(EncodingMethod.)
      Output <- Rodeo::EncodeCharacterVariables(RunMode='train', ModelType=ModelType, TrainData=data., ValidationData=ValidationData., TestData=TestData., TargetVariableName=TargetColumnName., CategoricalVariableNames=CatFeatures, EncodeMethod=EncodingMethod., KeepCategoricalVariables=TRUE, ReturnMetaData=TRUE, MetaDataPath=model_path., MetaDataList=NULL, ImputeMissingValue=0)
      data. <- Output$TrainData; Output$TrainData <- NULL
      ValidationData. <- Output$ValidationData; Output$ValidationData <- NULL
      TestData. <- Output$TestData; Output$TestData <- NULL
      FactorLevelsList <- Output$MetaData; rm(Output)
      y <- setdiff(names(data.), x)
      if(length(y) == 0L) y <- NULL
      FeatureColNames. <- FeatureColNames.[!FeatureColNames. %in% CatFeatures]
      FeatureColNames. <- c(FeatureColNames., y)
    } else {
      FactorLevelsList <- NULL
    }

    # Classification data. Subset Columns Needed
    if(DebugMode.) print("Classification data. Subset Columns Needed")
    if(!is.null(ValidationData.)) {
      if(ncol(data.) == ncol(ValidationData.)) {
        if(length(CatFeatures) != 0L) {
          TrainMerge <- data.table::rbindlist(list(data.[, .SD, .SDcols = c(names(data.)[!names(data.) %in% y])],ValidationData.[, .SD, .SDcols = c(names(ValidationData.)[!names(ValidationData.) %in% y])]), use.names = TRUE, fill = TRUE)
        } else {
          TrainMerge <- data.table::rbindlist(list(data.,ValidationData.), use.names = TRUE, fill = TRUE)
        }
      } else {
        TrainMerge <- NULL
      }
      dataTrain <- data.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      dataTest <- ValidationData.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      if(!is.null(TestData.)) {
        TestMerge <- data.table::copy(TestData.)
        TestData. <- TestData.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      }
    } else {
      TrainMerge <- data.table::copy(data.)
      dataTrain <- data.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      dataTest <- NULL
      TestData. <- NULL
      TestMerge <- NULL
    }

    # Save Names of data
    if(DebugMode.) print("Save Names of data")
    Names <- data.table::as.data.table(names(dataTrain))
    if(!'V1' %chin% names(Names)) {
      data.table::setnames(Names, 'FeatureColNames.', 'ColNames')
    } else {
      data.table::setnames(Names, 'V1', 'ColNames')
    }

    # Subset TargetColumnName. Variables ----
    if(DebugMode.) print("Subset TargetColumnName. Variables")
    TrainTarget <- dataTrain[, get(TargetColumnName.)]
    if(!TrainOnFull.) TestTarget <- dataTest[, get(TargetColumnName.)]
    if(!is.null(TestData.)) FinalTestTarget <- TestData.[, get(TargetColumnName.)]

    # Remove TargetColumnName. Variable from Feature Data
    if(DebugMode.) print("Remove TargetColumnName. Variable from Feature Data")
    data.table::set(dataTrain, j = TargetColumnName., value = NULL)
    if(!TrainOnFull.) data.table::set(dataTest, j = TargetColumnName., value = NULL)
    if(!is.null(TestData.)) data.table::set(TestData., j = TargetColumnName., value = NULL)

    # Save feature names
    if(DebugMode.) print("Save feature names")
    Names <- Names[ColNames != eval(TargetColumnName.)]
    if(SaveModelObjects.) data.table::fwrite(Names, file = file.path(model_path., paste0(ModelID., '_ColNames.csv')))
  }

  # Regression
  if(ModelType == 'regression') {

    # Debug
    if(DebugMode.) print("Regression")

    # Transform data., ValidationData., and TestData.
    if(DebugMode.) print("Transform data., ValidationData., and TestData.")
    if((TrainOnFull. || !is.null(ValidationData.)) && !is.null(TransformNumericColumns.)) {
      MeanTrainTarget <- data.[, mean(get(TargetColumnName.))]
      Output <- Rodeo::AutoTransformationCreate(
        data.,
        ColumnNames = TransformNumericColumns.,
        Methods = Methods.,
        Path = model_path.,
        TransID = ModelID.,
        SaveOutput = SaveModelObjects.)
      data. <- Output$Data
      TransformationResults <- Output$FinalResults

      # Transform ValidationData.
      if(DebugMode.) print("Transform ValidationData.")
      if(!is.null(ValidationData.)) {
        ValidationData. <- Rodeo::AutoTransformationScore(
          ScoringData = ValidationData.,
          Type = 'Apply',
          FinalResults = TransformationResults,
          TransID = NULL,
          Path = NULL)
      }

      # Transform TestData.
      if(DebugMode.) print("Transform TestData.")
      if(!is.null(TestData.)) {
        TestData. <- Rodeo::AutoTransformationScore(
          ScoringData = TestData.,
          Type = 'Apply',
          FinalResults = TransformationResults,
          TransID = NULL,
          Path = NULL)
      }
    }

    # Regression Data Partition ----
    if(DebugMode.) print("Regression Data Partition")
    if(is.null(ValidationData.) && is.null(TestData.) && !TrainOnFull.) {
      if(!is.null(TransformNumericColumns.)) {
        dataSets <- Rodeo::AutoDataPartition(
          data = data.,
          NumDataSets = 3L,
          Ratios = c(0.80, 0.10, 0.10),
          PartitionType = 'random',
          StratifyColumnNames = NULL,
          TimeColumnName = NULL)
        data. <- dataSets$TrainData
        ValidationData. <- dataSets$ValidationData
        TestData. <- dataSets$TestData

        # Mean of data.
        if(DebugMode.) print("Mean of data.")
        MeanTrainTarget <- data.[, mean(get(TargetColumnName.))]

        # Transform data. sets
        if(DebugMode.) print("Transform data. sets")
        Output <- Rodeo::AutoTransformationCreate(
          data = data.,
          ColumnNames = TransformNumericColumns.,
          Methods = Methods.,
          Path = model_path.,
          TransID = ModelID.,
          SaveOutput = SaveModelObjects.)
        data. <- Output$Data
        TransformationResults <- Output$FinalResults

        # Transform ValidationData.
        if(DebugMode.) print("Transform ValidationData.")
        ValidationData. <- Rodeo::AutoTransformationScore(
          ScoringData = ValidationData.,
          Type = 'Apply',
          FinalResults = TransformationResults,
          TransID = NULL,
          Path = NULL)

        # Transform TestData.
        if(DebugMode.) print("Transform TestData.")
        if(!is.null(TestData.)) {
          TestData. <- Rodeo::AutoTransformationScore(
            ScoringData = TestData.,
            Type = 'Apply',
            FinalResults = TransformationResults,
            TransID = NULL,
            Path = NULL)
        }
      } else {

        # Partition Data
        if(DebugMode.) print("Partition Data")
        dataSets <- Rodeo::AutoDataPartition(
          data.,
          NumDataSets = 3L,
          Ratios = c(0.80, 0.10, 0.10),
          PartitionType = 'random',
          StratifyColumnNames = NULL,
          TimeColumnName = NULL)
        data. <- dataSets$TrainData
        ValidationData. <- dataSets$ValidationData
        TestData. <- dataSets$TestData
        MeanTrainTarget <- data.[, mean(get(TargetColumnName.))]
      }
    }

    # Dummify dataTrain Categorical Features ----
    if(DebugMode.) print("Dummify dataTrain Categorical Features")
    if(length(CatFeatures) != 0L) {
      x <- names(data.table::copy(data.))
      if(DebugMode.) print(EncodingMethod.)
      Output <- Rodeo::EncodeCharacterVariables(RunMode='train', ModelType=ModelType, TrainData=data., ValidationData=ValidationData., TestData=TestData., TargetVariableName=TargetColumnName., CategoricalVariableNames=CatFeatures, EncodeMethod=EncodingMethod., KeepCategoricalVariables=TRUE, ReturnMetaData=TRUE, MetaDataPath=model_path., MetaDataList=NULL, ImputeMissingValue=0, Debug = DebugMode.)
      data. <- Output$TrainData; Output$TrainData <- NULL
      ValidationData. <- Output$ValidationData; Output$ValidationData <- NULL
      TestData. <- Output$TestData; Output$TestData <- NULL
      FactorLevelsList <- Output$MetaData; rm(Output)

      y <- setdiff(names(data.), x)
      y <- y[!y %like% 'Predictions_']
      # y <- y[!y %like% 'GroupVar_']
      if(length(y) == 0L) y <- NULL
      FeatureColNames. <- FeatureColNames.[!FeatureColNames. %in% CatFeatures]
      FeatureColNames. <- c(FeatureColNames., y)
    }

    # Regression data. Subset Columns Needed
    if(DebugMode.) print("Regression data. Subset Columns Needed")
    if(!is.null(ValidationData.)) {
      if(ncol(data.) == ncol(ValidationData.)) {
        if(length(CatFeatures) != 0L) {
          TrainMerge <- data.table::rbindlist(list(
            data.[, .SD, .SDcols = c(names(data.)[!names(data.) %in% y])],
            ValidationData.[, .SD, .SDcols = c(names(ValidationData.)[!names(ValidationData.) %in% y])]), use.names = TRUE, fill = TRUE)
        } else {
          TrainMerge <- data.table::rbindlist(list(data.,ValidationData.), use.names = TRUE, fill = TRUE)
        }
      } else {
        TrainMerge <- NULL
      }
      dataTrain <- data.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      dataTest <- ValidationData.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      if(!is.null(TestData.)) {
        TestMerge <- data.table::copy(TestData.)
        TestData. <- TestData.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      }
    } else {
      TrainMerge <- data.table::copy(data.)
      dataTrain <- data.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      dataTest <- NULL
      TestData. <- NULL
      TestMerge <- NULL
    }

    # Save Names of data
    if(DebugMode.) print("Save Names of data")
    Names <- data.table::as.data.table(names(dataTrain))
    if(!'V1' %chin% names(Names)) {
      data.table::setnames(Names, 'FeatureColNames.', 'ColNames')
    } else {
      data.table::setnames(Names, 'V1', 'ColNames')
    }

    # Regression Subset Target Variables
    if(DebugMode.) print("Regression Subset Target Variables")
    TrainTarget <- dataTrain[, get(TargetColumnName.)]
    if(!is.null(dataTest)) {
      TestTarget <- dataTest[, get(TargetColumnName.)]
      if(!is.null(TestData.)) FinalTestTarget <- TestData.[, get(TargetColumnName.)]
    }

    # Regression Remove Target Variable from Feature Data
    if(DebugMode.) print("Regression Remove Target Variable from Feature Data")
    data.table::set(dataTrain, j = TargetColumnName., value = NULL)
    if(!is.null(dataTest)) data.table::set(dataTest, j = TargetColumnName., value = NULL)
    if(!is.null(TestData.)) data.table::set(TestData., j = TargetColumnName., value = NULL)

    # Save feature names
    if(DebugMode.) print("Save feature names")
    if(Algo == "xgboost") {
      Names <- Names[!ColNames %chin% c(eval(TargetColumnName.), eval(IDcols.), eval(WeightsColumnName.))]
    } else if(Algo == "lightgbm") {
      Names <- Names[!ColNames %chin% c(eval(TargetColumnName.), eval(IDcols.), eval(WeightsColumnName.))]
    }
    if(SaveModelObjects.) data.table::fwrite(Names, file = file.path(model_path., paste0(ModelID., '_ColNames.csv')))
  }

  # MultiClass
  if(ModelType == 'multiclass') {

    # MultiClass Data Partition
    if(DebugMode.) print("MultiClass Data Partition")
    if(is.null(ValidationData.) && is.null(TestData.) && !TrainOnFull.) {
      dataSets <- Rodeo::AutoDataPartition(
        data = data.,
        NumDataSets = 3L,
        Ratios = c(0.80, 0.10, 0.10),
        PartitionType = 'random',
        StratifyColumnNames = TargetColumnName.,
        TimeColumnName = NULL)
      data. <- dataSets$TrainData
      ValidationData. <- dataSets$ValidationData
      TestData. <- dataSets$TestData
    }

    # Dummify dataTrain Categorical Features ----
    if(DebugMode.) print("Dummify dataTrain Categorical Features")
    if(length(CatFeatures) > 0L) {
      x <- names(data.table::copy(data.))
      if(DebugMode.) print(EncodingMethod.)
      Output <- Rodeo::EncodeCharacterVariables(RunMode='train', ModelType=ModelType, TrainData=data., ValidationData=ValidationData., TestData=TestData., TargetVariableName=TargetColumnName., CategoricalVariableNames=CatFeatures, EncodeMethod=EncodingMethod., KeepCategoricalVariables=TRUE, ReturnMetaData=TRUE, MetaDataPath=model_path., MetaDataList=NULL, ImputeMissingValue=0)
      data. <- Output$TrainData; Output$TrainData <- NULL
      ValidationData. <- Output$ValidationData; Output$ValidationData <- NULL
      TestData. <- Output$TestData; Output$TestData <- NULL
      FactorLevelsList <- Output$MetaData; rm(Output)
      y <- setdiff(names(data.), x)
      if(length(y) == 0L) y <- NULL
      FeatureColNames. <- FeatureColNames.[!FeatureColNames. %in% CatFeatures]
      FeatureColNames. <- c(FeatureColNames., y)
    } else {
      FactorLevelsList <- NULL
    }

    # MultiClass Obtain Unique Target Levels
    if(DebugMode.) print("MultiClass Obtain Unique Target Levels")
    if(!is.null(ValidationData.) && !is.null(TestData.)) {
      temp <- data.table::rbindlist(list(data., ValidationData., TestData.), use.names = TRUE, fill = TRUE)
    } else if(!is.null(ValidationData.)) {
      temp <- data.table::rbindlist(list(data., ValidationData.), use.names = TRUE, fill = TRUE)
    } else {
      temp <- data.
    }
    TargetLevels <- data.table::as.data.table(sort(unique(temp[[eval(TargetColumnName.)]])))
    data.table::setnames(TargetLevels, 'V1', 'OriginalLevels')
    TargetLevels[, NewLevels := 0L:(.N - 1L)]
    if(SaveModelObjects.) data.table::fwrite(TargetLevels, file = file.path(model_path., paste0(ModelID., '_TargetLevels.csv')))

    # Number of levels
    if(DebugMode.) print("Number of levels")
    NumLevels <- TargetLevels[, .N]

    # MultiClass Convert Target to Numeric Factor
    if(DebugMode.) print("MultiClass Convert Target to Numeric Factor")
    data. <- merge(data., TargetLevels, by.x = eval(TargetColumnName.), by.y = 'OriginalLevels', all = FALSE)
    data.[, paste0(TargetColumnName.) := NewLevels]
    data.[, NewLevels := NULL]
    # Merging causes data to sort differently
    if(!is.null(ValidationData.)) {
      ValidationData. <- merge(ValidationData., TargetLevels, by.x = eval(TargetColumnName.), by.y = 'OriginalLevels', all = FALSE)
      ValidationData.[, paste0(TargetColumnName.) := NewLevels]
      ValidationData.[, NewLevels := NULL]
      if(!is.null(TestData.)) {
        TestData. <- merge(TestData., TargetLevels, by.x = eval(TargetColumnName.), by.y = 'OriginalLevels', all = FALSE)
        TestData.[, paste0(TargetColumnName.) := NewLevels]
        TestData.[, NewLevels := NULL]
      }
    }

    # Combine data and Subset Columns Needed
    if(DebugMode.) print("MultiClass data. Subset Columns Needed")
    if(!is.null(ValidationData.)) {
      if(ncol(data.) == ncol(ValidationData.)) {
        if(length(CatFeatures) != 0L) {
          TrainMerge <- data.table::rbindlist(list(data.[, .SD, .SDcols = c(names(data.)[!names(data.) %in% y])],ValidationData.[, .SD, .SDcols = c(names(ValidationData.)[!names(ValidationData.) %in% y])]), use.names = TRUE, fill = TRUE)
        } else {
          TrainMerge <- data.table::rbindlist(list(data.,ValidationData.), use.names = TRUE, fill = TRUE)
        }
      } else {
        TrainMerge <- NULL
      }
      dataTrain <- data.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      dataTest <- ValidationData.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      if(!is.null(TestData.)) {
        TestMerge <- data.table::copy(TestData.)
        TestData. <- TestData.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      }
    } else {
      TrainMerge <- data.table::copy(data.)
      dataTrain <- data.[, .SD, .SDcols = c(FeatureColNames., TargetColumnName.)]
      dataTest <- NULL
      TestData. <- NULL
      TestMerge <- NULL
    }

    # Save Names of data
    if(DebugMode.) print("Save Names of data")
    Names <- data.table::as.data.table(names(dataTrain))
    if(!'V1' %chin% names(Names)) {
      data.table::setnames(Names, 'FeatureColNames.', 'ColNames')
    } else {
      data.table::setnames(Names, 'V1', 'ColNames')
    }

    # MultiClass Subset Target Variables ----
    if(DebugMode.) print("MultiClass Subset Target Variables")
    TrainTarget <- dataTrain[, get(TargetColumnName.)]
    if(!is.null(dataTest)) TestTarget <- dataTest[, get(TargetColumnName.)]
    if(!is.null(TestData.)) FinalTestTarget <- TestData.[, get(TargetColumnName.)]

    # MultiClass Remove Target Variable from Feature Data
    if(DebugMode.) print("MultiClass Remove Target Variable from Feature Data")
    dataTrain[, eval(TargetColumnName.) := NULL]
    if(!is.null(dataTest)) dataTest[, eval(TargetColumnName.) := NULL]
    if(!is.null(TestData.)) TestData.[, eval(TargetColumnName.) := NULL]

    # Save feature names
    if(DebugMode.) print("Save feature names")
    Names <- Names[ColNames != eval(TargetColumnName.)]
    if(SaveModelObjects.) data.table::fwrite(Names, file = file.path(model_path., paste0(ModelID., '_ColNames.csv')))
  }

  # Convert data to model object data
  if(DebugMode.) print("Convert data to model object data")
  if('GroupVar' %chin% names(dataTrain)) data.table::set(dataTrain, j = 'GroupVar', value = NULL)
  if('Weights' %chin% names(dataTrain)) data.table::set(dataTrain, j = 'Weights', value = NULL)
  if(tolower(Algo) == 'xgboost') {
    datatrain <- xgboost::xgb.DMatrix(as.matrix(dataTrain), label = TrainTarget)
  } else if(tolower(Algo) == 'lightgbm') {
    datatrain <- lightgbm::lgb.Dataset(data=as.matrix(dataTrain), label=TrainTarget)
  }
  if(!TrainOnFull.) {
    if(DebugMode.) print("Convert data to model object dataTest")
    if('GroupVar' %chin% names(dataTest)) data.table::set(dataTest, j = 'GroupVar', value = NULL)
    if('Weights' %chin% names(dataTest)) data.table::set(dataTest, j = 'Weights', value = NULL)
    if(tolower(Algo) == 'xgboost') {
      datavalidate <- xgboost::xgb.DMatrix(as.matrix(dataTest), label = TestTarget)
    } else if(tolower(Algo) == 'lightgbm') {
      datavalidate <- lightgbm::lgb.Dataset(data=as.matrix(dataTest), label=TestTarget)
    }
    if(!is.null(TestData.)) {
      if(DebugMode.) print("Convert data to model object TestData.")
      if('GroupVar' %chin% names(TestData.)) data.table::set(TestData., j = 'GroupVar', value = NULL)
      if('Weights' %chin% names(TestData.)) data.table::set(TestData., j = 'Weights', value = NULL)
      if(tolower(Algo) == 'xgboost') {
        datatest <- xgboost::xgb.DMatrix(as.matrix(TestData.), label = FinalTestTarget)
        EvalSets <- list(train = datavalidate, test = datatest)
      } else if(tolower(Algo) == 'lightgbm') {
        datatest <- lightgbm::lgb.Dataset(data=as.matrix(TestData.), label=FinalTestTarget)
        EvalSets <- list(ValidationData = datavalidate, TestData = datatest)
      }
    } else {
      if(tolower(Algo) == 'xgboost') {
        EvalSets <- list(train = datatrain, test = datavalidate)
      } else if(tolower(Algo) == 'lightgbm') {
        EvalSets <- list(ValidationData = datavalidate)
      }
    }
  } else {
    EvalSets <- list(train = datatrain)
  }

  # Return objects
  if(DebugMode.) print("Return objects")
  return(list(
    WeightsVector = if(exists('WeightsVector')) WeightsVector else NULL,
    datatrain = datatrain,
    datavalidate = if(exists('datavalidate')) datavalidate else NULL,
    datatest = if(exists('datatest')) datatest else NULL,
    EvalSets = EvalSets,
    dataTrain = dataTrain,
    dataTest = if(exists('dataTest')) dataTest else NULL,
    TrainMerge = if(exists('TrainMerge')) TrainMerge else NULL,
    TestMerge = if(exists('TestMerge')) TestMerge else NULL,
    TestData = if(exists('TestData.')) TestData. else NULL,
    TrainTarget = TrainTarget,
    TestTarget = if(exists('TestTarget')) TestTarget else NULL,
    FinalTestTarget = if(exists('FinalTestTarget')) FinalTestTarget else NULL,
    TargetLevels = if(exists('TargetLevels')) TargetLevels else NULL,
    Names = Names,
    FactorLevelsList = if(exists('FactorLevelsList')) FactorLevelsList else NULL,
    IDcols = unique(IDcols.),
    TransformNumericColumns = TransformNumericColumns.,
    TransformationResults = if(exists('TransformationResults')) TransformationResults else NULL,
    NumLevels = if(exists('NumLevels')) NumLevels else NULL))
}
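
# Usage sketch (not run; argument values are purely illustrative, including the
# EncodingMethod. choice). Shows how the prepped objects are typically consumed: the
# returned datatrain / datavalidate are xgb.DMatrix objects and EvalSets is the watchlist
# handed to xgboost::xgb.train() by the calling training functions.
if(FALSE) {
  Prep <- XGBoostDataPrep(
    Algo = 'xgboost',
    ModelType = 'regression',
    data. = MyData,                                   # a data.table with a 'Target' column
    ValidationData. = NULL,
    TestData. = NULL,
    TargetColumnName. = 'Target',
    FeatureColNames. = setdiff(names(MyData), 'Target'),
    IDcols. = NULL,
    TransformNumericColumns. = NULL,
    Methods. = NULL,
    ModelID. = 'Test',
    model_path. = NULL,
    TrainOnFull. = FALSE,
    SaveModelObjects. = FALSE,
    ReturnFactorLevels. = TRUE,
    EncodingMethod. = 'credibility',
    DebugMode. = FALSE)
  str(Prep$EvalSets)
}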

#' @title XGBoostFinalParams
#'
#' @description Parameters for xgboost fitting
#'
#' @author Adrian Antico
#' @family XGBoost Helpers
#'
#' @param GridTune. Passthrough
#' @param TrainOnFull. Passthrough
#' @param LossFunction. Passthrough
#' @param eval_metric. Passthrough
#' @param NThreads. Passthrough
#' @param TreeMethod. Passthrough
#' @param PassInGrid. Passthrough
#' @param BestGrid. Passthrough
#' @param Trees. Passthrough
#' @param Alpha. Passthrough
#' @param Lambda. Passthrough
#' @param NumLevels. Passthrough
#'
#' @noRd
XGBoostFinalParams <- function(GridTune.=GridTune,
                               PassInGrid.=PassInGrid,
                               TrainOnFull.=TrainOnFull,
                               LossFunction.=LossFunction,
                               eval_metric.=eval_metric,
                               NThreads.=NThreads,
                               TreeMethod.=TreeMethod,
                               BestGrid.=BestGrid,
                               Trees.=Trees,
                               Alpha. = alpha,
                               Lambda. = lambda,
                               NumLevels. = NumLevels) {

  # Parameter list
  base_params <- list()
  base_params$alpha <- Alpha.
  base_params$lambda <- Lambda.
  base_params$booster <- 'gbtree'
  base_params$objective <- LossFunction.
  base_params$eval_metric <- tolower(eval_metric.)
  base_params$nthread <- NThreads.
  base_params$max_bin <- 64L
  base_params$tree_method <- TreeMethod.

  # Grid tuning
  if(!is.null(PassInGrid.)) {
    if(PassInGrid.[,.N] > 1L) stop('PassInGrid needs to be a single row data.table')
    if(PassInGrid.[, BanditProbs_Grid_1] == -10) {
      PassInGrid. <- NULL
    } else {
      base_params$max_depth <- PassInGrid.$Depth
      base_params$eta <- PassInGrid.$LearningRate
      base_params$subsample <- PassInGrid.$SubSample
      base_params$colsample_bytree <- PassInGrid.$ColSampleByTree
      Trees. <- PassInGrid.$NTrees
    }
  }

  # Define parameters for case where you want to run grid tuning
  if(GridTune. && !TrainOnFull. && BestGrid.[['RunNumber']] != 1L) {
    base_params$max_depth <- BestGrid.$Depth
    base_params$eta <- BestGrid.$LearningRate
    base_params$subsample <- BestGrid.$SubSample
    base_params$colsample_bytree <- BestGrid.$ColSampleByTree
    Trees. <- BestGrid.$NTrees
  } else {
    for(z in seq_along(base_params)) if(length(base_params[[z]]) > 1L) base_params[[z]] <- base_params[[z]][length(base_params[[z]])]
  }

  # Return base_params
  return(list(base_params = base_params, NTrees = Trees.))
}
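
# Usage sketch (not run; hypothetical values). The returned base_params list and NTrees are
# meant to be passed directly to xgboost::xgb.train(), with datatrain being the xgb.DMatrix
# produced by XGBoostDataPrep().
if(FALSE) {
  Final <- XGBoostFinalParams(
    GridTune. = FALSE,
    PassInGrid. = NULL,
    TrainOnFull. = TRUE,
    LossFunction. = 'reg:squarederror',
    eval_metric. = 'RMSE',
    NThreads. = parallel::detectCores(),
    TreeMethod. = 'hist',
    BestGrid. = NULL,
    Trees. = 1000L,
    Alpha. = 0,
    Lambda. = 1,
    NumLevels. = NULL)
  model <- xgboost::xgb.train(
    params = Final$base_params,
    data = datatrain,
    nrounds = Final$NTrees)
}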

#' @title XGBoostParameterGrids
#'
#' @author Adrian Antico
#' @family XGBoost Helpers
#'
#' @param Shuffles The number of shuffles you want to apply to each grid
#' @param NTrees seq(500L, 5000L, 500L)
#' @param Depth seq(4L, 16L, 2L)
#' @param LearningRate seq(0.05,0.40,0.05)
#' @param MinChildWeight seq(1.0, 10.0, 1.0)
#' @param SubSample seq(0.55, 1.0, 0.05)
#' @param ColSampleByTree seq(0.55, 1.0, 0.05)
#' @return A list containing data.table's with the parameters shuffled and ready to test in the bandit framework
#' @noRd
XGBoostParameterGrids <- function(Shuffles = 1L,
                                  NTrees = seq(500L, 5000L, 500L),
                                  Depth = seq(4L, 16L, 2L),
                                  LearningRate = seq(0.05,0.40,0.05),
                                  MinChildWeight = seq(1.0, 10.0, 1.0),
                                  SubSample = seq(0.55, 1.0, 0.05),
                                  ColSampleByTree = seq(0.55, 1.0, 0.05)) {

  # Create grid sets----
  Grid <- data.table::CJ(

    # Basis for creating parsimonious buckets----
    NTrees = if(!is.null(NTrees)) sort(NTrees, decreasing = FALSE) else seq(500L, 5000L, 500L),
    Depth = if(!is.null(Depth)) sort(Depth, decreasing = FALSE) else seq(4L, 16L, 2L),
    LearningRate = if(!is.null(LearningRate)) sort(LearningRate, decreasing = FALSE) else seq(0.01,0.10,0.01),

    # Random hyperparameters----
    MinChildWeight = if(!is.null(MinChildWeight)) MinChildWeight else seq(1.0, 10.0, 1.0),
    SubSample = if(!is.null(SubSample)) SubSample else seq(0.55, 1.0, 0.05),
    ColSampleByTree = if(!is.null(ColSampleByTree)) ColSampleByTree else seq(0.55, 1.0, 0.05))

  # Total loops----
  N_NTrees <- length(unique(Grid[['NTrees']]))
  N_Depth <- length(unique(Grid[['Depth']]))
  N_LearningRate <- length(unique(Grid[['LearningRate']]))
  Runs <- max(N_NTrees, N_Depth, N_LearningRate)
  Grids <- list()

  # Create grid sets----
  for(i in seq_len(Runs)) {
    if(i == 1L) {
      Grids[[paste0('Grid_',i)]] <- Grid[NTrees <= unique(Grid[['NTrees']])[min(i,N_NTrees)] & Depth <= unique(Grid[['Depth']])[min(i,N_Depth)] & LearningRate <= unique(Grid[['LearningRate']])[min(i,N_LearningRate)]]
    } else {
      Grids[[paste0('Grid_',i)]] <- data.table::fsetdiff(
        Grid[NTrees <= unique(Grid[['NTrees']])[min(i,N_NTrees)] & Depth <= unique(Grid[['Depth']])[min(i,N_Depth)] & LearningRate <= unique(Grid[['LearningRate']])[min(i,N_LearningRate)]],
        Grid[NTrees <= unique(Grid[['NTrees']])[min(i-1L,N_NTrees)] & Depth <= unique(Grid[['Depth']])[min(i-1L,N_Depth)] & LearningRate <= unique(Grid[['LearningRate']])[min(i-1L,N_LearningRate)]])
    }
  }

  # Define experimental grid----
  eGrid <- data.table::data.table(
    GridNumber = rep(-1, 10000L),
    RunNumber = 1L:10000L,
    RunTime = rep(-1, 10000L),
    EvalMetric = rep(-1,10000L),
    TreesBuilt = rep(-1,10000L),
    NTrees = rep(-1,10000L),
    Depth = rep(-1,10000L),
    LearningRate = rep(-1,10000L),
    MinChildWeight = rep(-1,10000L),
    SubSample = rep(-1,10000L),
    ColSampleByTree = rep(-1, 10000L))

  # Shuffle grid sets----
  for(shuffle in seq_len(Shuffles)) for(i in seq_len(Runs)) Grids[[paste0('Grid_',i)]] <- Grids[[paste0('Grid_',i)]][order(runif(Grids[[paste0('Grid_',i)]][,.N]))]

  # Return grid----
  return(list(Grid = Grid, Grids = Grids, ExperimentalGrid = eGrid))
}
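
# Usage sketch (not run). Builds the bandit grid sets with the default search ranges; Grids
# holds the shuffled, increasingly complex buckets and ExperimentalGrid is the empty results
# table filled in during tuning.
if(FALSE) {
  GridSets <- XGBoostParameterGrids(Shuffles = 1L)
  names(GridSets)               # "Grid" "Grids" "ExperimentalGrid"
  head(GridSets$Grids$Grid_1)   # most parsimonious bucket, rows in shuffled order
}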

#' @title XGBoostGridParams
#'
#' @author Adrian Antico
#' @family XGBoost Helpers
#'
#' @param N. Passthrough
#' @param counter. Passthrough
#' @param Objective. Passthrough
#' @param NThreads. Passthrough. Defaults to -1L
#' @param BanditArmsN. Passthrough
#' @param EvalMetric. Passthrough
#' @param TreeMethod. Passthrough
#' @param model_path. Passthrough
#' @param NewGrid. Passthrough
#' @param Grid. Passthrough
#' @param GridClusters. Passthrough
#' @noRd
XGBoostGridParams <- function(N. = N,
                              counter. = NULL,
                              NThreads. = -1L,
                              Objective. = 'reg:logistic',
                              BanditArmsN. = NULL,
                              EvalMetric. = NULL,
                              TreeMethod. = NULL,
                              model_path. = NULL,
                              NewGrid. = NULL,
                              Grid. = NULL,
                              GridClusters. = NULL) {

  # Create base_params (independent of runs)
  base_params <- list()
  base_params$booster <- 'gbtree'
  base_params$objective <- Objective.
  base_params$eval_metric <- tolower(EvalMetric.)
  base_params$nthread <- NThreads.
  base_params$max_bin <- 64L
  base_params$tree_method <- TreeMethod.

  # Run-dependent args and updates
  if(counter. != 1L && counter. <= BanditArmsN. + 1L) base_params$max_depth <- GridClusters.[[paste0('Grid_', counter.-1L)]][['Depth']][1L] else if(counter. != 1) base_params$max_depth <- GridClusters.[[paste0('Grid_',NewGrid.)]][['Depth']][N.]
  if(counter. != 1L && counter. <= BanditArmsN. + 1L) base_params$eta <- GridClusters.[[paste0('Grid_', counter.-1L)]][['LearningRate']][1L] else if(counter. != 1L) base_params$eta <- GridClusters.[[paste0('Grid_',NewGrid.)]][['LearningRate']][N.]
  if(counter. != 1L && counter. <= BanditArmsN. + 1L) base_params$min_child_weight <- GridClusters.[[paste0('Grid_', counter.-1L)]][['MinChildWeight']][1L] else if(counter. != 1L) base_params$min_child_weight <- GridClusters.[[paste0('Grid_',NewGrid.)]][['MinChildWeight']][N.]
  if(counter. != 1L && counter. <= BanditArmsN. + 1L) base_params$subsample <- GridClusters.[[paste0('Grid_',counter.-1L)]][['SubSample']][1L] else if(counter. != 1L) base_params$subsample <- GridClusters.[[paste0('Grid_',NewGrid.)]][['SubSample']][N.]
  if(counter. != 1L && counter. <= BanditArmsN. + 1L) base_params$colsample_bytree <- GridClusters.[[paste0('Grid_',counter.-1L)]][['ColSampleByTree']][1L] else if(counter. != 1L) base_params$colsample_bytree <- GridClusters.[[paste0('Grid_',NewGrid.)]][['ColSampleByTree']][N.]

  # Return
  return(base_params)
}
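
# Usage sketch (not run; hypothetical bandit state). When counter. == 1 only the base
# parameters are returned; for counter. values up to BanditArmsN. + 1 the hyperparameters
# come from the first row of the corresponding grid bucket, and afterwards from row N. of
# the bucket selected by NewGrid..
if(FALSE) {
  GridSets <- XGBoostParameterGrids()
  params <- XGBoostGridParams(
    N. = 1L,
    counter. = 2L,
    NThreads. = -1L,
    Objective. = 'binary:logistic',
    BanditArmsN. = length(GridSets$Grids),
    EvalMetric. = 'AUC',
    TreeMethod. = 'hist',
    model_path. = NULL,
    NewGrid. = NULL,
    Grid. = GridSets$Grid,
    GridClusters. = GridSets$Grids)
}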

#' @title XGBoostMultiClassPredict
#'
#' @description Create prediction output that matches CatBoost's format for multiclass models: the predicted class along with the probability of each class
#'
#' @param model Passthrough
#' @param datatest Passthrough
#' @param TargetLevels Passthrough
#' @param NumLevels Passthrough
#' @param NumberRows rows in scoring data
#'
#' @noRd
XGBoostMultiClassPredict <- function(model = NULL,
                                     datatest = NULL,
                                     TargetLevels = NULL,
                                     NumLevels = NULL,
                                     NumberRows = NULL) {
  temp1 <- stats::predict(model, datatest)
  predict <- data.table::data.table(Preds = temp1, Label = 0L:(NumLevels-1L), ID = sort(rep(seq_len(NumberRows), NumLevels)))
  data.table::setkeyv(predict, 'Label')
  data.table::setkeyv(TargetLevels, 'NewLevels')
  predict[TargetLevels, OriginalLevels := i.OriginalLevels][, Predict := OriginalLevels][, OriginalLevels := NULL]
  data.table::setorderv(predict, c('ID','Preds'), c(1L,-1L))
  Class <- predict[, list(Predict = data.table::first(Predict)), keyby = 'ID']
  predict <- data.table::dcast.data.table(data = predict, formula = ID ~ Label, fun.aggregate = data.table::first, value.var = 'Preds', fill = 0)
  data.table::setkeyv(predict, 'ID')
  predict[Class, Predict := i.Predict][, ID := NULL]
  data.table::setcolorder(predict, c(ncol(predict), seq_len((ncol(predict)-1L))))
  data.table::setnames(predict, names(predict)[2L:ncol(predict)], as.character(TargetLevels[[1L]]))
  return(predict)
}
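
# Shape sketch (not run; 'model' and 'datatest' are assumed to exist). For a 3-class
# multi:softprob model scored on 2 rows, xgboost returns a length-6 probability vector
# ordered class-within-row; this helper reshapes it into one row per observation with a
# Predict column followed by one probability column per original target level.
if(FALSE) {
  TargetLevels <- data.table::data.table(OriginalLevels = c('a', 'b', 'c'), NewLevels = 0:2)
  Scores <- XGBoostMultiClassPredict(
    model = model,
    datatest = datatest,
    TargetLevels = TargetLevels,
    NumLevels = 3L,
    NumberRows = 2L)
}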

#' @title XGBoostValidationData
#'
#' @description Generate validation, importance, and shap data
#'
#' @family XGBoost Helpers
#' @author Adrian Antico
#'
#' @param model. Passthrough
#' @param ModelType Passthrough
#' @param TrainOnFull. Passthrough
#' @param TestDataCheck Passthrough
#' @param FinalTestTarget. Passthrough
#' @param TestData. Passthrough
#' @param TestTarget. Passthrough
#' @param TrainTarget. Passthrough
#' @param TrainMerge. Passthrough
#' @param TestMerge. Passthrough
#' @param dataTest. Passthrough
#' @param data. Passthrough
#' @param predict. Passthrough
#' @param TargetColumnName. Passthrough
#' @param SaveModelObjects. Passthrough
#' @param metadata_path. Passthrough
#' @param model_path. Passthrough
#' @param ModelID. Passthrough
#' @param LossFunction. Passthrough
#' @param TransformNumericColumns. Passthrough
#' @param GridTune. Passthrough
#' @param TransformationResults. Passthrough
#' @param TargetLevels. Passthrough
#'
#' @noRd
XGBoostValidationData <- function(model.=model,
                                  ModelType = 'classification',
                                  TrainOnFull. = TrainOnFull,
                                  TestDataCheck = FALSE,
                                  FinalTestTarget. = FinalTestTarget,
                                  TestTarget. = TestTarget,
                                  TrainTarget. = TrainTarget,
                                  TrainMerge. = NULL,
                                  TestMerge. = TestMerge,
                                  TestData. = TestData,
                                  dataTest. = dataTest,
                                  data. = data,
                                  predict. = predict,
                                  TargetColumnName. = TargetColumnName,
                                  SaveModelObjects. = SaveModelObjects,
                                  metadata_path. = metadata_path,
                                  model_path. = model_path,
                                  ModelID. = ModelID,
                                  LossFunction. = LossFunction,
                                  TransformNumericColumns. = TransformNumericColumns,
                                  GridTune. = GridTune,
                                  TransformationResults. = TransformationResults,
                                  TargetLevels.=TargetLevels) {

  # Classification
  if(ModelType == 'classification') {

    # Generate validation data
    if(!TrainOnFull.) {
      if(TestDataCheck) {
        ValidationData <- data.table::as.data.table(cbind(TestMerge., p1 = predict.))
        if(!any(class(model.) %chin% c('lgb.Booster', 'R6'))) {
          ShapValues <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(TestData.), model = model., features = names(TestData.), max_observations = 2000000000)$shap_contrib)
        } else {
          ShapValues <- NULL
        }
      } else {
        ValidationData <- data.table::as.data.table(cbind(Target = TestTarget., dataTest., p1 = predict.))
        data.table::setnames(ValidationData, 'Target', eval(TargetColumnName.), skip_absent = TRUE)
        if(!any(class(model.) %chin% c('lgb.Booster', 'R6'))) {
          ShapValues <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(dataTest.), model = model., features = names(dataTest.), max_observations = 2000000000)$shap_contrib)
        } else {
          ShapValues <- NULL
        }
      }
    } else if(!is.null(TrainMerge.)) {
      ValidationData <- data.table::as.data.table(cbind(TrainMerge., predict.))
      if(!any(class(model.) %chin% c('lgb.Booster', 'R6'))) {
        ShapValues <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(data.), model = model., features = names(data.), max_observations = 2000000000)$shap_contrib)
        Shap_test <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(dataTest.), model = model., features = names(dataTest.), max_observations = 2000000000)$shap_contrib)
        ShapValues <- data.table::rbindlist(list(ShapValues, Shap_test), use.names = TRUE, fill = TRUE)
        rm(Shap_test)
      } else {
        ShapValues <- NULL
      }
    } else {
      ValidationData <- data.table::as.data.table(cbind(Target = TrainTarget., data., p1 = predict.))
      data.table::setnames(ValidationData, 'Target', eval(TargetColumnName.), skip_absent = TRUE)
      if(!any(class(model.) %chin% c('lgb.Booster', 'R6'))) {
        ShapValues <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(data.), model = model., features = names(data.), max_observations = 2000000000)$shap_contrib)
      } else {
        ShapValues <- NULL
      }
    }
  }

  # Regression
  if(ModelType == 'regression') {

    # Generate validation data
    if(!TrainOnFull.) {
      if(TestDataCheck) {
        ValidationData <- data.table::as.data.table(cbind(TestMerge., Predict = predict.))
        data.table::setnames(ValidationData, 'Target', TargetColumnName., skip_absent = TRUE)
        if(!any(class(model.) %chin% c('lgb.Booster', 'R6'))) {
          if(length(names(TestData.)) > 1L) {
            ShapValues <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(TestData.), model = model., features = names(TestData.), max_observations = 2000000000)$shap_contrib)
          } else {
            ShapValues <- NULL
          }
        } else {
          ShapValues <- NULL
        }
      } else {
        ValidationData <- data.table::as.data.table(cbind(Target = TestTarget., dataTest., Predict = predict.))
        if(!any(class(model.) %chin% c('lgb.Booster', 'R6'))) {
          if(length(names(dataTest.)) > 1L) {
            ShapValues <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(dataTest.), model = model., features = names(dataTest.), max_observations = 2000000000)$shap_contrib)
          } else {
            ShapValues <- NULL
          }
        } else {
          ShapValues <- NULL
        }
        if(length(TargetColumnName.) > 1L) {
          data.table::setnames(ValidationData, c(names(ValidationData)[seq_along(TargetColumnName.)]), c(TargetColumnName.))
        } else {
          data.table::setnames(ValidationData, 'Target', TargetColumnName.)
        }
      }
    } else if(!is.null(TrainMerge.)) {
      ValidationData <- data.table::as.data.table(cbind(TrainMerge., predict.))
      if(!any(class(model.) %chin% c('lgb.Booster', 'R6'))) {
        if(length(names(data.)) > 1L) {
          ShapValues <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(data.), model = model., features = names(data.), max_observations = 2000000000)$shap_contrib)
          Shap_test <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(dataTest.), model = model., features = names(dataTest.), max_observations = 2000000000)$shap_contrib)
          ShapValues <- data.table::rbindlist(list(ShapValues, Shap_test), use.names = TRUE, fill = TRUE)
          rm(Shap_test)
        } else {
          ShapValues <- NULL
        }
      } else {
        ShapValues <- NULL
      }
    } else {
      ValidationData <- data.table::as.data.table(cbind(Target = TrainTarget., data., Predict = predict.))
      if(!any(class(model.) %chin% c('lgb.Booster', 'R6'))) {
        if(length(names(data.)) > 1L) {
          ShapValues <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(data.), model = model., features = names(data.), max_observations = 2000000000)$shap_contrib)
        } else {
          ShapValues <- NULL
        }
      } else {
        ShapValues <- NULL
      }
      data.table::setnames(ValidationData, 'Target', TargetColumnName.)
    }

    # Back transform before running metrics and plots
    if(!is.null(TransformNumericColumns.)) {
      if(GridTune. && !TrainOnFull.) TransformationResults. <- TransformationResults.[ColumnName != 'Predicted']
      if(length(TargetColumnName.) == 1L) {

        # Prepare transformation object
        TransformationResults. <- data.table::rbindlist(list(
          TransformationResults.,
          data.table::data.table(
            ColumnName = c('Predict'),
            MethodName = TransformationResults.[ColumnName == eval(TargetColumnName.), MethodName],
            Lambda = TransformationResults.[ColumnName == eval(TargetColumnName.), Lambda],
            NormalizedStatistics = 0L)), use.names = TRUE, fill = TRUE)
        if(length(unique(TransformationResults.[['ColumnName']])) != nrow(TransformationResults.)) {
          temp <- TransformationResults.[, .N, by = 'ColumnName'][N != 1L][[1L]]
          if(!is.null(ValidationData)) temp1 <- which(names(ValidationData) == temp)[1L]
          if(!TrainOnFull.) {
            ValidationData[, eval(names(data.)[temp1]) := NULL]
          } else {
            if(TrainOnFull.) {
              if(length(which(names(data.) %chin% eval(TargetColumnName.))) > 1L) {
                temp1 <- which(names(data.) %chin% eval(TargetColumnName.))[1L]
                data.[, which(names(data.) %chin% eval(TargetColumnName.))[2L] := NULL]
              }
            } else {
              data.[, eval(names(data.)[temp]) := NULL]
            }
          }
          TransformationResults. <- TransformationResults.[, ID := 1L:.N][ID != max(ID)]
        }

        # Back transform
        ValidationData <- Rodeo::AutoTransformationScore(
          ScoringData = ValidationData,
          Type = 'Inverse',
          FinalResults = TransformationResults.,
          TransID = NULL,
          Path = NULL)

      } else {

        # Prepare transformation object
        TransformationResults. <- data.table::rbindlist(list(TransformationResults., TransformationResults.), use.names = TRUE, fill = TRUE)
        for(z in seq_along(TargetColumnName.)) TransformationResults.[length(TargetColumnName.) + z, ColumnName := paste0('Predict.V',z)]

        # Back transform
        ValidationData <- Rodeo::AutoTransformationScore(
          ScoringData = ValidationData,
          Type = 'Inverse',
          FinalResults = TransformationResults.,
          TransID = NULL,
          Path = NULL)
      }
    }
  }

  # Multiclass
  if(ModelType == 'multiclass') {
    if(!TrainOnFull.) {
      if(TestDataCheck) {
        ValidationData <- data.table::as.data.table(cbind(predict., TestMerge.))
        data.table::setnames(ValidationData, 'Target', TargetColumnName., skip_absent = TRUE)
        if(!any(class(model.) %chin% c('lgb.Booster', 'R6'))) {
          ShapValues <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(TestData.), model = model., features = names(TestData.))$shap_contrib)
        } else {
          ShapValues <- NULL
        }
      } else {
        ValidationData <- data.table::as.data.table(cbind(predict., Target = TestTarget., dataTest.))
        if(!any(class(model.) %chin% c('lgb.Booster', 'R6'))) {
          ShapValues <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(dataTest.), model = model., features = names(dataTest.), max_observations = 2000000000)$shap_contrib)
        } else {
          ShapValues <- NULL
        }
        if(length(TargetColumnName.) > 1L) {
          data.table::setnames(ValidationData, c(names(ValidationData)[seq_along(TargetColumnName.)]), c(TargetColumnName.))
        } else {
          data.table::setnames(ValidationData, 'Target', TargetColumnName.)
        }
      }
    } else if(!is.null(TrainMerge.)) {
      ValidationData <- data.table::as.data.table(cbind(predict., TrainMerge.))
      if(!any(class(model.) %chin% c('lgb.Booster', 'R6'))) {
        if(!is.null(dataTest.)) data. <- data.table::rbindlist(list(data., dataTest.), use.names = TRUE, fill = TRUE)
        ShapValues <- data.table::as.data.table(xgboost:::xgb.shap.data(as.matrix(data.), model = model., features = names(data.), max_observations = 2000000000)$shap_contrib)
      } else {
        ShapValues <- NULL
      }
    }
  }

  # Finalize data
  if('ID_Factorizer' %chin% names(ValidationData)) data.table::set(ValidationData, j = 'ID_Factorizer', value = NULL)
  if(!any(class(model.) %chin% c('lgb.Booster', 'R6')) && !is.null(ShapValues)) {
    data.table::setnames(ShapValues, names(ShapValues), paste0('Shap_', names(ShapValues)))
    ValidationData <- cbind(ValidationData, ShapValues)
  }

  # Save validation data
  if(SaveModelObjects. && !TrainOnFull.) {
    if(!is.null(metadata_path.)) {
      data.table::fwrite(ValidationData, file = file.path(metadata_path., paste0(ModelID., '_ValidationData.csv')))
    } else {
      data.table::fwrite(ValidationData, file = file.path(model_path., paste0(ModelID., '_ValidationData.csv')))
    }
  } else if(SaveModelObjects.) {
    if(!is.null(metadata_path.)) {
      data.table::fwrite(ValidationData, file = file.path(metadata_path., paste0(ModelID., '_TrainData.csv')))
    } else {
      data.table::fwrite(ValidationData, file = file.path(model_path., paste0(ModelID., '_TrainData.csv')))
    }
  }

  # Variable Importance
  if(!any(class(model.) %chin% c('lgb.Booster', 'R6')) && !is.null(ShapValues)) {
    VariableImportance <- tryCatch({data.table::as.data.table(xgboost::xgb.importance(model = model.))}, error = function(x) NULL)
  } else {
    VariableImportance <- tryCatch({data.table::as.data.table(lightgbm::lgb.importance(model = model., percentage = TRUE))}, error = function(x) NULL)
  }

  if(!is.null(VariableImportance)) {
    VariableImportance[, ':=' (Gain = round(Gain, 4L), Cover = round(Cover, 4L), Frequency = round(Frequency, 4L))]
    data.table::setnames(VariableImportance, c('Feature','Gain'), c('Variable','Importance'))
    if(SaveModelObjects.) {
      if(!is.null(metadata_path.)) {
        data.table::fwrite(VariableImportance, file = file.path(metadata_path., paste0(ModelID., '_Train_Importance_VariableImportance.csv')))
      } else {
        data.table::fwrite(VariableImportance, file = file.path(model_path., paste0(ModelID., '_Train_Importance_VariableImportance.csv')))
      }
    }
  }

  # Return
  return(list(
    ValidationData = ValidationData,
    VariableImportance = VariableImportance,
    ShapValues = ShapValues,
    TransformationResults = if(exists('TransformationResults.')) TransformationResults. else NULL))
}
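
# Usage sketch (not run; inputs are assumed to come from XGBoostDataPrep() and a scored
# prediction vector 'preds'). For a regression model evaluated on the holdout, this returns
# the validation table with per-feature Shap_ columns plus variable importance.
if(FALSE) {
  Val <- XGBoostValidationData(
    model. = model,
    ModelType = 'regression',
    TrainOnFull. = FALSE,
    TestDataCheck = FALSE,
    FinalTestTarget. = NULL,
    TestTarget. = Prep$TestTarget,
    TrainTarget. = Prep$TrainTarget,
    TrainMerge. = NULL,
    TestMerge. = NULL,
    TestData. = NULL,
    dataTest. = Prep$dataTest,
    data. = Prep$dataTrain,
    predict. = preds,
    TargetColumnName. = 'Target',
    SaveModelObjects. = FALSE,
    metadata_path. = NULL,
    model_path. = NULL,
    ModelID. = 'Test',
    LossFunction. = 'reg:squarederror',
    TransformNumericColumns. = NULL,
    GridTune. = FALSE,
    TransformationResults. = NULL,
    TargetLevels. = NULL)
}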

#' @title XGBoostRegressionMetrics
#'
#' @author Adrian Antico
#' @family XGBoost Helpers
#'
#' @param grid_eval_metric Passthrough
#' @param MinVal Passthrough. Defaults to -1L
#' @param calibEval Passthrough
#' @noRd
XGBoostRegressionMetrics <- function(grid_eval_metric,
                                     MinVal,
                                     calibEval) {
  if(tolower(grid_eval_metric) == 'poisson') {
    if(MinVal > 0L && min(calibEval[['p1']], na.rm = TRUE) > 0L) {
      calibEval[, Metric := p1 - Target * log(p1 + 1)]
      Metric <- calibEval[, mean(Metric, na.rm = TRUE)]
    }
  } else if(tolower(grid_eval_metric) == 'mae') {
    calibEval[, Metric := abs(Target - p1)]
    Metric <- calibEval[, mean(Metric, na.rm = TRUE)]
  } else if(tolower(grid_eval_metric) == 'mape') {
    calibEval[, Metric := abs((Target - p1) / (Target + 1))]
    Metric <- calibEval[, mean(Metric, na.rm = TRUE)]
  } else if(tolower(grid_eval_metric) == 'mse') {
    calibEval[, Metric := (Target - p1) ^ 2L]
    Metric <- calibEval[, mean(Metric, na.rm = TRUE)]
  } else if(tolower(grid_eval_metric) == 'msle') {
    if(MinVal > 0L && min(calibEval[['p1']], na.rm = TRUE) > 0L) {
      calibEval[, Metric := (log(Target + 1) - log(p1 + 1)) ^ 2L]
      Metric <- calibEval[, mean(Metric, na.rm = TRUE)]
    }
  } else if(tolower(grid_eval_metric) == 'kl') {
    if(MinVal > 0L && min(calibEval[['p1']], na.rm = TRUE) > 0L) {
      calibEval[, Metric := Target * log((Target + 1) / (p1 + 1))]
      Metric <- calibEval[, mean(Metric, na.rm = TRUE)]
    }
  } else if(tolower(grid_eval_metric) == 'cs') {
    calibEval[, ':=' (Metric1 = Target * p1, Metric2 = Target ^ 2L, Metric3 = p1 ^ 2L)]
    Metric <- calibEval[, sum(Metric1, na.rm = TRUE)] / (sqrt(calibEval[, sum(Metric2, na.rm = TRUE)]) * sqrt(calibEval[, sum(Metric3, na.rm = TRUE)]))
  } else if(tolower(grid_eval_metric) == 'r2') {
    Metric <- (calibEval[, stats::cor(eval(Target), p1)][[1L]]) ^ 2L
  }
  return(Metric)
}
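
# Worked sketch (not run): MAE on a tiny calibration table. calibEval needs Target and p1
# columns; MinVal is only consulted by the metrics that take logs of p1 (poisson, msle, kl).
if(FALSE) {
  calibEval <- data.table::data.table(Target = c(1, 2, 3), p1 = c(1.5, 1.5, 2.5))
  XGBoostRegressionMetrics(grid_eval_metric = 'mae', MinVal = -1, calibEval = calibEval)
  # mean(abs(Target - p1)) = mean(c(0.5, 0.5, 0.5)) = 0.5
}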