# R/CatBoostHelpers.R
#
# Defines functions: CatBoostGridParams, CatBoostParameterGrids, CatBoostRemoveFiles, CatBoostPDF, CatBoostImportances, CatBoostValidationData, CatBoostFinalParams, CatBoostDataConversion, CatBoostDataPrep, CatBoostArgsCheck

# AutoQuant is a package for quickly creating high quality visualizations under a common and easy api.
# Copyright (C) <year>  <name of author>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

#' @title CatBoostArgsCheck
#'
#' @description Ensure arguments are defined correctly
#'
#' @author Adrian
#' @family CatBoost Helpers
#'
#' @param ModelType Passthrough
#' @param data. Passthrough
#' @param FeatureColNames. Passthrough
#' @param PrimaryDateColumn. Passthrough
#' @param GridTune. Passthrough
#' @param model_path. Passthrough
#' @param metadata_path. Passthrough
#' @param ClassWeights. Passthrough
#' @param LossFunction. Passthrough
#' @param loss_function. Passthrough regression
#' @param loss_function_value. Passthrough regression
#' @param eval_metric. Passthrough regression
#' @param eval_metric_value. Passthrough regression
#' @param task_type. Passthrough
#' @param NumGPUs. Passthrough
#' @param MaxModelsInGrid. Passthrough
#' @param NumOfParDepPlots. Passthrough
#' @param ReturnModelObjects. Passthrough
#' @param SaveModelObjects. Passthrough
#' @param PassInGrid. Passthrough
#' @param MetricPeriods. Passthrough
#' @param langevin. Passthrough
#' @param diffusion_temperature. Passthrough
#' @param Trees. Passthrough
#' @param Depth. Passthrough
#' @param LearningRate. Passthrough
#' @param L2_Leaf_Reg. Passthrough
#' @param RandomStrength. Passthrough
#' @param BorderCount. Passthrough
#' @param RSM. Passthrough
#' @param BootStrapType. Passthrough
#' @param GrowPolicy. Passthrough
#' @param model_size_reg. Passthrough
#' @param feature_border_type. Passthrough
#' @param sampling_unit. Passthrough
#' @param subsample. Passthrough
#' @param score_function. Passthrough
#' @param min_data_in_leaf. Passthrough
#'
#' @noRd
CatBoostArgsCheck <- function(ModelType = "regression",
                              data.=data,
                              FeatureColNames.=FeatureColNames,
                              PrimaryDateColumn. = PrimaryDateColumn,
                              GridTune. = GridTune,
                              model_path. = model_path,
                              metadata_path. = metadata_path,
                              ClassWeights. = ClassWeights,
                              LossFunction. = LossFunction,
                              loss_function. = loss_function,
                              loss_function_value. = loss_function_value,
                              eval_metric. = eval_metric,
                              eval_metric_value. = eval_metric_value,
                              task_type. = task_type,
                              NumGPUs. = NumGPUs,
                              MaxModelsInGrid. = MaxModelsInGrid,
                              NumOfParDepPlots. = NumOfParDepPlots,
                              ReturnModelObjects. = ReturnModelObjects,
                              SaveModelObjects. = SaveModelObjects,
                              PassInGrid. = PassInGrid,
                              MetricPeriods. = MetricPeriods,
                              langevin. = langevin,
                              diffusion_temperature. = diffusion_temperature,
                              Trees. = Trees,
                              Depth. = Depth,
                              LearningRate. = LearningRate,
                              L2_Leaf_Reg. = L2_Leaf_Reg,
                              RandomStrength. = RandomStrength,
                              BorderCount. = BorderCount,
                              RSM. = RSM,
                              BootStrapType. = BootStrapType,
                              GrowPolicy. = GrowPolicy,
                              model_size_reg. = model_size_reg,
                              feature_border_type. = feature_border_type,
                              sampling_unit. = sampling_unit,
                              subsample. = subsample,
                              score_function. = score_function,
                              min_data_in_leaf. = min_data_in_leaf) {

  # Regression loss_function and eval_metric setup ----
  if(ModelType %chin% c("regression","vector")) {
    if(is.null(loss_function.)) LossFunction. <- "RMSE" else LossFunction. <- loss_function.
    if(is.null(eval_metric.)) EvalMetric. <- "RMSE" else EvalMetric. <- eval_metric.
    # MultiRMSE is CPU-only in CatBoost
    if(LossFunction. == "MultiRMSE" || EvalMetric. == "MultiRMSE") task_type. <- "CPU"

    # Parameterized metrics: expand 'name' + value into CatBoost's 'Name:arg=value' form.
    # NULL guards are required: tolower(NULL) is character(0), which errors inside if()
    if(!is.null(eval_metric.)) {
      em <- tolower(eval_metric.)
      if(em == "tweedie") EvalMetric. <- paste0('Tweedie:variance_power=', eval_metric_value.)
      if(em == "fairloss") EvalMetric. <- paste0('FairLoss:smoothness=', eval_metric_value.)
      if(em == "numerrors") EvalMetric. <- paste0('NumErrors:greater_than=', eval_metric_value.)
      if(em == "lq") EvalMetric. <- paste0('Lq:q=', eval_metric_value.)
      if(em == "huber") {
        # Fixed: previously referenced `eval_metric_value` (missing trailing dot)
        EvalMetric. <- paste0('Huber:delta=', eval_metric_value.)
        # Fixed: previously assigned `task_type` (missing trailing dot), a no-op
        task_type. <- "CPU"
      }
      if(em == "expectile") EvalMetric. <- paste0('Expectile:alpha=', eval_metric_value.)
      if(em == "quantile") EvalMetric. <- paste0('Quantile:alpha=', eval_metric_value.)
      if(em == "loglinquantile") EvalMetric. <- paste0('LogLinQuantile:alpha=', eval_metric_value.)
    }
    if(!is.null(loss_function.)) {
      lf <- tolower(loss_function.)
      if(lf == "tweedie") LossFunction. <- paste0('Tweedie:variance_power=', loss_function_value.)
      # Fixed copy-paste bug: previously assigned EvalMetric. with eval_metric_value.
      if(lf == "fairloss") LossFunction. <- paste0('FairLoss:smoothness=', loss_function_value.)
      if(lf == "numerrors") LossFunction. <- paste0('NumErrors:greater_than=', loss_function_value.)
      if(lf == "lq") LossFunction. <- paste0('Lq:q=', loss_function_value.)
      if(lf == "huber") {
        LossFunction. <- paste0('Huber:delta=', loss_function_value.)
        task_type. <- "CPU"
      }
      if(lf == "expectile") LossFunction. <- paste0('Expectile:alpha=', loss_function_value.)
      if(lf == "quantile") LossFunction. <- paste0('Quantile:alpha=', loss_function_value.)
      if(lf == "loglinquantile") LossFunction. <- paste0('LogLinQuantile:alpha=', loss_function_value.)
    }
  } else {
    EvalMetric. <- NULL
  }

  # Ensure model_path and metadata_path exist if supplied by user
  if(!is.null(model_path.) && !dir.exists(file.path(model_path.))) dir.create(model_path.)
  if(!is.null(metadata_path.) && !dir.exists(file.path(metadata_path.))) dir.create(metadata_path.)

  # Classification / MultiClass loss function defaults ----
  if(ModelType == "classification") {
    if(is.null(LossFunction.)) LossFunction. <- "Logloss"
  } else if(ModelType == "multiclass") {
    if(is.null(loss_function.)) LossFunction. <- "MultiClassOneVsAll" else LossFunction. <- loss_function.
  }

  # Ensure only one value per hyperparameter when not grid tuning ----
  if(!GridTune.) {
    SingleValueArgs <- list(
      MetricPeriods = MetricPeriods.,
      langevin = langevin.,
      diffusion_temperature = diffusion_temperature.,
      Trees = Trees.,
      Depth = Depth.,
      LearningRate = LearningRate.,
      L2_Leaf_Reg = L2_Leaf_Reg.,
      RandomStrength = RandomStrength.,
      BorderCount = BorderCount.,
      RSM = RSM.,
      BootStrapType = BootStrapType.,
      GrowPolicy = GrowPolicy.,
      model_size_reg = model_size_reg.,
      feature_border_type = feature_border_type.,
      sampling_unit = sampling_unit.,
      subsample = subsample.,
      score_function = score_function.,
      min_data_in_leaf = min_data_in_leaf.)
    for(ArgName in names(SingleValueArgs)) {
      if(length(SingleValueArgs[[ArgName]]) > 1) stop(paste0(ArgName, " cannot have more than one value supplied"))
    }
  }

  # Logic arg check ----
  # Fixed: previously tested "classifier", which never matches the ModelType values
  # used throughout this file ("classification"), so the default was never applied.
  # NOTE(review): ClassWeights. is not part of the return list — confirm callers rely on it
  if(ModelType %chin% c("classification") && is.null(ClassWeights.)) ClassWeights. <- c(1,1)
  if(!(tolower(task_type.) %chin% c("gpu", "cpu"))) stop("task_type needs to be either 'GPU' or 'CPU'")
  # CatBoost devices string: '0' for a single GPU, '0-k' for k+1 GPUs
  if(is.null(NumGPUs.)) NumGPUs. <- '0' else if(NumGPUs. > 1L) NumGPUs. <- paste0('0-', NumGPUs.-1L) else NumGPUs. <- '0'
  HasTime <- !is.null(PrimaryDateColumn.)
  if(any(Trees. < 1L)) stop("Trees must be greater than 1")
  if(!GridTune. %in% c(TRUE, FALSE)) stop("GridTune needs to be TRUE or FALSE")
  if(MaxModelsInGrid. < 1L && GridTune.) stop("MaxModelsInGrid needs to be at least 1")
  if(!is.null(model_path.) && !is.character(model_path.)) stop("model_path needs to be a character type")
  if(!is.null(metadata_path.) && !is.character(metadata_path.)) stop("metadata_path needs to be a character type")
  if(NumOfParDepPlots. < 0L) stop("NumOfParDepPlots needs to be a positive number")
  if(!(ReturnModelObjects. %in% c(TRUE, FALSE))) stop("ReturnModelObjects needs to be TRUE or FALSE")
  if(!(SaveModelObjects. %in% c(TRUE, FALSE))) stop("SaveModelObjects needs to be TRUE or FALSE")
  # A user-supplied grid overrides tuning
  if(!is.null(PassInGrid.)) GridTune. <- FALSE
  # CatBoost caps tree depth at 16; drop out-of-range grid values, fall back to 6
  if(GridTune. && any(Depth. > 16)) {
    Depth. <- Depth.[!Depth. > 16]
    if(length(Depth.) == 0) Depth. <- 6
  }
  if(is.null(GrowPolicy.)) GrowPolicy. <- "SymmetricTree"
  # Langevin boosting is CPU-only (assumes langevin. is a scalar logical here)
  if(langevin. && task_type. == "GPU") {
    task_type. <- "CPU"
    print("task_type switched to CPU to enable langevin boosting")
  }

  # RSM management: RSM is CPU-only; drop on GPU, default to 1 on CPU ----
  if(task_type. == "GPU") {
    RSM. <- NULL
  } else if(is.null(RSM.)) {
    RSM. <- 1
  }
  # sampling_unit other than "Object" is only meaningful for YetiRankPairWise
  if(!is.null(sampling_unit.) && !is.null(LossFunction.) && LossFunction. != "YetiRankPairWise") sampling_unit. <- "Object"
  # score_function / GrowPolicy compatibility (Newton* scores are GPU-only on some policies)
  if(!is.null(score_function.)) {
    if(task_type. == "CPU" && score_function. %chin% c("NewtonL2","NewtonCosine")) {
      if(!is.null(GrowPolicy.)) {
        if(GrowPolicy. == "Lossguide") score_function. <- "L2"
      } else {
        score_function. <- "Cosine"
        GrowPolicy. <- "SymmetricTree"
      }
    } else if(!is.null(GrowPolicy.)) {
      if(GrowPolicy. == "Lossguide" && score_function. == "NewtonCosine") score_function. <- "NewtonL2"
    } else {
      GrowPolicy. <- "SymmetricTree"
    }
  }
  # BootStrapType defaults and device compatibility (MVS is CPU-only, Poisson is GPU-only)
  if(is.null(BootStrapType.)) {
    if(task_type. == "GPU") BootStrapType. <- "Bayesian"
    if(task_type. == "CPU") BootStrapType. <- "MVS"
  } else if(task_type. == "GPU" && any(BootStrapType. %chin% "MVS")) {
    if(GridTune. && length(BootStrapType.) > 1L) {
      BootStrapType. <- BootStrapType.[!BootStrapType. %chin% "MVS"]
    }
  } else if(task_type. == 'CPU' && any(BootStrapType. %chin% 'Poisson')) {
    BootStrapType. <- 'Bayesian'
  }
  # Fixed: previously `BootStrapType. == "MVS"`, which fails when BootStrapType. is a grid vector
  if(!is.null(sampling_unit.) && any(BootStrapType. %chin% "MVS")) sampling_unit. <- "Object"

  # Return ----
  return(list(
    HasTime = HasTime,
    GridTune = GridTune.,
    LossFunction = LossFunction.,
    EvalMetric = EvalMetric.,
    task_type = task_type.,
    NumGPUs = NumGPUs.,
    Depth = Depth.,
    RSM = RSM.,
    BootStrapType = BootStrapType.,
    sampling_unit = sampling_unit.,
    score_function = score_function.,
    GrowPolicy = GrowPolicy.))
}

#' @title CatBoostDataPrep
#'
#' @description Prepare data for loading into catboost format
#'
#' @family CatBoost Helpers
#' @author Adrian Antico
#'
#' @param OutputSelection. Passthrough
#' @param EncodeMethod. Passthrough
#' @param ModelType 'regression', 'vector', 'classification', or 'multiclass'
#' @param data. Passthrough
#' @param ValidationData. Passthrough
#' @param TestData. Passthrough
#' @param TargetColumnName. Passthrough
#' @param FeatureColNames. Passthrough
#' @param WeightsColumnName. Passthrough
#' @param PrimaryDateColumn. Passthrough
#' @param IDcols. Passthrough
#' @param TransformNumericColumns. Passthrough regression
#' @param Methods. Passthrough regression
#' @param ModelID. Passthrough regression
#' @param model_path. Passthrough regression
#' @param LossFunction. Passthrough regression
#' @param EvalMetric. Passthrough regression
#' @param TrainOnFull. Passthrough
#' @param SaveModelObjects. Passthrough
#'
#' @noRd
CatBoostDataPrep <- function(OutputSelection.=OutputSelection,
                             EncodeMethod. = 'credibility',
                             ModelType = 'regression',
                             data. = data,
                             ValidationData. = ValidationData,
                             TestData. = TestData,
                             TargetColumnName. = TargetColumnName,
                             FeatureColNames. = FeatureColNames,
                             WeightsColumnName. = NULL,
                             PrimaryDateColumn. = PrimaryDateColumn,
                             IDcols. = IDcols,
                             TransformNumericColumns. = TransformNumericColumns,
                             Methods. = Methods,
                             ModelID. = ModelID,
                             model_path. = model_path,
                             LossFunction. = LossFunction,
                             EvalMetric. = EvalMetric,
                             TrainOnFull. = TrainOnFull,
                             SaveModelObjects. = SaveModelObjects) {

  # Ensure data is a data.table (setDT converts in place) ----
  if(!data.table::is.data.table(data.)) data.table::setDT(data.)
  if(!is.null(ValidationData.)) if(!data.table::is.data.table(ValidationData.)) data.table::setDT(ValidationData.)
  if(!is.null(TestData.)) if(!data.table::is.data.table(TestData.)) data.table::setDT(TestData.)

  # Convert positional target / ID references to column names ----
  if(!is.character(TargetColumnName.)) TargetColumnName. <- names(data.)[TargetColumnName.]
  if(!is.null(IDcols.)) if(!is.character(IDcols.)) IDcols. <- names(data.)[IDcols.]

  # Identify column numbers for factor / character variables ----
  CatFeatures <- sort(c(as.numeric(which(sapply(data., is.factor))), as.numeric(which(sapply(data., is.character)))))
  CatFeatures <- CatFeatures[CatFeatures %in% which(names(data.) %in% FeatureColNames.)]
  if(ModelType == "multiclass") {
    # Bug fix: CatFeatures holds numeric column indices; the old code did
    # setdiff(CatFeatures, TargetColumnName.) against the target NAME, which never matched
    CatFeatures <- CatFeatures[!CatFeatures %in% which(names(data.) %in% TargetColumnName.)]
  }
  # Bug fix (multiclass path): indices were previously compared to IDcols. names directly
  if(!is.null(IDcols.)) CatFeatures <- CatFeatures[!CatFeatures %in% which(names(data.) %in% IDcols.)]
  if(identical(CatFeatures, numeric(0))) CatFeatures <- NULL

  # Partition: build train/validation/test splits when none were supplied ----
  if(is.null(ValidationData.) && is.null(TestData.) && !TrainOnFull.) {
    UseBestModel <- TRUE
    if(ModelType %chin% c("regression", "vector")) {
      if(!is.null(TransformNumericColumns.)) {
        dataSets <- Rodeo::AutoDataPartition(
          data = data.,
          NumDataSets = 3L,
          Ratios = c(0.80, 0.10, 0.10),
          PartitionType = "random",
          StratifyColumnNames = eval(TargetColumnName.[1L]),
          TimeColumnName = NULL)
        data. <- dataSets$TrainData
        ValidationData. <- dataSets$ValidationData
        TestData. <- dataSets$TestData
        rm(dataSets)

        # Mean of train target, pre-transformation
        # NOTE(review): MeanTrainTarget is computed but not returned — confirm it is still needed
        MeanTrainTarget <- mean(data.[[eval(TargetColumnName.[1L])]], na.rm = TRUE)

        # Fit transformations on train, apply to validation/test ----
        Output <- Rodeo::AutoTransformationCreate(
          data.,
          ColumnNames = TransformNumericColumns.,
          Methods = Methods.,
          Path = model_path.,
          TransID = ModelID.,
          SaveOutput = SaveModelObjects.)
        data. <- Output$Data
        TransformationResults <- Output$FinalResults

        # Transform ValidationData ----
        ValidationData. <- Rodeo::AutoTransformationScore(
          ScoringData = ValidationData.,
          Type = "Apply",
          FinalResults = TransformationResults,
          TransID = NULL,
          Path = NULL)

        # Transform TestData ----
        if(!is.null(TestData.)) {
          TestData. <- Rodeo::AutoTransformationScore(
            ScoringData = TestData.,
            Type = "Apply",
            FinalResults = TransformationResults,
            TransID = NULL,
            Path = NULL)
        }
      } else {
        dataSets <- Rodeo::AutoDataPartition(
          data = data.,
          NumDataSets = 3L,
          Ratios = c(0.80, 0.10, 0.10),
          PartitionType = "random",
          StratifyColumnNames = eval(TargetColumnName.[1L]),
          TimeColumnName = NULL)
        data. <- dataSets$TrainData
        ValidationData. <- dataSets$ValidationData
        TestData. <- dataSets$TestData
        TransformationResults <- NULL
        rm(dataSets)
        # One mean per target column for vector (multi-target) models
        if(length(TargetColumnName.) > 1) {
          MeanTrainTarget <- c()
          for(i in seq_len(length(TargetColumnName.))) MeanTrainTarget[i] <- mean(data.[[eval(TargetColumnName.[i])]], na.rm = TRUE)
          rm(i)
        } else {
          MeanTrainTarget <- mean(data.[[eval(TargetColumnName.)]], na.rm = TRUE)
        }
      }
    } else if(ModelType %chin% c("classification", "multiclass")) {
      dataSets <- Rodeo::AutoDataPartition(
        data = data.,
        NumDataSets = 3L,
        Ratios = c(0.80, 0.10, 0.10),
        PartitionType = "random",
        StratifyColumnNames = eval(TargetColumnName.),
        TimeColumnName = NULL)
      data. <- dataSets$TrainData
      ValidationData. <- dataSets$ValidationData
      TestData. <- dataSets$TestData
      TransformationResults <- NULL
    }
  } else {

    # Splits supplied (or TrainOnFull): don't let catboost pick best iteration
    UseBestModel <- FALSE

    # Transform data sets ----
    if(!is.null(TransformNumericColumns.)) {
      Output <- Rodeo::AutoTransformationCreate(
        data.,
        ColumnNames = TransformNumericColumns.,
        Methods = Methods.,
        Path = model_path.,
        TransID = ModelID.,
        SaveOutput = SaveModelObjects.)
      data. <- Output$Data
      TransformationResults <- Output$FinalResults

      # Transform ValidationData ----
      if(length(ValidationData.) > 0L) {
        ValidationData. <- Rodeo::AutoTransformationScore(
          ScoringData = ValidationData.,
          Type = "Apply",
          FinalResults = TransformationResults,
          TransID = NULL,
          Path = NULL)
      }

      # Transform TestData ----
      if(!is.null(TestData.)) {
        TestData. <- Rodeo::AutoTransformationScore(
          ScoringData = TestData.,
          Type = "Apply",
          FinalResults = TransformationResults,
          TransID = NULL,
          Path = NULL)
      }

    } else {
      TransformationResults <- NULL
    }
  }

  # Encode categorical features ----
  if(length(CatFeatures) > 0L) {

    # Snapshot column names BEFORE encoding so added columns can be identified afterwards
    xx <- names(data.)
    Output <- Rodeo::EncodeCharacterVariables(
      RunMode = 'train',
      ModelType = ModelType,
      TrainData = data.,
      ValidationData = ValidationData.,
      TestData = TestData.,
      TargetVariableName = TargetColumnName.[1L],
      CategoricalVariableNames = if(!is.character(CatFeatures)) names(data.)[CatFeatures] else CatFeatures,
      EncodeMethod = EncodeMethod.,
      KeepCategoricalVariables = TRUE,
      ReturnMetaData = TRUE,
      MetaDataPath = model_path.,
      MetaDataList = NULL,
      ImputeMissingValue = 0)
    data. <- Output$TrainData
    ValidationData. <- Output$ValidationData
    TestData. <- Output$TestData
    MetaData <- Output$MetaData

    # Update FeatureColNames.: raw categorical columns move to IDcols, encoded versions become features
    if(!is.character(CatFeatures)) zz <- names(data.)[CatFeatures] else zz <- CatFeatures
    IDcols. <- c(IDcols., zz)
    # Bug fix: diff post-encode names against the PRE-encode snapshot (xx). The old code
    # snapshotted AFTER encoding, so the diff was always character(0) and the multiclass
    # branch below added no encoded feature names.
    y <- setdiff(names(data.), xx)
    FeatureColNames. <- FeatureColNames.[!FeatureColNames. %in% zz]
    if(tolower(ModelType) != 'multiclass') FeatureColNames. <- c(FeatureColNames., paste0(zz, "_", EncodeMethod.)) else FeatureColNames. <- c(FeatureColNames., y[!y %in% IDcols.])
    CatFeatures <- NULL
  } else {
    MetaData <- NULL
    CatFeatures <- NULL
  }

  # Multiclass target levels: map level strings to 1..N integers ----
  if(ModelType == "multiclass") {

    # Collect levels across all supplied data sets so scoring sees every class
    if(!is.null(TestData.)) {
      temp <- data.table::rbindlist(list(data., ValidationData., TestData.))
    } else if(!is.null(ValidationData.)) {
      temp <- data.table::rbindlist(list(data., ValidationData.))
    } else {
      temp <- data.
    }
    TargetLevels <- data.table::as.data.table(sort(unique(as.character(temp[[eval(TargetColumnName.)]]))))
    data.table::setnames(TargetLevels, "V1", "OriginalLevels")
    TargetLevels[, NewLevels := seq_len(.N)]
    if(SaveModelObjects.) data.table::fwrite(TargetLevels, file = file.path(model_path., paste0(ModelID., "_TargetLevels.csv")))

    # Convert target to its numeric level in each data set
    data. <- merge(
      data.,
      TargetLevels,
      by.x = eval(TargetColumnName.),
      by.y = "OriginalLevels",
      all = FALSE)
    data.[, eval(TargetColumnName.) := NewLevels]
    data.[, NewLevels := NULL]
    if(TrainOnFull. != TRUE) {
      ValidationData. <- merge(
        ValidationData.,
        TargetLevels,
        by.x = eval(TargetColumnName.),
        by.y = "OriginalLevels",
        all = FALSE)
      ValidationData.[, eval(TargetColumnName.) := NewLevels]
      ValidationData.[, NewLevels := NULL]
      if(!is.null(TestData.)) {
        TestData. <- merge(
          TestData.,
          TargetLevels,
          by.x = eval(TargetColumnName.),
          by.y = "OriginalLevels",
          all = FALSE)
        TestData.[, eval(TargetColumnName.) := NewLevels]
        TestData.[, NewLevels := NULL]
      }
    }

    # Move the merged-to-front target column back to the last position
    data.table::setcolorder(data., c(2L:ncol(data.), 1L))
    if(!is.null(ValidationData.)) data.table::setcolorder(ValidationData., c(2L:ncol(ValidationData.), 1L))
    if(!is.null(TestData.)) data.table::setcolorder(TestData., c(2L:ncol(TestData.), 1L))
  } else {
    TargetLevels <- NULL
  }

  # Sort data if PrimaryDateColumn, then drop the date column (kept via IDcols) ----
  if(!is.null(PrimaryDateColumn.)) {
    data.table::setorderv(x = data., cols = eval(PrimaryDateColumn.), order = 1L)
    data.table::set(data., j = PrimaryDateColumn., value = NULL)
    if(!(eval(PrimaryDateColumn.) %in% IDcols.)) IDcols. <- unique(c(IDcols., PrimaryDateColumn.))
  }

  # Sort ValidationData if PrimaryDateColumn ----
  if(!is.null(PrimaryDateColumn.) && !TrainOnFull.) {
    data.table::setorderv(x = ValidationData., cols = eval(PrimaryDateColumn.), order = 1L)
    data.table::set(ValidationData., j = PrimaryDateColumn., value = NULL)
  }

  # Sort TestData if PrimaryDateColumn ----
  # NOTE(review): TestData is sorted DESCENDING (order = -1L) while train/validation
  # are ascending — confirm this asymmetry is intentional
  if(!is.null(TestData.) && !TrainOnFull. && !is.null(PrimaryDateColumn.)) {
    data.table::setorderv(x = TestData., cols = eval(PrimaryDateColumn.), order = -1L)
    data.table::set(TestData., j = PrimaryDateColumn., value = NULL)
  }

  # Columns to strip from the modeling matrices (kept only in the merge copies) ----
  if(!is.null(PrimaryDateColumn.) && PrimaryDateColumn. %chin% names(data.)) {
    keep <- unique(c(PrimaryDateColumn., WeightsColumnName., IDcols.))
  } else {
    keep <- unique(c(WeightsColumnName., IDcols.))
  }

  # Full-column copy of train+validation for train-data scoring output
  if("score_traindata" %chin% tolower(OutputSelection.)) {
    TrainMerge <- data.table::rbindlist(list(data.,ValidationData.), fill = TRUE)
  } else {
    TrainMerge <- NULL
  }

  # Remove non-feature columns in place ----
  if(!is.null(keep) && any(keep %in% names(data.))) data.table::set(data., j = c(keep[c(keep %in% names(data.))]), value = NULL)
  if(!TrainOnFull. && !is.null(keep) && any(keep %in% names(ValidationData.))) data.table::set(ValidationData., j = c(keep[c(keep %in% names(ValidationData.))]), value = NULL)

  # TestData: keep a full copy for merging scores back, then strip ----
  if(!is.null(TestData.)) {
    TestMerge <- data.table::copy(TestData.)
    if(!is.null(keep)) data.table::set(TestData., j = c(keep[c(keep %in% names(TestData.))]), value = NULL)
  } else {
    TestMerge <- NULL
    TestData. <- NULL
  }

  # Impute / type-normalize all modeling data sets ----
  data. <- Rodeo::ModelDataPrep(data = data., Impute = TRUE, CharToFactor = TRUE, RemoveDates = TRUE, MissFactor = "0", MissNum = -1, IntToNumeric = TRUE, FactorToChar = FALSE, DateToChar = FALSE, IgnoreCols = NULL)
  if(!TrainOnFull. && length(ValidationData.) > 0L) ValidationData. <- Rodeo::ModelDataPrep(data = ValidationData., Impute = TRUE, CharToFactor = TRUE, RemoveDates = TRUE, MissFactor = "0", MissNum = -1, FactorToChar = FALSE, IntToNumeric = TRUE, DateToChar = FALSE, IgnoreCols = NULL)
  if(!is.null(TestData.)) TestData. <- Rodeo::ModelDataPrep(data = TestData., Impute = TRUE, CharToFactor = TRUE, RemoveDates = TRUE, MissFactor = "0", MissNum = -1, FactorToChar = FALSE, IntToNumeric = TRUE, DateToChar = FALSE, IgnoreCols = NULL)

  # Save final feature names ----
  Names <- data.table::as.data.table(setdiff(names(data.), c(TargetColumnName., PrimaryDateColumn., IDcols.)))
  if(!"V1" %chin% names(Names)) data.table::setnames(Names, "FeatureColNames.", "ColNames", skip_absent = TRUE) else data.table::setnames(Names, "V1", "ColNames", skip_absent = TRUE)
  if(SaveModelObjects.) data.table::fwrite(Names, file.path(model_path., paste0(ModelID., "_ColNames.csv")))

  # Split target columns off of the feature tables ----
  TrainTarget <- data.[, .SD, .SDcols = c(TargetColumnName.)]
  data.table::set(data., j = TargetColumnName., value = NULL)

  # Validation target
  if(length(ValidationData.) > 0L) {
    TestTarget <- ValidationData.[, .SD, .SDcols = c(TargetColumnName.)]
    TrainTargetMerge <- data.table::rbindlist(list(TrainTarget, TestTarget))
    data.table::set(ValidationData., j = TargetColumnName., value = NULL)
  } else {
    TestTarget <- NULL
    TrainTargetMerge <- NULL
  }

  # Test target
  if(length(TestData.) > 0L) {
    FinalTestTarget <- TestData.[, .SD, .SDcols = c(TargetColumnName.)]
    data.table::set(TestData., j = TargetColumnName., value = NULL)
  } else {
    FinalTestTarget <- NULL
  }

  # Collapse single-target data.tables to vectors; keep matrices for vector models.
  # Bug fix: previously the else branches evaluated NULL[[1L]] ("subscript out of
  # bounds") whenever a target table did not exist (e.g. TrainOnFull / no TestData)
  if(ncol(TrainTarget) > 1L) TrainTarget <- as.matrix(TrainTarget) else TrainTarget <- TrainTarget[[1L]]
  if(!is.null(TestTarget)) {
    if(ncol(TestTarget) > 1L) TestTarget <- as.matrix(TestTarget) else TestTarget <- TestTarget[[1L]]
  }
  if(!is.null(FinalTestTarget)) {
    if(ncol(FinalTestTarget) > 1L) FinalTestTarget <- as.matrix(FinalTestTarget) else FinalTestTarget <- FinalTestTarget[[1L]]
  }

  # Convert CatFeatures to 0-indexed for catboost
  # NOTE(review): CatFeatures is always NULL by this point (set above) — dead code, kept for safety
  if(length(CatFeatures) > 0L) CatFeatures <- CatFeatures - 1L

  # Return ----
  return(list(dataTrain = data.,
              TrainMerge = TrainMerge,
              dataTest = ValidationData.,
              TestData = TestData.,
              TestMerge = TestMerge,
              TrainTarget = TrainTarget,
              TrainTargetMerge = TrainTargetMerge,
              TestTarget = TestTarget,
              FinalTestTarget = FinalTestTarget,
              CatFeatures = CatFeatures,
              Names = Names,
              UseBestModel = UseBestModel,
              TransformationResults = TransformationResults,
              TargetLevels = TargetLevels,
              FactorLevelsList = MetaData,
              FeatureColNames = FeatureColNames.)
         )
}

#' @title CatBoostDataConversion
#'
#' @description Convert data to catboost format
#'
#' @author Adrian
#' @family CatBoost Helpers
#'
#' @param CatFeatures. Passthrough
#' @param dataTrain. Passthrough
#' @param dataTest. Passthrough
#' @param TestData. Passthrough
#' @param TrainTarget. Passthrough
#' @param TestTarget. Passthrough
#' @param FinalTestTarget. Passthrough
#' @param TrainOnFull. Passthrough
#' @param Weights. Passthrough
#'
#' @noRd
CatBoostDataConversion <- function(CatFeatures. = CatFeatures,
                                   dataTrain. = dataTrain,
                                   dataTest. = dataTest,
                                   TestData. = TestData,
                                   TrainTarget. = TrainTarget,
                                   TestTarget. = TestTarget,
                                   FinalTestTarget. = FinalTestTarget,
                                   TrainOnFull. = TrainOnFull,
                                   Weights. = NULL) {

  # Extract the weights column (named by Weights.) from each data set before pooling.
  # Bug fix: previously referenced undefined TrainWeights./ValidationWeights./TestWeights.,
  # which errored with "object not found" whenever a weights column name was supplied.
  if(is.character(Weights.) && Weights. %chin% names(dataTrain.)) {
    TrainWeightVector <- dataTrain.[[Weights.]]
    dataTrain.[, eval(Weights.) := NULL]
    if(!is.null(dataTest.)) {
      ValidationWeightVector <- dataTest.[[Weights.]]
      dataTest.[, eval(Weights.) := NULL]
    } else {
      ValidationWeightVector <- NULL
    }
    if(!is.null(TestData.)) {
      TestWeightVector <- TestData.[[Weights.]]
      TestData.[, eval(Weights.) := NULL]
    } else {
      TestWeightVector <- NULL
    }
  } else {
    TrainWeightVector <- NULL
    ValidationWeightVector <- NULL
    TestWeightVector <- NULL
  }

  # Build catboost pools; cat_features are recomputed (0-indexed) from column types
  if(!is.null(CatFeatures.) || length(CatFeatures.) > 0) {
    cats <- unique(c(as.numeric(which(unlist(lapply(dataTrain., is.factor)))) - 1L, as.numeric(which(unlist(lapply(dataTrain., is.character)))) - 1L))
    if(!is.null(TestData.)) {
      TrainPool <- catboost::catboost.load_pool(dataTrain., label = TrainTarget., cat_features = cats, weight = TrainWeightVector)
      if(!TrainOnFull.) {
        TestPool <- catboost::catboost.load_pool(dataTest., label = TestTarget., cat_features = cats, weight = ValidationWeightVector)
        FinalTestPool <- catboost::catboost.load_pool(TestData., label = FinalTestTarget., cat_features = cats, weight = TestWeightVector)
      }
    } else {
      TrainPool <- catboost::catboost.load_pool(dataTrain., label = TrainTarget., cat_features = cats, weight = TrainWeightVector)
      if(!TrainOnFull.) {
        TestPool <- catboost::catboost.load_pool(dataTest., label = TestTarget., cat_features = cats, weight = ValidationWeightVector)
      }
    }
  } else {
    if(!is.null(TestData.)) {
      TrainPool <- catboost::catboost.load_pool(dataTrain., label = TrainTarget., weight = TrainWeightVector)
      if(!TrainOnFull.) {
        TestPool <- catboost::catboost.load_pool(dataTest., label = TestTarget., weight = ValidationWeightVector)
        FinalTestPool <- catboost::catboost.load_pool(TestData., label = FinalTestTarget., weight = TestWeightVector)
      }
    } else {
      TrainPool <- catboost::catboost.load_pool(dataTrain., label = TrainTarget., weight = TrainWeightVector)
      if(!TrainOnFull. && length(TestTarget.) > 0L) TestPool <- catboost::catboost.load_pool(dataTest., label = TestTarget., weight = ValidationWeightVector) else TestPool <- NULL
    }
  }

  # Default any pool not created above to NULL.
  # Bug fix: inherits = FALSE so exists() only looks in this frame and cannot
  # be satisfied by a global variable that happens to share the name.
  if(!exists("TestPool", inherits = FALSE)) TestPool <- NULL
  if(!exists("FinalTestPool", inherits = FALSE)) FinalTestPool <- NULL

  # Return
  return(list(TrainPool = TrainPool, TestPool = TestPool, FinalTestPool = FinalTestPool))
}

#' @title CatBoostFinalParams
#'
#' @description Construct the final list of CatBoost training parameters from a passed-in grid row, grid-tuning results, or user-supplied defaults
#'
#' @author Adrian
#' @family CatBoost Helpers
#'
#' @param ModelType 'regression', 'classification', 'multiclass', 'vector'
#' @param UseBestModel. Passthrough
#' @param ClassWeights. Passthrough
#' @param TargetColumnName. Passthrough
#' @param PassInGrid. Passthrough
#' @param BestGrid. Passthrough
#' @param ExperimentalGrid. Passthrough
#' @param GridTune. Passthrough
#' @param TrainOnFull. Passthrough
#' @param MetricPeriods. Passthrough
#' @param LossFunction. Passthrough
#' @param EvalMetric. Passthrough
#' @param score_function. Passthrough
#' @param HasTime. Passthrough
#' @param task_type. Passthrough
#' @param NumGPUs. Passthrough
#' @param NTrees. Passthrough
#' @param Depth. Passthrough
#' @param LearningRate. Passthrough
#' @param L2_Leaf_Reg. Passthrough
#' @param langevin. Passthrough
#' @param diffusion_temperature. Passthrough
#' @param sampling_unit. Passthrough
#' @param RandomStrength. Passthrough
#' @param BorderCount. Passthrough
#' @param RSM. Passthrough
#' @param GrowPolicy. Passthrough
#' @param BootStrapType. Passthrough
#' @param model_size_reg. Passthrough
#' @param feature_border_type. Passthrough
#' @param subsample. Passthrough
#' @param min_data_in_leaf. Passthrough
#'
#' @noRd
CatBoostFinalParams <- function(ModelType = "classification",
                                UseBestModel. = UseBestModel,
                                ClassWeights. = ClassWeights,
                                PassInGrid. = PassInGrid,
                                ExperimentalGrid. = ExperimentalGrid,
                                BestGrid. = BestGrid,
                                GridTune. = GridTune,
                                TrainOnFull. = TrainOnFull,
                                MetricPeriods. = MetricPeriods,
                                LossFunction. = LossFunction,
                                EvalMetric. = EvalMetric,
                                score_function. = score_function,
                                HasTime. = HasTime,
                                task_type. = task_type,
                                NumGPUs. = NumGPUs,
                                NTrees. = Trees,
                                Depth. = Depth,
                                LearningRate. = LearningRate,
                                L2_Leaf_Reg. = L2_Leaf_Reg,
                                langevin. = langevin,
                                diffusion_temperature. = diffusion_temperature,
                                sampling_unit. = sampling_unit,
                                RandomStrength. = RandomStrength,
                                BorderCount. = BorderCount,
                                RSM. = RSM,
                                GrowPolicy. = GrowPolicy,
                                BootStrapType. = BootStrapType,
                                model_size_reg. = model_size_reg,
                                feature_border_type. = feature_border_type,
                                subsample. = subsample,
                                min_data_in_leaf. = min_data_in_leaf) {

  # Builds the `params` list handed to catboost. Three mutually exclusive
  # sources, evaluated in this order:
  #   1) PassInGrid.  - a single winning row from a prior grid-tuning run
  #   2) GridTune.    - the BestGrid. row produced by this run's grid tune
  #   3) neither      - the user-supplied argument defaults
  # Note: assigning NULL to a list element removes it, so
  # `base_params[["x"]] <- NULL` intentionally omits that parameter and lets
  # catboost fall back to its own default.
  # NOTE(review): if GridTune. && TrainOnFull. with PassInGrid. NULL, no branch
  # below assigns base_params and the final return() would error with
  # "object 'base_params' not found" - confirm callers never hit that combo.

  # Define parameters for case where you pass in a winning GridMetrics from grid tuning
  if(!is.null(PassInGrid.)) {
    if(PassInGrid.[,.N] > 1L) stop("PassInGrid needs to be a single row data.table")
    # BanditProbs_Grid_1 == -10 marks a placeholder / invalid grid row; treat as no grid passed in
    if(PassInGrid.[, BanditProbs_Grid_1] == -10) {
      PassInGrid. <- NULL

    } else {

      # Base Parameters
      base_params <- list()
      # class_weights only applies to classifiers; removed (NULL) otherwise
      base_params[["class_weights"]] <- if(ModelType %chin% c("classification","multiclass")) ClassWeights. else NULL
      base_params[["use_best_model"]] <- UseBestModel.
      base_params[["best_model_min_trees"]] <- 10L
      base_params[["allow_writing_files"]] <- FALSE
      base_params[["thread_count"]] <- parallel::detectCores()

      # Additional Parameters (hyperparameters come from the winning grid row)
      base_params[["iterations"]] <- PassInGrid.[["NTrees"]]
      base_params[["depth"]] <- PassInGrid.[["Depth"]]
      base_params[["langevin"]] <- langevin.
      # diffusion_temperature is only meaningful when langevin boosting is on
      base_params[["diffusion_temperature"]] <- if(langevin.) diffusion_temperature. else NULL
      base_params[["learning_rate"]] <- PassInGrid.[["LearningRate"]]

      base_params[["l2_leaf_reg"]] <- PassInGrid.[["L2_Leaf_Reg"]]
      base_params[["random_strength"]] <- PassInGrid.[["RandomStrength"]]
      base_params[["border_count"]] <- PassInGrid.[["BorderCount"]]
      base_params[["rsm"]] <- PassInGrid.[["RSM"]]
      base_params[["sampling_unit"]] <- sampling_unit.

      # Speedup
      base_params[["metric_period"]] <- MetricPeriods.

      # Style of model
      base_params[["grow_policy"]] <- PassInGrid.[["GrowPolicy"]]
      base_params[["bootstrap_type"]] <- PassInGrid.[["BootStrapType"]]

      # Loss functions
      base_params[["loss_function"]] <- LossFunction.
      base_params[["eval_metric"]] <- EvalMetric.
      base_params[["score_function"]] <- score_function.

      # Data ordering for quality improvement
      base_params[["has_time"]] <- HasTime.

      # Hardware
      base_params[["task_type"]] <- task_type.
      base_params[["devices"]] <- NumGPUs.

      # Categorical Feature Args
      base_params[["model_size_reg"]] <- model_size_reg.

      # Numerical Feature Args
      base_params[["feature_border_type"]] <- feature_border_type.
      # subsample is incompatible with Bayesian / No bootstrap; omit it there
      base_params[["subsample"]] <- if(any(PassInGrid.[["BootStrapType"]] %chin% c("Bayesian","No"))) NULL else if(!is.null(subsample.)) subsample. else NULL
      # min_data_in_leaf is not supported with the SymmetricTree grow policy
      base_params[["min_data_in_leaf"]] <- if(PassInGrid.[["GrowPolicy"]] %chin% c("SymmetricTree")) NULL else if(!is.null(min_data_in_leaf.)) min_data_in_leaf. else NULL
    }
  }

  # Define parameters for case where you want to run grid tuning
  if(GridTune. && !TrainOnFull.) {

    # BaseCase check: if grid tuned and default is best, set BaseCase to TRUE
    # When that happens, need to select which default values to utilize for params
    if(BestGrid.[["RunNumber"]] == 1) BaseCase <- TRUE else BaseCase <- FALSE

    # Base Parameters
    base_params <- list()
    base_params[["class_weights"]] <- if(ModelType %chin% c("classification","multiclass")) ClassWeights. else NULL
    base_params[["use_best_model"]] <- UseBestModel.
    base_params[["best_model_min_trees"]] <- 10L
    base_params[["allow_writing_files"]] <- FALSE
    base_params[["thread_count"]] <- parallel::detectCores()

    # Additional Parameters
    # "aa" appears to be a sentinel GrowPolicy value marking rows whose
    # hyperparameters were not tuned - TODO confirm against the grid builder
    base_params[["iterations"]] <- if(BestGrid.[["GrowPolicy"]] == "aa" || BaseCase) max(NTrees.) else BestGrid.[["NTrees"]]
    base_params[["depth"]] <- if(BestGrid.[["GrowPolicy"]] == "aa" || BaseCase) NULL else BestGrid.[["Depth"]]
    base_params[["langevin"]] <- langevin.
    base_params[["diffusion_temperature"]] <- if(langevin.) diffusion_temperature. else NULL
    base_params[["learning_rate"]] <- if(BestGrid.[["GrowPolicy"]] == "aa" || BaseCase) NULL else BestGrid.[["LearningRate"]]

    base_params[["l2_leaf_reg"]] <- if(BestGrid.[["GrowPolicy"]] == "aa" || BaseCase) NULL else BestGrid.[["L2_Leaf_Reg"]]
    base_params[["random_strength"]] <- if(BestGrid.[["GrowPolicy"]] == "aa" || BaseCase) NULL else BestGrid.[["RandomStrength"]]
    base_params[["border_count"]] <- if(BestGrid.[["GrowPolicy"]] == "aa" || BaseCase) NULL else BestGrid.[["BorderCount"]]
    # RSM == -1 also marks an untuned value; drop it so catboost uses its default
    base_params[["rsm"]] <- if(BestGrid.[["GrowPolicy"]] == "aa" || BestGrid.[["RSM"]] == -1 || BaseCase) NULL else BestGrid.[["RSM"]]
    base_params[["sampling_unit"]] <- sampling_unit.

    # Speedup
    base_params[["metric_period"]] <- MetricPeriods.

    # Style of model
    base_params[["grow_policy"]] <- if(BestGrid.[["GrowPolicy"]] == "aa" || BaseCase) "SymmetricTree" else BestGrid.[["GrowPolicy"]]
    base_params[["bootstrap_type"]] <- if(BestGrid.[["GrowPolicy"]] == "aa" || BaseCase) NULL else BestGrid.[["BootStrapType"]]

    # Loss functions
    base_params[["loss_function"]] <- LossFunction.
    base_params[["eval_metric"]] <- EvalMetric.
    base_params[["score_function"]] <- score_function.

    # Data ordering for quality improvement
    base_params[["has_time"]] <- HasTime.

    # Hardware
    base_params[["task_type"]] <- task_type.
    base_params[["devices"]] <- NumGPUs.

    # Categorical Feature Args
    base_params[["model_size_reg"]] <- model_size_reg.

    # Numerical Feature Args
    base_params[["feature_border_type"]] <- feature_border_type.
    # Note: unlike the PassInGrid branch, these two use the function arguments
    # (BootStrapType. / GrowPolicy.) rather than the BestGrid. values
    base_params[["subsample"]] <- if(any(BootStrapType. %chin% c("Bayesian","No"))) NULL else if(!is.null(subsample.)) subsample. else NULL
    base_params[["min_data_in_leaf"]] <- if(GrowPolicy. %chin% c("SymmetricTree")) NULL else if(!is.null(min_data_in_leaf.)) min_data_in_leaf. else NULL
  }

  # Define parameters Not pass in GridMetric and not grid tuning
  if(is.null(PassInGrid.) && !GridTune.) {


    # Base Parameters
    base_params <- list()
    base_params[["class_weights"]] <- if(ModelType %chin% c("classification","multiclass")) ClassWeights. else NULL
    base_params[["use_best_model"]] <- UseBestModel.
    base_params[["best_model_min_trees"]] <- 10L
    base_params[["allow_writing_files"]] <- FALSE
    base_params[["thread_count"]] <- parallel::detectCores()

    # Additional Parameters (straight from the function arguments)
    base_params[["iterations"]] <- NTrees.
    base_params[["depth"]] <- Depth.
    base_params[["langevin"]] <- langevin.
    base_params[["diffusion_temperature"]] <- if(langevin.) diffusion_temperature. else NULL
    base_params[["learning_rate"]] <- LearningRate.

    base_params[["l2_leaf_reg"]] <- L2_Leaf_Reg.
    base_params[["random_strength"]] <- RandomStrength.
    base_params[["border_count"]] <- BorderCount.
    base_params[["rsm"]] <- RSM.
    base_params[["sampling_unit"]] <- sampling_unit.

    # Speedup
    base_params[["metric_period"]] <- MetricPeriods.

    # Style of model
    base_params[["grow_policy"]] <- GrowPolicy.
    base_params[["bootstrap_type"]] <- BootStrapType.

    # Loss functions
    base_params[["loss_function"]] <- LossFunction.
    base_params[["eval_metric"]] <- EvalMetric.
    base_params[["score_function"]] <- score_function.

    # Data ordering for quality improvement
    base_params[["has_time"]] <- HasTime.

    # Hardware
    base_params[["task_type"]] <- task_type.
    base_params[["devices"]] <- NumGPUs.

    # Categorical Feature Args
    base_params[["model_size_reg"]] <- model_size_reg.

    # Numerical Feature Args
    base_params[["feature_border_type"]] <- feature_border_type.
    base_params[["subsample"]] <- if(any(BootStrapType. %chin% c("Bayesian","No"))) NULL else if(!is.null(subsample.)) subsample. else NULL
    base_params[["min_data_in_leaf"]] <- if(GrowPolicy. %chin% c("SymmetricTree")) NULL else if(!is.null(min_data_in_leaf.)) min_data_in_leaf. else NULL
  }

  # Return params
  return(base_params)
}

#' @title CatBoostValidationData
#'
#' @description Return validation data with predictions and save to file if requested
#'
#' @author Adrian Antico
#' @family CatBoost Helpers
#'
#' @param ModelType = "classification",
#' @param TestDataCheck = !is.null(TestData),
#' @param TrainOnFull. Passthrough
#' @param FinalTestTarget. Passthrough
#' @param TestTarget. Passthrough
#' @param TrainTarget. Passthrough
#' @param TrainMerge. Passthrough
#' @param TestMerge. Passthrough
#' @param dataTest. Passthrough
#' @param data. Passthrough
#' @param predict. Passthrough
#' @param TargetColumnName. Passthrough
#' @param SaveModelObjects. Passthrough
#' @param metadata_path. Passthrough
#' @param model_path. Passthrough
#' @param ModelID. Passthrough
#' @param LossFunction. Passthrough regression
#' @param TransformNumericColumns. Passthrough regression
#' @param GridTune. Passthrough regression
#' @param TransformationResults. Passthrough regression
#' @param TargetLevels. Passthrough multiclass
#' @param Debug = FALSE
#'
#' @noRd
CatBoostValidationData <- function(ModelType = "classification",
                                   TrainOnFull. = TrainOnFull,
                                   TestDataCheck = FALSE,
                                   FinalTestTarget. = FinalTestTarget,
                                   TestTarget. = TestTarget,
                                   TrainTarget. = TrainTarget,
                                   TrainMerge. = NULL,
                                   TestMerge. = TestMerge,
                                   dataTest. = dataTest,
                                   data. = data,
                                   predict. = predict,
                                   TargetColumnName. = TargetColumnName,
                                   SaveModelObjects. = SaveModelObjects,
                                   metadata_path. = metadata_path,
                                   model_path. = model_path,
                                   ModelID. = ModelID,
                                   LossFunction. = LossFunction,
                                   TransformNumericColumns. = TransformNumericColumns,
                                   GridTune. = GridTune,
                                   TransformationResults. = TransformationResults,
                                   TargetLevels.=TargetLevels,
                                   Debug = FALSE) {

  # Assembles a validation (or training) scoring data.table for the given
  # ModelType by binding targets, features, and predictions, optionally
  # back-transforming targets/predictions (regression) or re-mapping factor
  # levels (multiclass), and writes the result to csv when SaveModelObjects.
  # Files go to metadata_path. when supplied, otherwise model_path.

  # Classification
  if(ModelType == "classification") {
    if(!TrainOnFull.) {
      if(TestDataCheck) {
        # TestMerge. already carries the target column; just attach p1
        ValidationData <- data.table::as.data.table(cbind(TestMerge., p1 = predict.))
      } else {
        ValidationData <- data.table::as.data.table(cbind(Target = TestTarget., dataTest., p1 = predict.))
        data.table::setnames(ValidationData, "Target", eval(TargetColumnName.), skip_absent = TRUE)
      }
      if(SaveModelObjects.) {
        if(!is.null(metadata_path.)) {
          data.table::fwrite(ValidationData, file = file.path(metadata_path., paste0(ModelID., "_ValidationData.csv")))
        } else {
          data.table::fwrite(ValidationData, file = file.path(model_path., paste0(ModelID., "_ValidationData.csv")))
        }
      }
    } else if(!is.null(TrainMerge.)) {
      ValidationData <- data.table::as.data.table(cbind(TrainMerge., predict.))
      if(SaveModelObjects.) {
        if(!is.null(metadata_path.)) {
          data.table::fwrite(ValidationData, file = file.path(metadata_path., paste0(ModelID., "_TrainData.csv")))
        } else {
          data.table::fwrite(ValidationData, file = file.path(model_path., paste0(ModelID., "_TrainData.csv")))
        }
      }
    } else {
      ValidationData <- data.table::as.data.table(cbind(Target = TrainTarget., data., p1 = predict.))
      data.table::setnames(ValidationData, "Target", eval(TargetColumnName.), skip_absent = TRUE)
      if(SaveModelObjects.) {
        if(!is.null(metadata_path.)) {
          data.table::fwrite(ValidationData, file = file.path(metadata_path., paste0(ModelID., "_TrainData.csv")))
        } else {
          data.table::fwrite(ValidationData, file = file.path(model_path., paste0(ModelID., "_TrainData.csv")))
        }
      }
    }

    # Return
    return(ValidationData)
  }

  # Regression
  if(ModelType == "regression") {

    # Generate validation data
    if(!TrainOnFull.) {
      if(TestDataCheck) {
        # Collapsed former duplicate branches: both built the same table and
        # only the single-target case renamed "Target" (skip_absent guards
        # the case where TestMerge. has no "Target" column)
        ValidationData <- data.table::as.data.table(cbind(TestMerge., Predict = predict.))
        if(length(TargetColumnName.) == 1L) data.table::setnames(ValidationData, "Target", TargetColumnName., skip_absent = TRUE)
      } else {
        ValidationData <- data.table::as.data.table(cbind(Target = TestTarget., dataTest., Predict = predict.))
        if(length(TargetColumnName.) > 1L) {
          # Vector models: leading columns are the multiple targets
          data.table::setnames(ValidationData, c(names(ValidationData)[seq_along(TargetColumnName.)]), c(TargetColumnName.))
        } else {
          data.table::setnames(ValidationData, "Target", TargetColumnName.)
        }
      }
    } else {
      if(!is.null(dataTest.)) {
        ValidationData <- data.table::as.data.table(cbind(dataTest., Predict = predict.))
      } else if(!is.null(TrainMerge.)) {
        ValidationData <- data.table::as.data.table(cbind(TrainMerge., predict.))
      } else {
        ValidationData <- data.table::as.data.table(cbind(Target = TrainTarget., data., Predict = predict.))
        if(length(TargetColumnName.) > 1L) {
          data.table::setnames(ValidationData, c(names(ValidationData)[seq_along(TargetColumnName.)]), c(TargetColumnName.))
        } else {
          data.table::setnames(ValidationData, "Target", TargetColumnName.)
        }
      }
      # Drop internal bookkeeping column if present
      if("ID_Factorizer" %chin% names(ValidationData)) data.table::set(ValidationData, j = "ID_Factorizer", value = NULL)
    }

    # Back transform before running metrics and plots
    if(!is.null(TransformNumericColumns.)) {
      if(GridTune. && !TrainOnFull.) TransformationResults. <- TransformationResults.[ColumnName != "Predicted"]
      if(length(TargetColumnName.) == 1L) {

        # Prepare transformation object: apply the target's transform to Predict too
        TransformationResults. <- data.table::rbindlist(list(
          TransformationResults.,
          data.table::data.table(
            ColumnName = c("Predict"),
            MethodName = TransformationResults.[ColumnName == eval(TargetColumnName.), MethodName],
            Lambda = TransformationResults.[ColumnName == eval(TargetColumnName.), Lambda],
            NormalizedStatistics = 0L)))
        # Guard against duplicate ColumnName entries in the transform table
        if(length(unique(TransformationResults.[["ColumnName"]])) != nrow(TransformationResults.)) {
          temp <- TransformationResults.[, .N, by = "ColumnName"][N != 1L][[1L]]
          if(!is.null(ValidationData)) temp1 <- which(names(ValidationData) == temp)[1L]
          if(!TrainOnFull.) {
            ValidationData[, eval(names(data.)[temp1]) := NULL]
          } else {
            # TrainOnFull. is TRUE on this path; the former nested
            # `if(TrainOnFull.)` always ran, so its unreachable else-branch
            # (dropping names(data.)[temp]) was removed
            if(length(which(names(data.) %chin% eval(TargetColumnName.))) > 1L) {
              temp1 <- which(names(data.) %chin% eval(TargetColumnName.))[1L]
              data.[, which(names(data.) %chin% eval(TargetColumnName.))[2L] := NULL]
            }
          }
          TransformationResults. <- TransformationResults.[, ID := 1L:.N][ID != max(ID)]
        }

        # Back transform
        ValidationData <- Rodeo::AutoTransformationScore(
          ScoringData = ValidationData,
          Type = "Inverse",
          FinalResults = TransformationResults.,
          TransID = NULL,
          Path = NULL)

      } else {

        # Prepare transformation object: duplicate rows, renaming the copies
        # to the Predict.V* columns produced by vector models
        TransformationResults. <- data.table::rbindlist(list(TransformationResults., TransformationResults.))
        for(z in seq_along(TargetColumnName.)) TransformationResults.[length(TargetColumnName.) + z, ColumnName := paste0("Predict.V",z)]

        # Back transform
        ValidationData <- Rodeo::AutoTransformationScore(
          ScoringData = ValidationData,
          Type = "Inverse",
          FinalResults = TransformationResults.,
          TransID = NULL,
          Path = NULL)
      }
    }

    # Save validation data
    if(SaveModelObjects. && is.null(TrainMerge.)) {
      if(!is.null(metadata_path.)) {
        data.table::fwrite(ValidationData, file = file.path(metadata_path., paste0(ModelID., "_ValidationData.csv")))
      } else {
        data.table::fwrite(ValidationData, file = file.path(model_path., paste0(ModelID., "_ValidationData.csv")))
      }
    } else if(SaveModelObjects. && !is.null(TrainMerge.)) {
      if(!is.null(metadata_path.)) {
        data.table::fwrite(ValidationData, file = file.path(metadata_path., paste0(ModelID., "_TrainData.csv")))
      } else {
        data.table::fwrite(ValidationData, file = file.path(model_path., paste0(ModelID., "_TrainData.csv")))
      }
    }

    # Return
    return(ValidationData)
  }

  # MultiClass
  if(ModelType == "multiclass") {

    if(Debug) print('Start Here 1')

    # MultiClass Grid Validation Data ----
    if(TestDataCheck) {
      if(Debug) print('Start Here 2')
      zz <- unique(names(TestMerge.))
      zz <- zz[!zz %chin% TargetColumnName.]
      ValidationData <- data.table::as.data.table(cbind(Target = FinalTestTarget., predict., TestMerge.[, .SD, .SDcols = zz]))
    } else if(!TrainOnFull. && length(TestTarget.) == predict.[, .N]) {
      # was `&`: scalar if() condition requires the short-circuit operator
      if(Debug) print('Start Here 3')
      ValidationData <- data.table::as.data.table(cbind(Target = TestTarget., predict.))
    } else {
      if(Debug) print('Start Here 4')
      if(!is.null(TrainMerge.)) {
        if(Debug) print('Start Here 5')
        ValidationData <- data.table::as.data.table(cbind(TrainMerge., predict.))
        data.table::setnames(ValidationData, TargetColumnName., 'Target')
        data.table::set(ValidationData, j = 'Target', value = as.integer(ValidationData[['Target']]))
      } else {
        if(Debug) print('Start Here 6')
        ValidationData <- data.table::as.data.table(cbind(Target = TrainTarget., predict.))
      }
    }
    if(Debug) print('Start Here 7')
    # Map integer-coded levels back to their original labels. TargetLevels.
    # holds the NewLevels -> OriginalLevels lookup; V1 is the predicted class
    if(TrainOnFull.) {
      if(Debug) print('Start Here 8')
      ValidationData <- merge(
        ValidationData,
        TargetLevels.,
        by.x = "Target",
        by.y = "NewLevels",
        all = FALSE)
      if(Debug) print('Start Here 9')
      ValidationData[, Target := OriginalLevels][, OriginalLevels := NULL]
      if(Debug) print('Start Here 10')
      ValidationData <- merge(
        ValidationData,
        TargetLevels.,
        by.x = "V1",
        by.y = "NewLevels",
        all = FALSE)
      if(Debug) print('Start Here 11')
      ValidationData[, V1 := OriginalLevels][, OriginalLevels := NULL]
    } else if(is.null(TrainMerge.)) {
      if(Debug) print('Start Here 12')
      ValidationData <- merge(
        ValidationData,
        TargetLevels.,
        by.x = "V1",
        by.y = "NewLevels",
        all = FALSE)
      if(Debug) print('Start Here 13')
      ValidationData[, V1 := OriginalLevels][, OriginalLevels := NULL]
      if(Debug) print('Start Here 14')
      ValidationData <- merge(
        ValidationData,
        TargetLevels.,
        by.x = "Target",
        by.y = "NewLevels",
        all = FALSE)
      if(Debug) print('Start Here 15')
      ValidationData[, Target := OriginalLevels][, OriginalLevels := NULL]
    }

    # MultiClass Update Names for Predicted Value Columns ----
    # V2..V(k+1) hold per-class probabilities; rename them to the class labels
    if(Debug) print('Start Here 16')
    k <- 1L
    for(name in as.character(TargetLevels.[[1L]])) {
      k <- k + 1L
      data.table::setnames(ValidationData, paste0("V", k), name)
    }
    if(Debug) print('Start Here 17')
    if(!all(c("V1","Target") %in% names(ValidationData)) && !TrainOnFull. || !is.null(TrainMerge.)) {
      if(Debug) print('Start Here 18')
      data.table::setnames(ValidationData, c("V1","Target"), c("Predict", eval(TargetColumnName.)), skip_absent = TRUE)
      if(Debug) print('Start Here 19')
      if(!class(ValidationData[[eval(TargetColumnName.)]])[[1L]] %in% c('character','factor')) data.table::set(ValidationData, j = eval(TargetColumnName.), value = as.integer(ValidationData[[eval(TargetColumnName.)]]))
      if(Debug) print('Start Here 20')
      # BUGFIX: previously sourced the value from the target column, which
      # overwrote predictions with target values; now coerces Predict itself
      if(!class(ValidationData[['Predict']])[[1L]] %in% c('character','factor')) data.table::set(ValidationData, j = 'Predict', value = as.integer(ValidationData[['Predict']]))
      if(Debug) print('Start Here 21')
      if(!is.null(TrainMerge.)) {
        if(!class(ValidationData[['Predict']])[[1L]] %in% c('character','factor')) {
          print('Start Here 22')
          ValidationData <- merge(
            ValidationData,
            TargetLevels.,
            by.x = "Predict",
            by.y = "NewLevels",
            all = FALSE)
          ValidationData[, Predict := OriginalLevels][, OriginalLevels := NULL]
        }
        if(Debug) print('Start Here 24')
        # Move the per-class probability columns directly after the key column
        data.table::setcolorder(ValidationData, c(1L, (ncol(ValidationData)-TargetLevels.[,.N]+1L):ncol(ValidationData), 2L:(ncol(ValidationData)-TargetLevels.[,.N])))
        if(Debug) print('Start Here 25')
        if(!class(ValidationData[[eval(TargetColumnName.)]])[[1L]] %in% c('character','factor')) {
          ValidationData <- merge(
            ValidationData,
            TargetLevels.,
            by.x = TargetColumnName.,
            by.y = "NewLevels",
            all = FALSE)
          if(Debug) print('Start Here 26')
          ValidationData[, eval(TargetColumnName.) := OriginalLevels][, OriginalLevels := NULL]
        }
      }
    } else {
      if(Debug) print('Start Here 27')
      data.table::setnames(ValidationData, c("V1","Target"), c("Predict", eval(TargetColumnName.)))
      if(Debug) print('Start Here 28')
      data.table::set(ValidationData, j = eval(TargetColumnName.), value = as.character(ValidationData[[eval(TargetColumnName.)]]))
    }
    if(Debug) print('Start Here 29')
    data.table::set(ValidationData, j = "Predict", value = as.character(ValidationData[["Predict"]]))

    # Save validation data
    if(SaveModelObjects. && is.null(TrainMerge.)) {
      if(!is.null(metadata_path.)) {
        data.table::fwrite(ValidationData, file = file.path(metadata_path., paste0(ModelID., "_ValidationData.csv")))
      } else {
        data.table::fwrite(ValidationData, file = file.path(model_path., paste0(ModelID., "_ValidationData.csv")))
      }
    } else if(SaveModelObjects. && !is.null(TrainMerge.)) {
      if(!is.null(metadata_path.)) {
        data.table::fwrite(ValidationData, file = file.path(metadata_path., paste0(ModelID., "_TrainData.csv")))
      } else {
        data.table::fwrite(ValidationData, file = file.path(model_path., paste0(ModelID., "_TrainData.csv")))
      }
    }

    # Return
    return(ValidationData)
  }
}

#' @title CatBoostImportances
#'
#' @description Generate variable importance, interaction importance, and shap values
#'
#' @author Adrian
#' @family CatBoost Helpers
#'
#' @param ModelType 'regression', 'classification', or 'multiclass'
#' @param TargetColumnName. Passthrough
#' @param TrainPool. Passthrough
#' @param TestPool. Passthrough
#' @param FinalTestPool. Passthrough
#' @param TrainData. Passthrough
#' @param ValidationData. Passthrough
#' @param SaveModelObjects. Passthrough
#' @param model. Passthrough
#' @param ModelID. Passthrough
#' @param model_path. Passthrough
#' @param metadata_path. Passthrough
#' @param GrowPolicy. = GrowPolicy
#' @param ReturnShap = TRUE
#'
#' @noRd
CatBoostImportances <- function(ModelType = "regression",
                                TargetColumnName.=TargetColumnName,
                                TrainPool. = TrainPool,
                                TestPool. = TestPool,
                                FinalTestPool. = FinalTestPool,
                                TrainData. = NULL,
                                ValidationData. = ValidationData,
                                SaveModelObjects. = SaveModelObjects,
                                model. = model,
                                ModelID. = ModelID,
                                model_path. = model_path,
                                metadata_path. = metadata_path,
                                GrowPolicy. = GrowPolicy,
                                ReturnShap = TRUE) {

  # Computes variable importance, pairwise interaction importance, and
  # (optionally) shap values for each non-NULL pool, writes them to csv when
  # SaveModelObjects., and returns all three as a named list. CatBoost only
  # supports these importance types for the SymmetricTree-style policies, so
  # Depthwise/Lossguide models return NULLs.
  if(!GrowPolicy. %chin% c("Depthwise","Lossguide")) {

    # Interaction Importance
    InteractionList <- list()
    if(!is.null(TrainPool.)) InteractionList[["Train_Interaction"]] <- data.table::as.data.table(catboost::catboost.get_feature_importance(model., pool = TrainPool., type = "Interaction"))
    if(!is.null(TestPool.)) InteractionList[["Validation_Interaction"]] <- data.table::as.data.table(catboost::catboost.get_feature_importance(model., pool = TestPool., type = "Interaction"))
    if(!is.null(FinalTestPool.)) InteractionList[["Test_Interaction"]] <- data.table::as.data.table(catboost::catboost.get_feature_importance(model., pool = FinalTestPool., type = "Interaction"))

    # Importance
    ImportanceList <- list()
    if(!is.null(TrainPool.)) ImportanceList[["Train_Importance"]] <- catboost::catboost.get_feature_importance(model., pool = TrainPool., type = "PredictionValuesChange")
    if(!is.null(TestPool.)) ImportanceList[["Validation_Importance"]] <- catboost::catboost.get_feature_importance(model., pool = TestPool., type = "PredictionValuesChange")
    if(!is.null(FinalTestPool.)) ImportanceList[["Test_Importance"]] <- catboost::catboost.get_feature_importance(model., pool = FinalTestPool., type = "PredictionValuesChange")
    if(length(ImportanceList) > 0L) {
      # Prepend feature names (stored as rownames by catboost)
      for(i in names(ImportanceList)) ImportanceList[[i]] <- data.table::data.table(cbind(Variable = rownames(ImportanceList[[i]]), ImportanceList[[i]]))
    }

    # Shap Values (single-target, non-multiclass models only)
    if(ReturnShap) {
      ShapList <- list()
      if(ModelType != "multiclass" && length(TargetColumnName.) == 1L) {
        if(!is.null(TrainPool.)) ShapList[["Train_Shap"]] <- data.table::as.data.table(catboost::catboost.get_feature_importance(model., pool = TrainPool., type = "ShapValues"))
        if(!is.null(TestPool.)) {
          # Fold validation shap rows into the train shap table
          ShapList[["Validation_Shap"]] <- data.table::as.data.table(catboost::catboost.get_feature_importance(model., pool = TestPool., type = "ShapValues"))
          ShapList[["Train_Shap"]] <- data.table::rbindlist(list(ShapList$Train_Shap, ShapList$Validation_Shap))
          ShapList$Validation_Shap <- NULL
        }
        if(!is.null(FinalTestPool.)) ShapList[["Test_Shap"]] <- data.table::as.data.table(catboost::catboost.get_feature_importance(model., pool = FinalTestPool., type = "ShapValues"))
      }
    } else {
      ShapList <- list()
    }

    # Feature-name / zero-based-index lookup, used to label shap columns and
    # resolve interaction indices. Guarded: previously this errored with a
    # subscript failure when every pool was NULL (ImportanceList empty)
    if(length(ImportanceList) > 0L) {
      temp <- data.table::data.table(ColNames = ImportanceList[[1L]][[1L]])[, Index := 0:(.N - 1)]
    } else {
      temp <- NULL
    }
    if(length(ShapList) > 0L) {
      for(i in names(ShapList)) {
        data.table::setnames(ShapList[[i]], names(ShapList[[i]]), c(paste0("Shap_", temp[[1L]]), "Predictions"), skip_absent = TRUE)
        data.table::set(ShapList[[i]], j = "Predictions", value = NULL)
        # Attach the shap columns to the matching source data
        if(i == "Test_Shap") ShapList[[i]] <- cbind(ValidationData., ShapList[[i]]) else if(!is.null(TrainData.)) ShapList[[i]] <- cbind(TrainData., ShapList[[i]]) else ShapList[[i]] <- cbind(ValidationData., ShapList[[i]])
      }
    }

    # Gather interaction importances: resolve feature indices to names and
    # sort by interaction score descending. (The former length check inside
    # the loop was redundant - an empty list never enters the loop.)
    for(i in names(InteractionList)) {
      InteractionList[[i]] <- merge(InteractionList[[i]], temp, by.x = "feature1_index", by.y = "Index", all = FALSE)
      data.table::setnames(InteractionList[[i]], "ColNames", "Features1")
      InteractionList[[i]] <- merge(InteractionList[[i]], temp, by.x = "feature2_index", by.y = "Index", all = FALSE)
      data.table::setnames(InteractionList[[i]], "ColNames", "Features2")
      InteractionList[[i]][, ":=" (feature2_index = NULL, feature1_index = NULL)]
      data.table::setcolorder(InteractionList[[i]], c(2L, 3L, 1L))
      data.table::setorderv(InteractionList[[i]], "score", -1)
    }

    # Structure and optionally save variable importances
    for(i in names(ImportanceList)) {
      # Column name differs by catboost version/output shape; fall back to V1
      tryCatch({data.table::setnames(ImportanceList[[i]], "V2", "Importance")}, error = function(x) data.table::setnames(ImportanceList[[i]], "V1", "Importance"))
      ImportanceList[[i]][, Importance := round(as.numeric(Importance), 4L)]
      ImportanceList[[i]] <- ImportanceList[[i]][order(-Importance)]
      if(SaveModelObjects.) {
        if(!is.null(metadata_path.)) {
          data.table::fwrite(ImportanceList[[i]], file = file.path(metadata_path., paste0(ModelID., "_", i, "_VariableImportance.csv")))
        } else {
          data.table::fwrite(ImportanceList[[i]], file = file.path(model_path., paste0(ModelID., "_", i, "_VariableImportance.csv")))
        }
      }
    }

    # Save shap values
    if(SaveModelObjects. && length(ShapList) > 0L) {
      for(i in names(ShapList)) {
        if(!is.null(metadata_path.)) {
          if(!is.null(ShapList[[i]])) data.table::fwrite(ShapList[[i]], file = file.path(metadata_path., paste0(ModelID., "_", i, "_ShapValues.csv")))
        } else {
          if(!is.null(ShapList[[i]])) data.table::fwrite(ShapList[[i]], file = file.path(model_path., paste0(ModelID., "_", i, "_ShapValues.csv")))
        }
      }
    }

    # Save interaction importances
    if(SaveModelObjects.) {
      for(i in names(InteractionList)) {
        if(!is.null(metadata_path.)) {
          if(!is.null(InteractionList[[i]])) data.table::fwrite(InteractionList[[i]], file = file.path(metadata_path., paste0(ModelID., "_", i, "_Interaction.csv")))
        } else {
          if(!is.null(InteractionList[[i]])) data.table::fwrite(InteractionList[[i]], file = file.path(model_path., paste0(ModelID., "_", i, "_Interaction.csv")))
        }
      }
    }
  } else {
    ImportanceList <- NULL
    InteractionList <- NULL
    ShapList <- NULL
  }

  # Return
  return(list(
    Interaction = InteractionList,
    VariableImportance = ImportanceList,
    ShapValues = ShapList))
}

#' @title CatBoostPDF
#'
#' @description Send model insights to pdf
#'
#' @author Adrian
#' @family CatBoost Helpers
#'
#' @param ModelClass 'catboost', 'xgboost', 'h2o'
#' @param ModelType 'regression', 'classification', 'multiclass', or 'vector'
#' @param TrainOnFull. Passthrough
#' @param SaveInfoToPDF. Passthrough
#' @param PlotList. Passthrough
#' @param VariableImportance. Passthrough
#' @param EvalMetricsList. Passthrough
#' @param Interaction. Passthrough
#' @param model_path. Passthrough
#' @param metadata_path. Passthrough
#'
#' @noRd
CatBoostPDF <- function(ModelClass = "catboost",
                        ModelType = "regression",
                        TrainOnFull. = TrainOnFull,
                        SaveInfoToPDF. = SaveInfoToPDF,
                        EvalMetricsList. = EvalMetricsList,
                        PlotList. = PlotList,
                        VariableImportance. = VariableImportance,
                        Interaction. = Interaction,
                        model_path. = model_path,
                        metadata_path. = metadata_path) {

  # Nothing to write when training on the full data set or PDF output is disabled
  if(TrainOnFull. || !SaveInfoToPDF.) return(NULL)

  # Classification: keep only the threshold / confusion-matrix metric columns for the tables
  if(ModelType == "classification") {
    MetricCols <- c("Threshold", "TN", "TP", "FN", "FP", "N", "P", "Utility", "MCC", "Accuracy")
    for(nm in names(EvalMetricsList.)) {
      EvalMetricsList.[[nm]] <- EvalMetricsList.[[nm]][, .SD, .SDcols = MetricCols]
    }
  }

  # Build variable importance plots: a single data.table gets one plot, a list gets one per element
  VI_List <- list()
  if(data.table::is.data.table(VariableImportance.)) {
    VI_List[[1L]] <- VI_Plot(Type = ModelClass, VI_Data = VariableImportance.)
  } else {
    for(nm in names(VariableImportance.)) {
      VI_List[[nm]] <- VI_Plot(Type = ModelClass, VI_Data = VariableImportance.[[nm]])
    }
  }

  # Bundle plot objects and table objects; importance / interaction pieces drop out when absent
  EvalPlotList <- list(PlotList., if(!is.null(VariableImportance.)) VI_List else NULL)
  TableMetrics <- list(
    EvalMetricsList.,
    if(!is.null(VariableImportance.)) VariableImportance. else NULL,
    if(!is.null(Interaction.)) Interaction. else NULL)

  # Output directory preference: metadata path, then model path, then working directory
  OutputDir <- if(!is.null(metadata_path.)) metadata_path. else if(!is.null(model_path.)) model_path. else getwd()

  # Write the two PDFs; failures are non-fatal (best effort)
  try(PrintToPDF(
    Path = OutputDir,
    OutputName = "EvaluationPlots",
    ObjectList = EvalPlotList,
    Title = "Model Evaluation Plots",
    Width = 12, Height = 7, Paper = "USr", BackgroundColor = "transparent", ForegroundColor = "black"))
  try(PrintToPDF(
    Path = OutputDir,
    OutputName = "Metrics_and_Importances",
    ObjectList = TableMetrics,
    MaxPages = 100,
    Tables = TRUE,
    Title = "Model Metrics and Variable Importances",
    Width = 12, Height = 7, Paper = "USr", BackgroundColor = "transparent", ForegroundColor = "black"))

  # Close any graphics devices the pdf writer left open
  while(dev.cur() > 1) grDevices::dev.off()
}

#' @title CatBoostRemoveFiles
#'
#' @description Remove temp files generated by catboost
#'
#' @author Adrian
#' @family CatBoost Helpers
#'
#' @param GridTune. Passthrough
#' @param model_path. Passthrough
#'
#' @noRd
CatBoostRemoveFiles <- function(GridTune. = GridTune, model_path.=model_path) {
  # Clean up catboost's training artifacts under model_path.; a full sweep after
  # grid tuning, otherwise only the catboost_info directory.
  if(GridTune.) {
    TempFiles <- file.path(model_path., c("catboost_training.json", "learn_error.tsv", "test_error.tsv", "time_left.tsv"))
    for(f in TempFiles) if(file.exists(f)) file.remove(f)
    TempDirs <- file.path(model_path., c("catboost_info", "learn", "test", "tmp"))
    for(d in TempDirs) if(dir.exists(d)) unlink(x = d, recursive = TRUE)
  } else {
    InfoDir <- file.path(model_path., "catboost_info")
    if(dir.exists(InfoDir)) unlink(x = InfoDir, recursive = TRUE)
  }
}

#' @title CatBoostParameterGrids
#'
#' @description CatBoostParameterGrids https://catboost.ai/docs/concepts/r-training-parameters.html
#'
#' @author Adrian Antico
#' @family CatBoost Helpers
#'
#' @param TaskType "GPU" or "CPU"
#' @param Shuffles The number of shuffles you want to apply to each grid
#' @param NTrees seq(1000L, 10000L, 1000L)
#' @param Depth seq(4L, 16L, 2L)
#' @param LearningRate seq(0.01,.10,0.01)
#' @param L2_Leaf_Reg c(1.0:10.0)
#' @param RandomStrength seq(1, 2, 0.1)
#' @param BorderCount seq(32,256,32)
#' @param RSM CPU ONLY, Random subspace method.c(0.80, 0.85, 0.90, 0.95, 1.0)
#' @param BootStrapType c("Bayesian", "Bernoulli", "Poisson", "MVS", "No")
#' @param GrowPolicy c("SymmetricTree", "Depthwise", "Lossguide")
#'
#' @return A list containing data.table's with the parameters shuffled and ready to test in the bandit framework
#' @noRd
CatBoostParameterGrids <- function(TaskType = "CPU",
                                   Shuffles = 1L,
                                   NTrees = seq(1000L, 10000L, 1000L),
                                   Depth = seq(4L, 16L, 2L),
                                   LearningRate = c(0.01,0.02,0.03,0.04),
                                   L2_Leaf_Reg = seq(1.0, 10.0, 1.0),
                                   RandomStrength = seq(1, 2, 0.1),
                                   BorderCount = seq(32,256,32),
                                   RSM = c(0.80, 0.85, 0.90, 0.95, 1.0),
                                   BootStrapType = c("Bayesian", "Bernoulli", "Poisson", "MVS", "No"),
                                   GrowPolicy = c("SymmetricTree", "Depthwise", "Lossguide")) {

  # Candidate values per hyperparameter; NULL arguments fall back to the defaults.
  # NTrees / Depth / LearningRate are sorted ascending because the grid clusters
  # below are built as expanding windows over those three dimensions.
  GridList <- list()
  GridList[["NTrees"]] <- if(!is.null(NTrees)) sort(NTrees, decreasing = FALSE) else seq(1000L, 10000L, 1000L)
  GridList[["Depth"]] <- if(!is.null(Depth)) sort(Depth, decreasing = FALSE) else seq(4L, 16L, 2L)
  GridList[["LearningRate"]] <- if(!is.null(LearningRate)) sort(LearningRate, decreasing = FALSE) else seq(0.01, 0.10, 0.01)
  GridList[["L2_Leaf_Reg"]] <- if(!is.null(L2_Leaf_Reg)) L2_Leaf_Reg else seq(1.0, 10.0, 1.0)
  GridList[["RandomStrength"]] <- if(!is.null(RandomStrength)) RandomStrength else seq(1, 2, 0.1)
  GridList[["BorderCount"]] <- if(!is.null(BorderCount)) BorderCount else seq(32, 256, 32)
  GridList[["RSM"]] <- if(!is.null(RSM)) RSM else c(0.80, 0.85, 0.90, 0.95, 1.0)
  GridList[["BootStrapType"]] <- if(!is.null(BootStrapType)) BootStrapType else c("Bayesian", "Bernoulli", "Poisson", "MVS", "No")
  GridList[["GrowPolicy"]] <- if(!is.null(GrowPolicy)) GrowPolicy else c("SymmetricTree", "Depthwise", "Lossguide")

  # Full cross join of every candidate value
  Grid <- do.call(data.table::CJ, GridList)

  # Drop combinations catboost does not support on the chosen device:
  # GPU has no RSM; Poisson bootstrap and Lossguide rows are filtered per task type
  if(tolower(TaskType) == "gpu") {
    data.table::set(Grid, j = "RSM", value = NULL)
    Grid <- Grid[!BootStrapType %chin% c("MVS")]
    Grid <- unique(Grid[BootStrapType != "Poisson" & GrowPolicy != "Lossguide"])
  } else {
    Grid <- Grid[!tolower(BootStrapType) %chin% c("poisson")]
  }

  # Number of expanding windows (one grid cluster per bandit arm).
  # NOTE: the previously computed-but-unused N_L2_Leaf_Reg was removed.
  N_NTrees <- length(unique(Grid[["NTrees"]]))
  N_Depth <- length(unique(Grid[["Depth"]]))
  N_LearningRate <- length(unique(Grid[["LearningRate"]]))
  Runs <- max(N_NTrees, N_Depth, N_LearningRate)
  Grids <- list()

  # Grid_i = rows newly admitted when the NTrees / Depth / LearningRate caps advance
  # to their i-th values (set difference against the (i-1)-th window)
  for(i in seq_len(Runs)) {
    if(i == 1L) {
      Grids[[paste0("Grid_",i)]] <- Grid[NTrees <= unique(Grid[["NTrees"]])[min(i,N_NTrees)] & Depth <= unique(Grid[["Depth"]])[min(i,N_Depth)] & LearningRate <= unique(Grid[["LearningRate"]])[min(i,N_LearningRate)]]
    } else {
      Grids[[paste0("Grid_",i)]] <- data.table::fsetdiff(
        Grid[NTrees <= unique(Grid[["NTrees"]])[min(i,N_NTrees)] & Depth <= unique(Grid[["Depth"]])[min(i,N_Depth)] & LearningRate <= unique(Grid[["LearningRate"]])[min(i,N_LearningRate)]],
        Grid[NTrees <= unique(Grid[["NTrees"]])[min(i-1L,N_NTrees)] & Depth <= unique(Grid[["Depth"]])[min(i-1L,N_Depth)] & LearningRate <= unique(Grid[["LearningRate"]])[min(i-1L,N_LearningRate)]])
    }
  }

  # Experiment log: one row per grid-tuning run, filled in by the tuning loop.
  # Sentinel values (-1, "aa", random MetricValue) mark rows not yet run.
  eGrid <- data.table::data.table(
    GridNumber = rep(-1, 10000L),
    RunNumber = 1L:10000L,
    MetricValue = runif(10000L),
    RunTime = rep(-1, 10000L),
    TreesBuilt = rep(-1,10000L),
    NTrees = rep(-1,10000L),
    Depth = rep(-1,10000L),
    LearningRate = rep(-1,10000L),
    L2_Leaf_Reg = rep(-1,10000L),
    RandomStrength = rep(-1,10000L),
    BorderCount = rep(-1,10000L),
    RSM = rep(-1,10000L),
    BootStrapType = rep("aa", 10000L),
    GrowPolicy = rep("aa", 10000L))

  # One bandit-probability column per grid cluster
  data.table::set(eGrid, j = paste0("BanditProbs_", names(Grids)), value = -10)

  # Shuffle row order within each grid cluster so runs sample the cluster randomly
  for(shuffle in seq_len(Shuffles)) for(i in seq_len(Runs)) Grids[[paste0("Grid_", i)]] <- Grids[[paste0("Grid_",i)]][order(runif(Grids[[paste0("Grid_",i)]][,.N]))]

  # Return the full grid, the bandit grid clusters, and the experiment log
  return(list(Grid = Grid, Grids = Grids, ExperimentalGrid = eGrid))
}

#' @title CatBoostGridParams
#'
#' @author Adrian Antico
#' @family CatBoost Helpers
#'
#' @param N. Iteration for specific grid in grid clusters
#' @param counter. Passthrough
#' @param BanditArmsN. Passthrough
#' @param HasTime. Passthrough
#' @param MetricPeriods. Passthrough
#' @param ClassWeights. Passthrough
#' @param EvalMetric. Passthrough
#' @param LossFunction. Passthrough
#' @param task_type. Passthrough
#' @param NumGPUs. Passthrough
#' @param model_path. Passthrough
#' @param NewGrid. Passthrough
#' @param Grid. Passthrough
#' @param GridClusters. Passthrough
#'
#' @noRd
CatBoostGridParams <- function(N.=N,
                               counter. = NULL,
                               BanditArmsN. = NULL,
                               HasTime. = NULL,
                               MetricPeriods. = NULL,
                               ClassWeights. = NULL,
                               EvalMetric. = NULL,
                               LossFunction. = NULL,
                               task_type. = NULL,
                               NumGPUs. = NULL,
                               model_path. = NULL,
                               NewGrid. = NULL,
                               Grid. = NULL,
                               GridClusters. = NULL) {

  # Run phases:
  #   counter. == 1L                      -> baseline run: only iterations set (max NTrees)
  #   1L < counter. <= BanditArmsN. + 1L  -> first pass over the arms: row 1 of cluster Grid_(counter.-1)
  #   counter. >  BanditArmsN. + 1L       -> bandit-selected arm: row N. of cluster Grid_NewGrid.
  FirstPass <- counter. != 1L && counter. <= BanditArmsN. + 1L

  # Pull a single hyperparameter value for the current phase; NULL on the baseline run
  Pick <- function(Column) {
    if(FirstPass) {
      GridClusters.[[paste0("Grid_", counter. - 1L)]][[Column]][1L]
    } else if(counter. != 1L) {
      GridClusters.[[paste0("Grid_", NewGrid.)]][[Column]][N.]
    } else {
      NULL
    }
  }

  # Parameters independent of the grid run
  base_params <- list()
  base_params$has_time <- HasTime.
  base_params$metric_period <- MetricPeriods.
  base_params$loss_function <- LossFunction.
  base_params$eval_metric <- EvalMetric.
  base_params$use_best_model <- TRUE
  base_params$best_model_min_trees <- 10L
  base_params$task_type <- task_type.
  base_params$devices <- NumGPUs.
  base_params$thread_count <- parallel::detectCores()
  base_params$train_dir <- model_path.
  base_params$class_weights <- ClassWeights.

  # Run-dependent parameters; assigning NULL on the baseline run leaves them unset
  # so catboost falls back to its own defaults
  base_params$iterations <- if(counter. == 1L) max(Grid.$NTrees) else Pick("NTrees")
  base_params$depth <- Pick("Depth")
  base_params$learning_rate <- Pick("LearningRate")
  base_params$l2_leaf_reg <- Pick("L2_Leaf_Reg")
  base_params$random_strength <- Pick("RandomStrength")
  base_params$border_count <- Pick("BorderCount")
  base_params$bootstrap_type <- Pick("BootStrapType")

  # Device-dependent parameters: GPU supports grow_policy but not rsm; CPU the reverse.
  # BUGFIX: the first-pass branch previously indexed GrowPolicy/RSM with [N.] while
  # every other parameter in that branch used row 1 of the cluster; both now use row 1.
  if(counter. != 1L) {
    if(tolower(task_type.) == "gpu") {
      base_params$rsm <- NULL
      base_params$grow_policy <- Pick("GrowPolicy")
    } else {
      base_params$rsm <- Pick("RSM")
      base_params$grow_policy <- NULL
    }
  }

  # Return the assembled catboost parameter list
  return(base_params)
}
# AdrianAntico/RemixAutoML documentation built on Feb. 3, 2024, 3:32 a.m.