# ModelingTools: AutoQuant
#
# Documented in ModelInsightsReport

# AutoQuant is a package for quickly creating high quality visualizations under a common and easy api.
# Copyright (C) <year>  <name of author>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

#' @title Run_ModelInsightsReport
#'
#' @description Internal worker behind ModelInsightsReport. It resolves the data sets and metadata (from ModelObject when supplied, otherwise from the arguments and from files found in SourcePath), validates them, and renders the Rmarkdown template matching TargetType to an html file in OutputPath. rmarkdown::render() is called with its default envir, i.e. this function's frame, so the local variables defined here are what the template sees.
#'
#' @author Adrian Antico
#' @family Model Insights
#'
#' @param TargetColumnName Target variable column name. Overwritten by ModelObject[['ArgsList']][['TargetColumnName']] when ModelObject is supplied. Required when ModelObject is NULL
#' @param TargetType 'regression', 'classification', or 'multiclass' (case insensitive). Selects the Rmarkdown template
#' @param TargetLevels Character vector of the unique target levels
#' @param PredictionColumnName Name of the prediction column. When not NULL it has to exist in every data set that is available
#' @param FeatureColumnNames Feature column names. When NULL they are taken from ModelObject[['ColNames']] or derived from the first available data set (all columns except target and prediction)
#' @param DateColumnName Date column name. When NULL the PrimaryDateColumn stored in the model's ArgsList is used, if there is one
#' @param ModelID ModelID used in the AutoQuant supervised learning function. Used to build the input file names and the output file name
#' @param Algo 'catboost', 'xgboost', 'lightgbm', 'h2o'
#' @param SourcePath Path to directory with AutoQuant Model Output. Only read when ModelObject is NULL
#' @param OutputPath Path to directory where the html will be saved
#' @param ModelObject Model Output passed from AutoQuant::Auto_*() functions
#' @param TrainData data.table or something that converts to data.table via as.data.table
#' @param ValidationData data.table or something that converts to data.table via as.data.table
#' @param TestData data.table or something that converts to data.table via as.data.table
#' @param Test_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Validation_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Train_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Test_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Feature1', 'Feature2' and 'Importance'
#' @param Validation_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Feature1', 'Feature2'  and 'Importance'
#' @param Train_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Feature1', 'Feature2'  and 'Importance'
#'
#' @return Invisibly, the value returned by rmarkdown::render(). Stops with an error listing every failed input check before anything is rendered.
#'
#' @noRd
Run_ModelInsightsReport <- function(# Meta info
                                    TargetColumnName = NULL,
                                    TargetType = 'regression',
                                    TargetLevels = NULL,
                                    PredictionColumnName = 'Predict',
                                    FeatureColumnNames = NULL,
                                    DateColumnName = NULL,

                                    # Control options
                                    ModelID = 'ModelTest',
                                    Algo = 'catboost',
                                    SourcePath = NULL,
                                    OutputPath = NULL,
                                    ModelObject = NULL,

                                    # If you want to pass in data
                                    TrainData = NULL,
                                    ValidationData = NULL,
                                    TestData = NULL,

                                    # If you want to pass in your
                                    #  own variable importance
                                    Test_Importance_dt = NULL,
                                    Validation_Importance_dt = NULL,
                                    Train_Importance_dt = NULL,
                                    Test_Interaction_dt = NULL,
                                    Validation_Interaction_dt = NULL,
                                    Train_Interaction_dt = NULL) {

  # Directory holding the Rmarkdown templates shipped with the package
  appDir <- system.file("r-markdowns", package = "AutoQuant")

  # data.table conversion
  # The converted copies are stored in this function's frame (not in the
  #   global environment) because this frame is where the report is rendered
  if(!is.null(TrainData) && !data.table::is.data.table(TrainData)) {
    TrainData <- data.table::as.data.table(TrainData)
  }
  if(!is.null(ValidationData) && !data.table::is.data.table(ValidationData)) {
    ValidationData <- data.table::as.data.table(ValidationData)
  }
  if(!is.null(TestData) && !data.table::is.data.table(TestData)) {
    TestData <- data.table::as.data.table(TestData)
  }

  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  # Globalize the parameters
  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

  # ModelObject
  # Kept from the original implementation: `<<-` writes ModelObject into the
  #   first enclosing environment that already has such a binding, or into
  #   the global environment when none has.
  # NOTE(review): confirm whether the Rmd templates still depend on this
  #   before removing it
  ModelObject <<- ModelObject

  # Finalize Passthrough
  if(!is.null(ModelObject)) {

    # DataSets: the ones stored in ModelObject replace whatever was passed in
    TestData <- ModelObject[['TestData']]
    TrainData <- ModelObject[['TrainData']]

    # Meta info
    ArgsList <- ModelObject[['ArgsList']]
    TargetColumnName <- ArgsList[['TargetColumnName']]
    if(is.null(FeatureColumnNames)) {
      FeatureColumnNames <- ModelObject[['ColNames']][[1L]]
    }

    # Only fall back to the model's date column when none was supplied;
    #   a user supplied DateColumnName is kept as is
    if(is.null(DateColumnName)) {
      DateColumnName <- ArgsList[['PrimaryDateColumn']]
    }

  } else {

    # DataSets: read from SourcePath whatever was not passed in
    # NOTE(review): TestData is read from '<ModelID>_ValidationData.csv' and
    #   ValidationData from '<ModelID>_ValData.csv', while the existence check
    #   further down also looks for '<ModelID>_TestData.csv'. File names are
    #   left untouched; confirm against what the model functions write
    if(is.null(TestData)) {
      SourceFile <- .ModelInsightsSourceFile(SourcePath, ModelID, "_ValidationData.csv")
      if(!is.null(SourceFile)) TestData <- data.table::fread(file = SourceFile)
    }
    # Validate
    if(is.null(ValidationData)) {
      SourceFile <- .ModelInsightsSourceFile(SourcePath, ModelID, "_ValData.csv")
      if(!is.null(SourceFile)) ValidationData <- data.table::fread(file = SourceFile)
    }
    # Train
    if(is.null(TrainData)) {
      SourceFile <- .ModelInsightsSourceFile(SourcePath, ModelID, "_TrainData.csv")
      if(!is.null(SourceFile)) TrainData <- data.table::fread(file = SourceFile)
    }

    # Meta info: features default to every column of the first available
    #   data set that is neither the target nor the prediction
    if(is.null(FeatureColumnNames) && !is.null(TestData)) {
      FeatureColumnNames <- names(TestData)[!names(TestData) %in% c(TargetColumnName, PredictionColumnName)]
    }
    if(is.null(FeatureColumnNames) && !is.null(ValidationData)) {
      FeatureColumnNames <- names(ValidationData)[!names(ValidationData) %in% c(TargetColumnName, PredictionColumnName)]
    }
    if(is.null(FeatureColumnNames) && !is.null(TrainData)) {
      FeatureColumnNames <- names(TrainData)[!names(TrainData) %in% c(TargetColumnName, PredictionColumnName)]
    }

    # A list (a data.table is a list as well) is reduced to its first element
    if(is.list(FeatureColumnNames)) {
      FeatureColumnNames <- FeatureColumnNames[[1L]]
    }

    # ArgsList saved next to the model output, if any
    SourceFile <- .ModelInsightsSourceFile(SourcePath, ModelID, '_ArgsList.Rdata')
    if(!is.null(SourceFile)) {
      # load() restores the saved objects into this frame
      load(SourceFile)
      ArgsList <- ArgsList
    } else {
      ArgsList <- NULL
    }

    # Keep a user supplied DateColumnName; otherwise use the one stored in
    #   the loaded ArgsList (mirrors the ModelObject branch above)
    if(is.null(DateColumnName) && !is.null(ArgsList)) {
      DateColumnName <- ArgsList[['PrimaryDateColumn']]
    }
  }

  # Metadata args
  # Evaluate the remaining arguments now so they are plain values in this
  #   frame by the time the template is rendered in it
  force(TargetLevels)
  force(Algo)
  force(Test_Importance_dt)
  force(Validation_Importance_dt)
  force(Train_Importance_dt)
  force(Test_Interaction_dt)
  force(Validation_Interaction_dt)
  force(Train_Interaction_dt)
  OutputPathName <- file.path(OutputPath, paste0('MLReport-', ModelID, '-', TargetType, '.html'))

  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  # Validity Check to Run the Rmarkdown file
  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  msg <- c()

  # Logic check
  DataSupplied <- !(is.null(TestData) && is.null(ValidationData) && is.null(TrainData))

  # FALSE (not NULL) when there is nothing to look up, so `||` below is safe
  FileData <- FALSE
  if(!is.null(SourcePath) && !is.null(ModelID)) {
    FileData <- any(file.exists(file.path(
      SourcePath,
      paste0(ModelID, c('_TestData.csv', '_ValidationData.csv', '_TrainData.csv')))))
  }
  DataExists <- DataSupplied || FileData

  # Without a ModelObject there has to be data from somewhere
  if(is.null(ModelObject) && !DataExists) {
    msg <- c(msg, 'Must supply either ModelObject or to have in file or provide at least one of TrainData, ValidationData, or TestData')
  }

  # If not ModelObject and no TargetColumnName then no way to know what the target variable is
  if(is.null(TargetColumnName) && is.null(ModelObject)) {
    msg <- c(msg, 'You have to supply the TargetColumnName')
  }

  # Same for the prediction column once there is data to report on
  if(is.null(PredictionColumnName) && is.null(ModelObject) && DataSupplied) {
    msg <- c(msg, 'You have to supply the PredictionColumnName')
  }

  # Check if PredictionColumnName is in all existing data sets
  if(!is.null(PredictionColumnName)) {
    if(!is.null(TestData) && !all(PredictionColumnName %in% names(TestData))) {
      msg <- c(msg, 'PredictionColumnName value is not in TestData')
    }
    if(!is.null(ValidationData) && !all(PredictionColumnName %in% names(ValidationData))) {
      msg <- c(msg, 'PredictionColumnName value is not in ValidationData')
    }
    if(!is.null(TrainData) && !all(PredictionColumnName %in% names(TrainData))) {
      msg <- c(msg, 'PredictionColumnName value is not in TrainData')
    }
  }

  # An unknown TargetType would otherwise render nothing without any notice
  if(length(TargetType) != 1L || !tolower(TargetType) %in% c('regression', 'classification', 'multiclass')) {
    msg <- c(msg, "TargetType must be one of 'regression', 'classification', or 'multiclass'")
  }

  # Stop with every collected message in the error itself
  if(!is.null(msg)) {
    stop(paste(c('Invalid inputs for the Model Insights Report:', msg), collapse = '\n  '), call. = FALSE)
  }

  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  # Select Rmarkdown Report and Run it ----
  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

  RmdFile <- switch(
    tolower(TargetType),
    'regression' = 'Regression_ModelInsights.Rmd',
    'classification' = 'Classification_ModelInsights.Rmd',
    'multiclass' = 'MultiClass_ModelInsights.Rmd')

  # Must be called directly from this function: render() evaluates the
  #   template in its caller's frame by default, which is how the template
  #   gets access to the variables prepared above
  rmarkdown::render(
    input = file.path(appDir, RmdFile),
    output_file = file.path(OutputPathName))
}

#' @title .ModelInsightsSourceFile
#'
#' @description Builds the path '<SourcePath>/<ModelID><Suffix>' and returns it when that file exists. Returns NULL when SourcePath is NULL or the file is not there, so callers never hand a zero-length condition to `if`.
#'
#' @param SourcePath Directory to look in, or NULL
#' @param ModelID Prefix of the file name
#' @param Suffix Remainder of the file name, e.g. '_TrainData.csv'
#'
#' @return The file path as a character scalar, or NULL
#'
#' @noRd
.ModelInsightsSourceFile <- function(SourcePath, ModelID, Suffix) {
  if(is.null(SourcePath)) return(NULL)
  FilePath <- file.path(SourcePath, paste0(ModelID, Suffix))
  if(length(FilePath) == 1L && file.exists(FilePath)) FilePath else NULL
}

#' @title ModelInsightsReport
#'
#' @description ModelInsightsReport is an Rmarkdown report for viewing the model insights generated by AutoQuant supervised learning functions
#'
#' @author Adrian Antico
#' @family Model Insights
#'
#' @param TrainData data.table or something that converts to data.table via as.data.table
#' @param ValidationData data.table or something that converts to data.table via as.data.table
#' @param TestData data.table or something that converts to data.table via as.data.table
#' @param TargetColumnName NULL. Target variable column name as character
#' @param PredictionColumnName 'Predict' by default. Predicted value column name as character. 'p1' for AutoQuant functions
#' @param FeatureColumnNames NULL. Feature column names as character vector.
#' @param DateColumnName NULL. Date column name as character
#' @param TargetType 'regression', 'classification', or 'multiclass'
#' @param ModelID ModelID used in the AutoQuant supervised learning function
#' @param Algo 'catboost' or 'other'. Use 'catboost' if using AutoQuant::AutoCatBoost_() functions. Otherwise, 'other'
#' @param Path Path to Model Output if ModelObject is left NULL
#' @param SourcePath Path to directory with AutoQuant Model Output
#' @param OutputPath Path to directory where the html will be saved
#' @param ModelObject Returned output from regression, classification, and multiclass Remix Auto_() models. Currently supports CatBoost, XGBoost, and LightGBM models
#' @param GlobalVars Defaults to ls(). Internal argument; do not supply it
#' @param KeepOutput NULL A list of output names to select. Pass in as a character vector. E.g. c('Test_VariableImportance', 'Train_VariableImportance')
#' @param Test_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Validation_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Train_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Test_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Features1', 'Features2'  and 'score'
#' @param Validation_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Features1', 'Features2'  and 'score'
#' @param Train_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Features1', 'Features2'  and 'score'
#'
#' @examples
#' \dontrun{
#'
#' #####################################################
#' # CatBoost
#' #####################################################
#'
#' # Create some dummy correlated data
#' data <- AutoQuant::FakeDataGenerator(
#'   Correlation = 0.85,
#'   N = 10000,
#'   ID = 2,
#'   ZIP = 0,
#'   AddDate = FALSE,
#'   Classification = FALSE,
#'   MultiClass = FALSE)
#'
#' # Copy data
#' data1 <- data.table::copy(data)
#'
#' # Run function
#' ModelObject <- AutoQuant::AutoCatBoostRegression(
#'
#'   # GPU or CPU and the number of available GPUs
#'   TrainOnFull = FALSE,
#'   task_type = 'GPU',
#'   NumGPUs = 1,
#'   DebugMode = FALSE,
#'
#'   # Metadata args
#'   OutputSelection = c('Importances','EvalPlots','EvalMetrics','Score_TrainData'),
#'   ModelID = 'Test_Model_1',
#'   model_path = getwd(),
#'   metadata_path = getwd(),
#'   SaveModelObjects = FALSE,
#'   SaveInfoToPDF = FALSE,
#'   ReturnModelObjects = TRUE,
#'
#'   # Data args
#'   data = data1,
#'   ValidationData = NULL,
#'   TestData = NULL,
#'   TargetColumnName = 'Adrian',
#'   FeatureColNames = names(data1)[!names(data1) %in% c('IDcol_1','IDcol_2','Adrian')],
#'   PrimaryDateColumn = NULL,
#'   WeightsColumnName = NULL,
#'   IDcols = c('IDcol_1','IDcol_2'),
#'   TransformNumericColumns = 'Adrian',
#'   Methods = c('Asinh','Asin','Log','LogPlus1','Sqrt','Logit'),
#'
#'   # Model evaluation
#'   eval_metric = 'RMSE',
#'   eval_metric_value = 1.5,
#'   loss_function = 'RMSE',
#'   loss_function_value = 1.5,
#'   MetricPeriods = 10L,
#'   NumOfParDepPlots = ncol(data1)-1L-2L,
#'
#'   # Grid tuning args
#'   PassInGrid = NULL,
#'   GridTune = FALSE,
#'   MaxModelsInGrid = 30L,
#'   MaxRunsWithoutNewWinner = 20L,
#'   MaxRunMinutes = 60*60,
#'   BaselineComparison = 'default',
#'
#'   # ML args
#'   langevin = FALSE,
#'   diffusion_temperature = 10000,
#'   Trees = 500,
#'   Depth = 9,
#'   L2_Leaf_Reg = NULL,
#'   RandomStrength = 1,
#'   BorderCount = 128,
#'   LearningRate = NULL,
#'   RSM = 1,
#'   BootStrapType = NULL,
#'   GrowPolicy = 'SymmetricTree',
#'   model_size_reg = 0.5,
#'   feature_border_type = 'GreedyLogSum',
#'   sampling_unit = 'Object',
#'   subsample = NULL,
#'   score_function = 'Cosine',
#'   min_data_in_leaf = 1)
#'
#' # Create Model Insights Report
#' AutoQuant::ModelInsightsReport(
#'
#'   # Items to keep in global environment when
#'   #   function finishes execution
#'   KeepOutput = 'Test_VariableImportance',
#'
#'   # DataSets
#'   TrainData = NULL,
#'   ValidationData = NULL,
#'   TestData = NULL,
#'
#'   # Meta info
#'   TargetColumnName = NULL,
#'   PredictionColumnName = NULL,
#'   FeatureColumnNames = NULL,
#'   DateColumnName = NULL,
#'
#'   # Variable Importance
#'   Test_Importance_dt = NULL,
#'   Validation_Importance_dt = NULL,
#'   Train_Importance_dt = NULL,
#'   Test_Interaction_dt = NULL,
#'   Validation_Interaction_dt = NULL,
#'   Train_Interaction_dt = NULL,
#'
#'   # Control options
#'   TargetType = 'regression',
#'   ModelID = 'ModelTest',
#'   Algo = 'catboost',
#'   SourcePath = getwd(),
#'   OutputPath = getwd(),
#'   ModelObject = ModelObject)
#' }
#'
#' @export
ModelInsightsReport <- function(KeepOutput = NULL,

                                # Datasets
                                TrainData = NULL,
                                ValidationData = NULL,
                                TestData = NULL,

                                # Meta info
                                TargetColumnName = NULL,
                                PredictionColumnName = 'Predict',
                                FeatureColumnNames = NULL,
                                DateColumnName = NULL,

                                # Control options
                                TargetType = 'regression',
                                ModelID = 'ModelTest',
                                Algo = 'catboost',
                                SourcePath = NULL,
                                OutputPath = NULL,
                                ModelObject = NULL,

                                # Variable Importance
                                Test_Importance_dt = NULL,
                                Validation_Importance_dt = NULL,
                                Train_Importance_dt = NULL,
                                Test_Interaction_dt = NULL,
                                Validation_Interaction_dt = NULL,
                                Train_Interaction_dt = NULL,

                                # Dont use
                                GlobalVars = ls()) {

  # Exported entry point: every report argument is handed unchanged to
  #   Run_ModelInsightsReport(), which resolves the inputs and renders the
  #   html report. Arguments are passed by name, so their order here does
  #   not have to follow the worker's signature.
  Run_ModelInsightsReport(

    # Data Meta
    TargetColumnName = TargetColumnName,
    TargetType = TargetType,
    PredictionColumnName = PredictionColumnName,
    FeatureColumnNames = FeatureColumnNames,
    DateColumnName = DateColumnName,

    # Control Options
    ModelID = ModelID,
    Algo = Algo,
    SourcePath = SourcePath,
    OutputPath = OutputPath,
    ModelObject = ModelObject,

    # Data Sets
    TrainData = TrainData,
    ValidationData = ValidationData,
    TestData = TestData,

    # Variable Importance data.table's
    Test_Importance_dt = Test_Importance_dt,
    Validation_Importance_dt = Validation_Importance_dt,
    Train_Importance_dt = Train_Importance_dt,
    Test_Interaction_dt = Test_Interaction_dt,
    Validation_Interaction_dt = Validation_Interaction_dt,
    Train_Interaction_dt = Train_Interaction_dt)

  # KeepOutput and GlobalVars stay in the signature for backward
  #   compatibility but are no longer read. They used to feed a trailing
  #   rm(list = setdiff(ls(), c(GlobalVars, KeepOutput))). Both ls() and rm()
  #   acted on this function's own frame, and the default GlobalVars = ls()
  #   was evaluated lazily in that same frame, so the call never removed
  #   anything a caller could observe. The return value is the same as
  #   before: NULL, invisibly.
  invisible(NULL)
}