# AutoQuant is a package for quickly creating high quality visualizations under a common and easy api.
# Copyright (C) <year> <name of author>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#' @title Run_ModelInsightsReport
#'
#' @description Internal worker behind ModelInsightsReport(). It resolves the data sets and meta information (from ModelObject when supplied, otherwise from the arguments and from files under SourcePath), validates them, and renders the packaged Rmarkdown template that matches TargetType to an html file.
#'
#' @details
#' rmarkdown::render() is called without an explicit \code{envir}; its documented default is \code{parent.frame()}, so the template is evaluated in this function's frame. The argument names and the local variables \code{ArgsList}, \code{appDir} and \code{OutputPathName} are therefore kept under stable names. NOTE(review): confirm against the Rmd templates before renaming any of them.
#'
#' When ModelObject is NULL, the following files are read from SourcePath if they exist: \code{<ModelID>_ValidationData.csv} (into TestData, only when TestData is NULL), \code{<ModelID>_ValData.csv} (into ValidationData, only when ValidationData is NULL), \code{<ModelID>_TrainData.csv} (into TrainData, only when TrainData is NULL) and \code{<ModelID>_ArgsList.Rdata} (loaded to obtain ArgsList).
#'
#' @author Adrian Antico
#' @family Model Insights
#'
#' @param TargetColumnName Target variable column name. Replaced by ModelObject$ArgsList$TargetColumnName when ModelObject is supplied; required otherwise
#' @param TargetType 'regression', 'classification', or 'multiclass' (case-insensitive). Selects the Rmarkdown template
#' @param TargetLevels Character vector of the unique target levels. Not used by this function itself
#' @param PredictionColumnName Name of the prediction column. When not NULL it must be present in every available data set
#' @param FeatureColumnNames Feature column names. When NULL they are taken from ModelObject$ColNames[[1L]], or else from the column names of the first available data set (TestData, ValidationData, TrainData) minus the target and prediction columns
#' @param DateColumnName Date column name. When NULL it is filled from ArgsList$PrimaryDateColumn if available
#' @param ModelID ModelID used in the AutoQuant supervised learning function. Used as file name prefix under SourcePath and in the output file name
#' @param Algo 'catboost', 'xgboost', 'lightgbm', 'h2o'. Not used by this function itself
#' @param SourcePath Path to directory with AutoQuant Model Output. May be NULL, in which case nothing is read from disk
#' @param OutputPath Path to directory where the html will be saved
#' @param ModelObject Model Output passed from AutoQuant::Auto_*() functions. When supplied, TestData and TrainData are replaced by ModelObject$TestData and ModelObject$TrainData
#' @param TrainData data.table or something that converts to data.table via as.data.table
#' @param ValidationData data.table or something that converts to data.table via as.data.table
#' @param TestData data.table or something that converts to data.table via as.data.table
#' @param Test_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Validation_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Train_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Test_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Feature1', 'Feature2' and 'Importance'
#' @param Validation_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Feature1', 'Feature2' and 'Importance'
#' @param Train_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Feature1', 'Feature2' and 'Importance'
#'
#' @return Invisibly, the value returned by rmarkdown::render(). Stops with an error listing every failed validation check when the inputs are not sufficient to build the report.
#'
#' @noRd
Run_ModelInsightsReport <- function(# Meta info
                                    TargetColumnName = NULL,
                                    TargetType = 'regression',
                                    TargetLevels = NULL,
                                    PredictionColumnName = 'Predict',
                                    FeatureColumnNames = NULL,
                                    DateColumnName = NULL,
                                    # Control options
                                    ModelID = 'ModelTest',
                                    Algo = 'catboost',
                                    SourcePath = NULL,
                                    OutputPath = NULL,
                                    ModelObject = NULL,
                                    # If you want to pass in data
                                    TrainData = NULL,
                                    ValidationData = NULL,
                                    TestData = NULL,
                                    # If you want to pass in your
                                    # own variable importance
                                    Test_Importance_dt = NULL,
                                    Validation_Importance_dt = NULL,
                                    Train_Importance_dt = NULL,
                                    Test_Interaction_dt = NULL,
                                    Validation_Interaction_dt = NULL,
                                    Train_Interaction_dt = NULL) {

  # Directory that holds the packaged Rmarkdown templates
  appDir <- system.file("r-markdowns", package = "AutoQuant")

  # NOTE(review): this writes ModelObject into an enclosing / the global
  # environment (and overwrites an existing object of that name, even with
  # NULL). The template is rendered in this function's frame, so this is
  # probably unnecessary; kept to preserve existing side effects until the
  # Rmd templates have been checked.
  ModelObject <<- ModelObject

  # Helpers for files stored as <SourcePath>/<ModelID><suffix>.
  # has_source_file() is FALSE (never zero-length or NA) when SourcePath or
  # ModelID is NULL, so it is always safe inside if() and && / ||.
  source_file <- function(suffix) file.path(SourcePath, paste0(ModelID, suffix))
  has_source_file <- function(suffix) {
    !is.null(SourcePath) && !is.null(ModelID) && isTRUE(file.exists(source_file(suffix)))
  }

  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  # Resolve data sets and meta info
  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

  if(!is.null(ModelObject)) {

    # The model output is the source of truth for data and target
    TestData <- ModelObject[['TestData']]
    TrainData <- ModelObject[['TrainData']]
    TargetColumnName <- ModelObject[['ArgsList']][['TargetColumnName']]
    if(is.null(FeatureColumnNames)) {
      FeatureColumnNames <- ModelObject[['ColNames']][[1L]]
    }
    ArgsList <- ModelObject[['ArgsList']]

  } else {

    # Fall back to files on disk for any data set that was not passed in.
    # NOTE(review): TestData is read from '_ValidationData.csv' and
    # ValidationData from '_ValData.csv', while the FileData check further
    # down looks for '_TestData.csv' / '_ValidationData.csv' /
    # '_TrainData.csv'. File names are left unchanged; confirm which naming
    # the model functions actually write.
    if(is.null(TestData) && has_source_file('_ValidationData.csv')) {
      TestData <- data.table::fread(file = source_file('_ValidationData.csv'))
    }
    if(is.null(ValidationData) && has_source_file('_ValData.csv')) {
      ValidationData <- data.table::fread(file = source_file('_ValData.csv'))
    }
    if(is.null(TrainData) && has_source_file('_TrainData.csv')) {
      TrainData <- data.table::fread(file = source_file('_TrainData.csv'))
    }

    # Derive the features from the first available data set
    if(is.null(FeatureColumnNames) && !is.null(TestData)) {
      FeatureColumnNames <- names(TestData)[!names(TestData) %in% c(TargetColumnName, PredictionColumnName)]
    }
    if(is.null(FeatureColumnNames) && !is.null(ValidationData)) {
      FeatureColumnNames <- names(ValidationData)[!names(ValidationData) %in% c(TargetColumnName, PredictionColumnName)]
    }
    if(is.null(FeatureColumnNames) && !is.null(TrainData)) {
      FeatureColumnNames <- names(TrainData)[!names(TrainData) %in% c(TargetColumnName, PredictionColumnName)]
    }

    # A list (which includes data.frame / data.table) carries the names in
    # its first element
    if(is.list(FeatureColumnNames)) {
      FeatureColumnNames <- FeatureColumnNames[[1L]]
    }

    # load() places the stored objects into this frame; guard against a file
    # that does not contain an object called ArgsList
    if(has_source_file('_ArgsList.Rdata')) {
      load(source_file('_ArgsList.Rdata'))
      if(!exists('ArgsList', inherits = FALSE)) ArgsList <- NULL
    } else {
      ArgsList <- NULL
    }
  }

  # Keep a user supplied DateColumnName; only fill the gap from the stored
  # arguments (NULL stays NULL when ArgsList has no PrimaryDateColumn)
  if(is.null(DateColumnName)) {
    DateColumnName <- ArgsList[['PrimaryDateColumn']]
  }

  # Convert the data sets that will be used by the report, in this frame
  if(!is.null(TrainData) && !data.table::is.data.table(TrainData)) {
    TrainData <- data.table::as.data.table(TrainData)
  }
  if(!is.null(ValidationData) && !data.table::is.data.table(ValidationData)) {
    ValidationData <- data.table::as.data.table(ValidationData)
  }
  if(!is.null(TestData) && !data.table::is.data.table(TestData)) {
    TestData <- data.table::as.data.table(TestData)
  }

  # Output file
  OutputPathName <- file.path(OutputPath, paste0('MLReport-', ModelID, '-', TargetType, '.html'))

  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  # Validity checks before running the Rmarkdown file
  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

  msg <- character(0)

  # Supported report types and their templates
  report_templates <- c(
    regression = 'Regression_ModelInsights.Rmd',
    classification = 'Classification_ModelInsights.Rmd',
    multiclass = 'MultiClass_ModelInsights.Rmd')
  report_type <- tolower(TargetType)
  if(length(report_type) != 1L || !report_type %in% names(report_templates)) {
    msg <- c(msg, "TargetType must be one of 'regression', 'classification', or 'multiclass'")
  }

  # Is there any data to report on?
  DataSupplied <- !(is.null(TestData) && is.null(ValidationData) && is.null(TrainData))
  FileData <- has_source_file('_TestData.csv') ||
    has_source_file('_ValidationData.csv') ||
    has_source_file('_TrainData.csv')
  DataExists <- DataSupplied || FileData

  # Without a ModelObject the data has to come from arguments or files
  if(is.null(ModelObject) && !DataExists) {
    msg <- c(msg, 'Must supply either ModelObject or to have in file or provide at least one of TrainData, ValidationData, or TestData')
  }

  # Without a ModelObject there is no way to know what the target variable is
  if(is.null(TargetColumnName) && is.null(ModelObject)) {
    msg <- c(msg, 'You have to supply the TargetColumnName')
  }

  # The prediction column has to be present in every available data set
  if(!is.null(PredictionColumnName)) {
    if(!is.null(TestData) && !all(PredictionColumnName %in% names(TestData))) {
      msg <- c(msg, 'PredictionColumnName value is not in TestData')
    }
    if(!is.null(ValidationData) && !all(PredictionColumnName %in% names(ValidationData))) {
      msg <- c(msg, 'PredictionColumnName value is not in ValidationData')
    }
    if(!is.null(TrainData) && !all(PredictionColumnName %in% names(TrainData))) {
      msg <- c(msg, 'PredictionColumnName value is not in TrainData')
    }
  }

  # Report every problem in the error itself
  if(length(msg) > 0L) {
    stop(paste(c('ModelInsightsReport cannot run:', paste0('  - ', msg)), collapse = '\n'), call. = FALSE)
  }

  # Evaluate the pass-through arguments now so that a bad argument fails
  # before rendering starts
  force(TargetLevels)
  force(Algo)
  force(Test_Importance_dt)
  force(Validation_Importance_dt)
  force(Train_Importance_dt)
  force(Test_Interaction_dt)
  force(Validation_Interaction_dt)
  force(Train_Interaction_dt)

  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  # Select Rmarkdown Report and Run it ----
  # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

  rendered <- rmarkdown::render(
    input = file.path(appDir, report_templates[[report_type]]),
    output_file = OutputPathName)
  invisible(rendered)
}
#' @title ModelInsightsReport
#'
#' @description ModelInsightsReport is an Rmarkdown report for viewing the model insights generated by AutoQuant supervised learning functions. This function forwards its arguments to the internal Run_ModelInsightsReport(), which validates the inputs and renders the html report.
#'
#' @author Adrian Antico
#' @family Model Insights
#'
#' @param KeepOutput NULL. Retained for backward compatibility; it currently has no effect. It was meant to name objects to keep after the report finishes, e.g. c('Test_VariableImportance', 'Train_VariableImportance')
#' @param TrainData data.table or something that converts to data.table via as.data.table
#' @param ValidationData data.table or something that converts to data.table via as.data.table
#' @param TestData data.table or something that converts to data.table via as.data.table
#' @param TargetColumnName NULL. Target variable column name as character
#' @param PredictionColumnName 'Predict'. Predicted value column name as character. 'p1' for AutoQuant functions
#' @param FeatureColumnNames NULL. Feature column names as character vector.
#' @param DateColumnName NULL. Date column name as character
#' @param TargetType 'regression', 'classification', or 'multiclass'
#' @param ModelID ModelID used in the AutoQuant supervised learning function
#' @param Algo 'catboost' or 'other'. Use 'catboost' if using AutoQuant::AutoCatBoost_() functions. Otherwise, 'other'
#' @param SourcePath Path to directory with AutoQuant Model Output
#' @param OutputPath Path to directory where the html will be saved
#' @param ModelObject Returned output from regression, classification, and multiclass Remix Auto_() models. Currently supports CatBoost, XGBoost, and LightGBM models
#' @param Test_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Validation_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Train_Importance_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a two column data.table with colnames 'Variable' and 'Importance'
#' @param Test_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Features1', 'Features2' and 'score'
#' @param Validation_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Features1', 'Features2' and 'score'
#' @param Train_Interaction_dt NULL.. Ignore if using AutoQuant Models. Otherwise, supply a three column data.table with colnames 'Features1', 'Features2' and 'score'
#' @param GlobalVars ls() don't use. Retained for backward compatibility; it currently has no effect
#'
#' @return NULL, invisibly. Called for its side effect of writing the html report.
#'
#' @examples
#' \dontrun{
#'
#' #####################################################
#' # CatBoost
#' #####################################################
#'
#' # Create some dummy correlated data
#' data <- AutoQuant::FakeDataGenerator(
#'   Correlation = 0.85,
#'   N = 10000,
#'   ID = 2,
#'   ZIP = 0,
#'   AddDate = FALSE,
#'   Classification = FALSE,
#'   MultiClass = FALSE)
#'
#' # Copy data
#' data1 <- data.table::copy(data)
#'
#' # Run function
#' ModelObject <- AutoQuant::AutoCatBoostRegression(
#'
#'   # GPU or CPU and the number of available GPUs
#'   TrainOnFull = FALSE,
#'   task_type = 'GPU',
#'   NumGPUs = 1,
#'   DebugMode = FALSE,
#'
#'   # Metadata args
#'   OutputSelection = c('Importances','EvalPlots','EvalMetrics','Score_TrainData'),
#'   ModelID = 'Test_Model_1',
#'   model_path = getwd(),
#'   metadata_path = getwd(),
#'   SaveModelObjects = FALSE,
#'   SaveInfoToPDF = FALSE,
#'   ReturnModelObjects = TRUE,
#'
#'   # Data args
#'   data = data1,
#'   ValidationData = NULL,
#'   TestData = NULL,
#'   TargetColumnName = 'Adrian',
#'   FeatureColNames = names(data1)[!names(data1) %in% c('IDcol_1','IDcol_2','Adrian')],
#'   PrimaryDateColumn = NULL,
#'   WeightsColumnName = NULL,
#'   IDcols = c('IDcol_1','IDcol_2'),
#'   TransformNumericColumns = 'Adrian',
#'   Methods = c('Asinh','Asin','Log','LogPlus1','Sqrt','Logit'),
#'
#'   # Model evaluation
#'   eval_metric = 'RMSE',
#'   eval_metric_value = 1.5,
#'   loss_function = 'RMSE',
#'   loss_function_value = 1.5,
#'   MetricPeriods = 10L,
#'   NumOfParDepPlots = ncol(data1)-1L-2L,
#'
#'   # Grid tuning args
#'   PassInGrid = NULL,
#'   GridTune = FALSE,
#'   MaxModelsInGrid = 30L,
#'   MaxRunsWithoutNewWinner = 20L,
#'   MaxRunMinutes = 60*60,
#'   BaselineComparison = 'default',
#'
#'   # ML args
#'   langevin = FALSE,
#'   diffusion_temperature = 10000,
#'   Trees = 500,
#'   Depth = 9,
#'   L2_Leaf_Reg = NULL,
#'   RandomStrength = 1,
#'   BorderCount = 128,
#'   LearningRate = NULL,
#'   RSM = 1,
#'   BootStrapType = NULL,
#'   GrowPolicy = 'SymmetricTree',
#'   model_size_reg = 0.5,
#'   feature_border_type = 'GreedyLogSum',
#'   sampling_unit = 'Object',
#'   subsample = NULL,
#'   score_function = 'Cosine',
#'   min_data_in_leaf = 1)
#'
#' # Create Model Insights Report
#' AutoQuant::ModelInsightsReport(
#'
#'   # Retained for backward compatibility (no effect)
#'   KeepOutput = 'Test_VariableImportance',
#'
#'   # DataSets
#'   TrainData = NULL,
#'   ValidationData = NULL,
#'   TestData = NULL,
#'
#'   # Meta info
#'   TargetColumnName = NULL,
#'   PredictionColumnName = NULL,
#'   FeatureColumnNames = NULL,
#'   DateColumnName = NULL,
#'
#'   # Variable Importance
#'   Test_Importance_dt = NULL,
#'   Validation_Importance_dt = NULL,
#'   Train_Importance_dt = NULL,
#'   Test_Interaction_dt = NULL,
#'   Validation_Interaction_dt = NULL,
#'   Train_Interaction_dt = NULL,
#'
#'   # Control options
#'   TargetType = 'regression',
#'   ModelID = 'ModelTest',
#'   Algo = 'catboost',
#'   SourcePath = getwd(),
#'   OutputPath = getwd(),
#'   ModelObject = ModelObject)
#' }
#'
#' @export
ModelInsightsReport <- function(KeepOutput = NULL,
                                # Datasets
                                TrainData = NULL,
                                ValidationData = NULL,
                                TestData = NULL,
                                # Meta info
                                TargetColumnName = NULL,
                                PredictionColumnName = 'Predict',
                                FeatureColumnNames = NULL,
                                DateColumnName = NULL,
                                # Control options
                                TargetType = 'regression',
                                ModelID = 'ModelTest',
                                Algo = 'catboost',
                                SourcePath = NULL,
                                OutputPath = NULL,
                                ModelObject = NULL,
                                # Variable Importance
                                Test_Importance_dt = NULL,
                                Validation_Importance_dt = NULL,
                                Train_Importance_dt = NULL,
                                Test_Interaction_dt = NULL,
                                Validation_Interaction_dt = NULL,
                                Train_Interaction_dt = NULL,
                                # Dont use
                                GlobalVars = ls()) {

  # Validate the inputs and render the report
  Run_ModelInsightsReport(

    # Data Sets
    TrainData = TrainData,
    ValidationData = ValidationData,
    TestData = TestData,

    # Data Meta
    TargetColumnName = TargetColumnName,
    PredictionColumnName = PredictionColumnName,
    FeatureColumnNames = FeatureColumnNames,
    DateColumnName = DateColumnName,

    # Control Options
    TargetType = TargetType,
    ModelID = ModelID,
    Algo = Algo,
    SourcePath = SourcePath,
    OutputPath = OutputPath,
    ModelObject = ModelObject,

    # Variable Importance data.table's
    Test_Importance_dt = Test_Importance_dt,
    Validation_Importance_dt = Validation_Importance_dt,
    Train_Importance_dt = Train_Importance_dt,
    Test_Interaction_dt = Test_Interaction_dt,
    Validation_Interaction_dt = Validation_Interaction_dt,
    Train_Interaction_dt = Train_Interaction_dt)

  # The former clean-up step (rm() on the difference between two ls() calls)
  # was removed: the GlobalVars default `ls()` is evaluated lazily in this
  # function's own frame, so the difference was always empty, and rm() acted
  # on this frame only. KeepOutput and GlobalVars stay in the signature so
  # existing calls keep working. The return value is unchanged.
  invisible(NULL)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.