View source: R/AutoXGBoostCARMA.R
AutoXGBoostCARMA | R Documentation |
AutoXGBoostCARMA Multivariate Forecasting with calendar variables, holiday counts, holiday lags, holiday moving averages, differencing, transformations, interaction-based categorical encoding using target variable and features to generate various time-based aggregated lags, moving averages, moving standard deviations, moving skewness, moving kurtosis, moving quantiles, parallelized interaction-based Fourier pairs by grouping variables, and trend variables.
AutoXGBoostCARMA(
data = NULL,
XREGS = NULL,
TimeWeights = NULL,
NonNegativePred = FALSE,
RoundPreds = FALSE,
TrainOnFull = FALSE,
TargetColumnName = NULL,
DateColumnName = NULL,
HierarchGroups = NULL,
GroupVariables = NULL,
FC_Periods = 1,
SaveDataPath = NULL,
TimeUnit = NULL,
TimeGroups = NULL,
TargetTransformation = FALSE,
Methods = c("Asinh", "Log", "LogPlus1", "Sqrt"),
EncodingMethod = "target_encoding",
AnomalyDetection = NULL,
Lags = NULL,
MA_Periods = NULL,
SD_Periods = NULL,
Skew_Periods = NULL,
Kurt_Periods = NULL,
Quantile_Periods = NULL,
Quantiles_Selected = c("q5", "q95"),
Difference = FALSE,
FourierTerms = 0,
CalendarVariables = NULL,
HolidayVariable = NULL,
HolidayLookback = NULL,
HolidayLags = NULL,
HolidayMovingAverages = NULL,
TimeTrendVariable = FALSE,
DataTruncate = FALSE,
ZeroPadSeries = NULL,
SplitRatios = c(0.95, 0.05),
PartitionType = "random",
TreeMethod = "hist",
NThreads = max(1, parallel::detectCores() - 2L),
Timer = TRUE,
DebugMode = FALSE,
EvalMetric = "MAE",
LossFunction = "reg:squarederror",
GridTune = FALSE,
GridEvalMetric = "mae",
ModelCount = 30L,
MaxRunsWithoutNewWinner = 20L,
MaxRunMinutes = 24L * 60L,
EarlyStoppingRounds = 100L,
NTrees = 500L,
num_parallel_tree = 1,
LearningRate = 0.5,
MaxDepth = 6L,
MinChildWeight = 1,
SubSample = 0.7,
ColSampleByTree = 1,
alpha = 0.1,
lambda = 0.9,
SaveModel = FALSE,
ArgsList = NULL,
ModelID = "FC001",
TVT = NULL
)
data |
Supply your full series data set here |
XREGS |
Additional data to use for model development and forecasting. Data needs to be a complete series which means both the historical and forward looking values over the specified forecast window needs to be supplied. |
TimeWeights |
= NULL |
NonNegativePred |
TRUE or FALSE |
RoundPreds |
Rounding predictions to an integer value. TRUE or FALSE. Defaults to FALSE |
TrainOnFull |
Set to TRUE to train on full data |
TargetColumnName |
List the column name of your target variables column. E.g. 'Target' |
DateColumnName |
List the column name of your date column. E.g. 'DateTime' |
HierarchGroups |
Defaults to NULL. Character vector (or NULL) with the names of the columns that form the interaction hierarchy |
GroupVariables |
Defaults to NULL. Use NULL when you have a single series. Add in GroupVariables when you have a series for every level of a group or multiple groups. |
FC_Periods |
Set the number of periods you want to have forecasts for. E.g. 52 for weekly data to forecast a year ahead |
SaveDataPath |
Path to save modeling data |
TimeUnit |
List the time unit your data is aggregated by. E.g. '1min', '5min', '10min', '15min', '30min', 'hour', 'day', 'week', 'month', 'quarter', 'year' |
TimeGroups |
Select time aggregations for adding various time aggregated GDL features. |
TargetTransformation |
Run AutoTransformationCreate() to find the best transformation for the target variable. Tests YeoJohnson, BoxCox, and Asinh (also Asin and Logit for proportion target variables). |
Methods |
Choose from 'YeoJohnson', 'BoxCox', 'Asinh', 'Log', 'LogPlus1', 'Sqrt', 'Asin', or 'Logit'. If more than one is selected, the one with the best normalization pearson statistic will be used. Identity is automatically selected and compared. |
EncodingMethod |
Choose from 'binary', 'm_estimator', 'credibility', 'woe', 'target_encoding', 'poly_encode', 'backward_difference', 'helmert' |
AnomalyDetection |
NULL for not using the service. Otherwise, provide a list, e.g. AnomalyDetection = list('tstat_high' = 4, 'tstat_low' = -4) |
Lags |
Select the periods for all lag variables you want to create. E.g. c(1:5,52) or list('day' = c(1:10), 'weeks' = c(1:4)) |
MA_Periods |
Select the periods for all moving average variables you want to create. E.g. c(1:5,52) or list('day' = c(2:10), 'weeks' = c(2:4)) |
SD_Periods |
Select the periods for all moving standard deviation variables you want to create. E.g. c(1:5,52) or list('day' = c(2:10), 'weeks' = c(2:4)) |
Skew_Periods |
Select the periods for all moving skewness variables you want to create. E.g. c(1:5,52) or list('day' = c(2:10), 'weeks' = c(2:4)) |
Kurt_Periods |
Select the periods for all moving kurtosis variables you want to create. E.g. c(1:5,52) or list('day' = c(2:10), 'weeks' = c(2:4)) |
Quantile_Periods |
Select the periods for all moving quantiles variables you want to create. E.g. c(1:5,52) or list('day' = c(2:10), 'weeks' = c(2:4)) |
Quantiles_Selected |
Select from the following c('q5','q10','q15','q20','q25','q30','q35','q40','q45','q50','q55','q60','q65','q70','q75','q80','q85','q90','q95') |
Difference |
Set to TRUE to put the I in ARIMA, i.e. to difference the target variable before modeling |
FourierTerms |
Set to the max number of pairs |
CalendarVariables |
NULL, or select from 'second', 'minute', 'hour', 'wday', 'mday', 'yday', 'week', 'wom', 'isoweek', 'month', 'quarter', 'year' |
HolidayVariable |
NULL, or select from 'USPublicHolidays', 'EasterGroup', 'ChristmasGroup', 'OtherEcclesticalFeasts' |
HolidayLookback |
Number of days in range to compute number of holidays from a given date in the data. If NULL, the number of days are computed for you. |
HolidayLags |
Number of lags for the holiday counts |
HolidayMovingAverages |
Number of moving averages for holiday counts |
TimeTrendVariable |
Set to TRUE to have a time trend variable added to the model. The time trend is a numeric variable indicating the position of each record in the time series (by group). It starts at 1 for the earliest point in time and increments by one for each successive time point. |
DataTruncate |
Set to TRUE to remove records with missing values from the lags and moving average features created |
ZeroPadSeries |
NULL to do nothing. Otherwise, set to 'maxmax', 'minmax', 'maxmin', or 'minmin'. See TimeSeriesFill() (FillType argument) for a description of each option. |
SplitRatios |
E.g. c(0.7,0.2,0.1) for train, validation, and test sets |
PartitionType |
Select 'random' for random data partitioning or 'time' for partitioning by time frames |
TreeMethod |
Choose from 'hist', 'gpu_hist' |
NThreads |
Set the maximum number of threads you'd like to dedicate to the model run. E.g. 8 |
Timer |
Setting to TRUE prints out the forecast number while it is building |
DebugMode |
Setting to TRUE generates printout of all header code comments during run time of function |
EvalMetric |
Select from 'r2', 'RMSE', 'MSE', 'MAE' |
LossFunction |
Default is 'reg:squarederror'. Other options include 'reg:squaredlogerror', 'reg:pseudohubererror', 'count:poisson', 'survival:cox', 'survival:aft', 'aft_loss_distribution', 'reg:gamma', 'reg:tweedie' |
GridTune |
Set to TRUE to run a grid tune |
GridEvalMetric |
This is the metric used to find the threshold. Choose from 'poisson', 'mae', 'mape', 'mse', 'msle', 'kl', 'cs', 'r2' |
ModelCount |
Set the number of models to try in the grid tune |
MaxRunsWithoutNewWinner |
Number of consecutive runs without a new winner in order to terminate procedure |
MaxRunMinutes |
Default 24L*60L |
NTrees |
Select the number of trees you want to have built to train the model |
LearningRate |
Learning Rate |
MaxDepth |
Depth |
MinChildWeight |
Records in leaf |
SubSample |
Random forest-style setting: the fraction of training rows sampled for each tree |
ColSampleByTree |
Self explanatory |
alpha |
L1 regularization term on the weights. Defaults to 0.1 in this function (the XGBoost library default is 0). |
lambda |
L2 regularization term on the weights. Defaults to 0.9 in this function (the XGBoost library default is 1). |
SaveModel |
Logical. If TRUE, output ArgsList will have a named element 'Model' with the XGBoost model object |
ArgsList |
ArgsList is for scoring. Must contain named element 'Model' with an XGBoost model object |
ModelID |
Something to name your model if you want it saved |
TVT |
Passthrough |
See examples
Adrian Antico
Other Automated Panel Data Forecasting:
AutoCatBoostCARMA()
,
AutoH2OCARMA()
,
AutoLightGBMCARMA()
## Not run:
# End-to-end example: load a grouped weekly sales data set, clean it, and build
# an AutoXGBoostCARMA model on it, then inspect the evaluation metrics.
# Load the Walmart training data from a remote CSV (requires network access)
data <- data.table::fread('https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1')
# Ensure series have no missing dates (also remove series with more than 25% missing values)
data <- AutoQuant::TimeSeriesFill(
data,
DateColumnName = 'Date',
GroupVariables = c('Store','Dept'),
TimeUnit = 'weeks',
FillType = 'maxmax',
MaxMissingPercent = 0.25,
SimpleImpute = TRUE)
# Set negative Weekly_Sales values to 0
data <- data[, Weekly_Sales := data.table::fifelse(Weekly_Sales < 0, 0, Weekly_Sales)]
# Remove IsHoliday column
data[, IsHoliday := NULL]
# Create xregs (this is to include the categorical variables themselves instead of utilizing only their interaction)
xregs <- data[, .SD, .SDcols = c('Date', 'Store', 'Dept')]
# Change data types: convert Store and Dept to character in both tables
data[, ':=' (Store = as.character(Store), Dept = as.character(Dept))]
xregs[, ':=' (Store = as.character(Store), Dept = as.character(Dept))]
# Build forecast
XGBoostResults <- AutoXGBoostCARMA(
# Data Artifacts
data = data,
NonNegativePred = FALSE,
RoundPreds = FALSE,
TargetColumnName = 'Weekly_Sales',
DateColumnName = 'Date',
HierarchGroups = NULL,
GroupVariables = c('Store','Dept'),
TimeUnit = 'weeks',
TimeGroups = c('weeks','months'),
# Data Wrangling Features
EncodingMethod = 'binary',
ZeroPadSeries = NULL,
DataTruncate = FALSE,
# Two-way split; NOTE(review): 138 is presumably the number of periods per series - confirm
SplitRatios = c(1 - 10 / 138, 10 / 138),
# NOTE(review): the argument documentation lists 'random' and 'time'; confirm 'timeseries' is accepted
PartitionType = 'timeseries',
AnomalyDetection = NULL,
# Productionize
# NOTE(review): FC_Periods is documented as the number of periods to forecast;
# 0 here presumably means an evaluation-only run - confirm
FC_Periods = 0,
TrainOnFull = FALSE,
NThreads = 8,
Timer = TRUE,
DebugMode = FALSE,
SaveDataPath = NULL,
# Target Transformations
TargetTransformation = TRUE,
Methods = c('BoxCox', 'Asinh', 'Asin', 'Log',
'LogPlus1', 'Sqrt', 'Logit','YeoJohnson'),
Difference = FALSE,
# Features (lag and moving-average periods are given per time group)
Lags = list('weeks' = seq(1L, 10L, 1L),
'months' = seq(1L, 5L, 1L)),
MA_Periods = list('weeks' = seq(5L, 20L, 5L),
'months' = seq(2L, 10L, 2L)),
SD_Periods = NULL,
Skew_Periods = NULL,
Kurt_Periods = NULL,
Quantile_Periods = NULL,
Quantiles_Selected = c('q5','q95'),
XREGS = xregs,
FourierTerms = 4,
CalendarVariables = c('week', 'wom', 'month', 'quarter'),
HolidayVariable = c('USPublicHolidays','EasterGroup',
'ChristmasGroup','OtherEcclesticalFeasts'),
HolidayLookback = NULL,
HolidayLags = 1,
HolidayMovingAverages = 1:2,
TimeTrendVariable = TRUE,
# ML eval args
TreeMethod = 'hist',
EvalMetric = 'RMSE',
LossFunction = 'reg:squarederror',
# ML grid tuning
GridTune = FALSE,
ModelCount = 5,
MaxRunsWithoutNewWinner = 20L,
MaxRunMinutes = 24L*60L,
# ML args
NTrees = 300,
LearningRate = 0.3,
MaxDepth = 9L,
MinChildWeight = 1.0,
SubSample = 1.0,
ColSampleByTree = 1.0)
# Replace the MetricValue of the 'MSE' row with its square root (RMSE scale);
# the Metric label itself is left as 'MSE'.
# NOTE(review): the result is printed here and again on the next statement - confirm the double print is intended
UpdateMetrics <- print(
XGBoostResults$ModelInformation$EvaluationMetrics[
Metric == 'MSE', MetricValue := sqrt(MetricValue)])
print(UpdateMetrics)
# Per-group evaluation metrics, sorted by each metric in turn
# (R2 in descending order, the error metrics in ascending order)
XGBoostResults$ModelInformation$EvaluationMetricsByGroup[order(-R2_Metric)]
XGBoostResults$ModelInformation$EvaluationMetricsByGroup[order(MAE_Metric)]
XGBoostResults$ModelInformation$EvaluationMetricsByGroup[order(MSE_Metric)]
XGBoostResults$ModelInformation$EvaluationMetricsByGroup[order(MAPE_Metric)]
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.