TSAT: Time Series Analysis Tools (TSAT)

Documented in FcGradientBoosting

# res = FcGradientBoosting(DataVec, SplitAt, Time, ForecastHorizon)
#
# DESCRIPTION
# Gradient Boosting as published by [Chen/Guestrin, 2016] is a machine learning technique for regression and classification problems, 
# which produces a prediction model in the form of the ensemble of weak prediction models, typically decision trees. 
# However it is claimed that an ensemble of weak predictors when combined together should hive a strong model with superior accuracy.
#
# INPUT
# DataVec             [1:n] numerical vector of time series data
# SplitAt             Index of row where the DataVec is divided into test and train data. If not given n is used
# Time                [1:n] character vector of Time in the length of data
# Frequency           If aggregation required. Can be either "days", "weeks", "months" or "quarters" or "years", 
#                     see ConvertNumerical2TSobject. Default is "days"
# ForecastHorizon     Scalar defining the timesteps to forecast ahead
# OPTIONAL
# PlotIt              FALSE (default), do nothing. TRUE: plots the forecast versus the validation set.
#
# OUTPUT
# Forecast            [1:ForecastHorizon] Forecast generated by the model 
# Model               Model object of class xgbar
# TestData            [1:n-SplitAt, 1:2] Data frame of test data and test time 
# TrainData           [1:SplitAt, 1:2] Data frame of training data and training time 
#
# DETAILS
# "Gradient Boosting algorithm is a machine learning technique used for building predictive tree-based models.Boosting is an ensemble 
# technique in which new models are added to correct the errors made by existing models. Models are added sequentially until no further 
# improvements can be made. The ensemble technique uses the tree ensemble model which is a set of classification and regression trees 
# (CART)." [see url] 
# Do not confuse with bagging: simple ensembling technique in which we build many independent predictors/models/learners and combine 
# them using some model averaging techniques. (e.g. weighted average, majority vote or normal average.
# It should be noted that in order to forecast more than one step saisonality extracted by either an arma model (\pkg{gbm}) or and 
# fourier model (\pkg{forecastxgb}) has to be used. 
#
# Author: MCT

FcGradientBoosting=function(DataVec,SplitAt,Time,ForecastHorizon,Frequency='days',FUN=sum,PlotIt=FALSE){
  
  # devtools::install_github("ellisp/forecastxgb-r-package/pkg")

  if(missing(SplitAt)) {
    warning('Input for SplitAt was not given. Setting SplitAt to length(DataVec)')
    SplitAt = length(DataVec)
  }
  if(missing(ForecastHorizon)) {
    warning('Input for ForecastHorizon was not given. Setting ForecastHorizon to 1')
    ForecastHorizon = 1
  }
  if(missing(Time)) {
    Time = 1:length(DataVec)
    warning('No input for Time was given, will use 1:length(DataVec) for Time')
  }
  
  inputs = checkInputForecasting(DataVec, Time, SplitAt, ForecastHorizon, PlotIt)
  DataVec = inputs$DataVec
  Time = inputs$Time
  SplitAt = inputs$SplitAt
  ForecastHorizon = inputs$ForecastHorizon
  PlotIt = inputs$PlotIt
  
  
  if (!is.function(FUN)) {
    stop('FUN has to be a function')
  }
  
  n = length(DataVec)
  
  errorMessage = packageMissingError("forecastxgb", n, SplitAt)
  if(errorMessage != FALSE){
    return(
      list(
        Model = NULL,
        FitStats = NULL,
        Forecast = rep(NaN, length(DataVec)-SplitAt),
        ForecastTrain = rep(NaN, SplitAt),
        Info = errorMessage
      )
    )
  }
  errorMessage = packageMissingError("forecast", n, SplitAt)
  if(errorMessage != FALSE){
    return(
      list(
        Model = NULL,
        FitStats = NULL,
        Forecast = rep(NaN, length(DataVec)-SplitAt),
        ForecastTrain = rep(NaN, SplitAt),
        Info = errorMessage
      )
    )
  }

  switch(Frequency,
    days={
      DT=data.frame(Time=Time,Datavector=DataVec)
      N=nrow(DT)
      Splitted=list(
        TrainingSet = head(DT$Data,SplitAt),
        TrainingTime = head(DT$Time,SplitAt),
        TestSet = tail(DT$Data,N-SplitAt),
        TestTime = tail(DT$Time,N-SplitAt))
    },
    months={
      DT=TSAT::aggregateDays2Months(Time,DataVec,FUN = FUN,Header = c('Time','Data'))
      N=nrow(DT)
      Splitted=list(
        TrainingSet = head(DT$Data,SplitAt),
        TrainingTime = head(DT$Time,SplitAt),
        TestSet = tail(DT$Data,N-SplitAt),
        TestTime = tail(DT$Time,N-SplitAt))
    },
    weeks={
      DT=TSAT::aggregateDays2Weeks(Time,DataVec,FUN = FUN,Header = c('Time','Data'))
      N=nrow(DT)
      Splitted=list(
        TrainingSet = head(DT$Data,SplitAt),
        TrainingTime = head(DT$Time,SplitAt),
        TestSet = tail(DT$Data,N-SplitAt),
        TestTime = tail(DT$Time,N-SplitAt))
    },
    {stop('please chose correct frequency')}
  )
   

  TS=TSAT::ConvertNumerical2TSobject(Time =Splitted$TrainingTime ,NumericVectorOrMatrix = Splitted$TrainingSet,Frequency = Frequency)

  model <- forecastxgb::xgbar(TS)
  
  # fc <- forecastxgb::forecast.xgbar(model, h = Horizon)
  
  # SplitAt = ForecastHorizon, SplitAt < ForecastHorizon, SplitAt > ForecastHorizon
  
  fc <- forecast::forecast(model, h = ForecastHorizon)
  
  n=nrow(DT)
  
  if(PlotIt){
   # plot(fc)
    if(length(Splitted$TestTime)==0) {
      x = Splitted$TrainingTime
      y = Splitted$TrainingSet
      #plot(Splitted$TrainingTime,Splitted$TrainingSet,type='l',col='black')
    } else {
      x = Splitted$TestTime
      y = Splitted$TestSet
    }
    plot(x,y,type='l',col='black')
    
    #points(Splitted$TestTime,fc$mean,type='l',col='red')
    if(ForecastHorizon>n-SplitAt){
      time_interval = mean(diff(x))
      extended_time = c(Splitted$TestTime, seq(from = x[length(x)], by=time_interval,
                                            length.out = ForecastHorizon-(n-SplitAt)))
      points(extended_time,fc$mean,type='l',col='red')
    } else {
      points(x,fc$mean,type='l',col='red')
    }  
  }
  requireNamespace('forecast')
  
  if(ForecastHorizon > n-SplitAt) {
    acc=forecast::accuracy(fc$mean[1:length(Splitted$TestSet)],Splitted$TestSet)
  } else if(ForecastHorizon < n-SplitAt) {
    acc=forecast::accuracy(fc$mean,Splitted$TestSet[1:ForecastHorizon])
  } else {
    acc=forecast::accuracy(fc$mean,Splitted$TestSet)
  }
  
  return(list(
    Forecast = fc$mean,
    Model = model,
    ForecastStats=fc,
    TestData =data.frame(Time=Splitted$TestTime,TestSet=Splitted$TestSet),
    TrainData=data.frame(Time=Splitted$TrainingTime,TrainingSet=Splitted$TrainingSet)
  ))
}