#' @title Daily Time Series Forecast By Hour
#' @description Uses recursive partitioning or random forest and auto arima to make daily forecasts.
#'
#' Requires the data to already be summarized into hourly amounts. Make sure that there are no missing periods of coviate data if covariates are being included in the model.
#' @param df The unquoted name of the dataframe that you want to summarize.
#' @param dateColumn The quoted name of the column that contains the hourly dates. Should be in POSIXct format.
#' @param valueColumn The quoted name of the column that has the daily values to be forecasted.
#' @param covs Optional. A dataframe of covariates. This dataframe should include at least two columns: (1) A date column with the number of rows equal to the number of periods to forecast into the future, and (2) a column with values for each day.
#' @param algo Quoted name of algorithm to use. Defaults to rpart, which is fast. The other option is randomForest.
#' @param lossFunction Defaults to "mape" (mean absolute percentage error). Other option is "mae" (mean absolute error).
#' @param period The number of periods forecasted into the future.
#' @param seasonalPeriods The other periods, in addition to the period parameter, that may be influential
#' @param K The number of fourier terms. Must be one lesss than the number of periods
#' @param returnMePlot Return the model evaluation plot? T = Yes.
#' @param returnYoyPlot Return the year-over-year plot? T = Yes.
#' @return List the contains a dataframe with the test, training, and forecasted data (dataFor), a dataframe with only the forecasted data (dataForOnly), variable importance plot if randomForest is selected (viPlot), loss (either mape or mae), model evaluation plot (modelEvalPlot), and year over year dataframe including the forecast (yoySales).
#' @export
#'
tsForecastHourly <- function(df, dateColumn, valueColumn, covs = NULL, algo = "rpart", lossFunction = "mape", period = 28, seasonalPeriods = c(7, 364), K = 2, returnMePlot = F, returnYoyPlot = F){
# covdf should be aa dataframe that has at least two columns: date, and value of covariate that includes only future covariate observations
outputList <- list()
# Reorder and rename the columns
if(!is.null(covs)){
# If there is a covariance matrix, include its column names
covMatrixCols <- colnames(covs) %>% .[which(!. %in% "begDay")]
df <- df[,c(dateColumn, valueColumn, covMatrixCols)]
colnames(df) <- c("begDay", "TotalSales", covMatrixCols)
}else{
df <- df[,c(dateColumn, valueColumn)]
colnames(df) <- c("begDay", "TotalSales")
}
# Make sure that there are no missing dates
daysDiff <- difftime(max(df$begDay), min(df$begDay), units = "days") %>% as.numeric() %>% ceiling()
daties <- data.frame(begDay = rep(seq.Date(from = min(df$begDay) %>% as.Date(), to = max(df$begDay) %>% as.Date(), by = "day"), each = 24)
, hod = rep(seq(0,23, by = 1), times = daysDiff)
, stringsAsFactors = F) %>%
dplyr::mutate(
begDay = paste0(as.character(begDay), "T", hod, ":00:00") %>% ymd_hms
) %>%
dplyr::select(-hod)
tsAll <- full_join(df, daties, by = "begDay") %>%
dplyr::arrange(begDay)
tsAll$TotalSales[is.na(tsAll$TotalSales)] <- 0
tsAll %<>% mutate(
# Indicators based on date
dow = lubridate::wday(begDay)
, dom = lubridate::mday(begDay)
, moy = lubridate::month(begDay)
, christmas = ifelse(dom == 25 & moy == 12, 1, 0)
, july4 = ifelse(dom == 4 & moy == 7, 1, 0)
, newYears = ifelse(dom == 1 & moy == 1, 1, 0)
# For yoy plot
, bdy = lubridate::year(begDay) %>% as.character()
, begDaySy = as.character(begDay)
, begDaySy = ymd(paste(as.character(lubridate::year(Sys.Date())), substr(begDaySy, 6, 7), substr(begDaySy, 9, 10), sep = "-"))
)
# Create a matrix of covariates
if(!is.null(covs)){
# Prepare the covs df to be used by setting the same date name, putting it in the first column, and adding in the holidays
covs <- covs[,c(dateColumn, setdiff(colnames(covs), dateColumn))]
colnames(covs)[1] <- "begDay"
covs %<>% mutate(
# Indicators based on date
dow = lubridate::wday(begDay)
, dom = lubridate::mday(begDay)
, moy = lubridate::month(begDay)
, christmas = ifelse(dom == 25 & moy == 12, 1, 0)
, july4 = ifelse(dom == 4 & moy == 7, 1, 0)
, newYears = ifelse(dom == 1 & moy == 1, 1, 0)
# For yoy plot
, bdy = lubridate::year(begDay) %>% as.character()
, begDaySy = as.character(begDay)
, begDaySy = ymd(paste(as.character(lubridate::year(Sys.Date())), substr(begDaySy, 6, 7), substr(begDaySy, 9, 10), sep = "-"))
) %>%
dplyr::select(-dom, -bdy, -begDaySy)
# Prepare the historical covariate information for training
xReg <- tsAll[,colnames(covs)] %>%
bind_rows(., covs) %>% # Bind the future dates
dplyr::select(-begDay)
down <- model.matrix(~ as.factor(xReg$dow)) %>% .[,-1] # Dummify day of week into six columns
dowInData <- table(xReg$dow) %>% as.data.frame(stringsAsFactors = F) %>% .[-1,] %>% dplyr::arrange(Var1)
colnames(down) <- lubridate::wday(as.numeric(dowInData$Var1), label = T)
xReg <- cbind(xReg, down) # Add to covariate matrix
moyn <- model.matrix(~ as.factor(xReg$moy)) %>% .[,-1] # Dummify month of year into 11 columns
monthsInData <- table(xReg$moy) %>% as.data.frame(stringsAsFactors = T) %>% .[-1,] %>% dplyr::arrange(Var1)
colnames(moyn) <- lubridate::month(as.numeric(monthsInData$Var1))
xReg <- cbind(xReg, moyn) %>% # Add to covariate matrix
dplyr::select(-dow, -moy)
xReg %<>% as.matrix(nrow = nrow(xReg), ncol = ncol(xReg))
}
# Key row numbers
rowTrainStart <- period + 1 # We need the first rows for lag features
rowTrainEnd <- nrow(tsAll) - period # We need the last rows for the test
rowTestStart <- rowTrainEnd + 1
rowTestEnd <- nrow(tsAll)
# STL (seasonal, trend, left over) decomposition----
# Idea is to extract the seasonal components so that they can be forecasted
dataTs <- stats::ts(tsAll$TotalSales[1:rowTrainEnd], freq = period) # Create a time series object
decompTs <- stats::stl(dataTs, s.window = "periodic", robust = T)$time.series # Seasonal decomposition of time series by Loess. The time series portion returns a dataframe with the seasonal, trend, and remainder for each day.
if(!is.null(covs)){
xRegTrain <- xReg[1:rowTrainEnd,]
nzc <- nearZeroVar(xRegTrain)
xRegTrain <- xRegTrain[,-c(nzc)]
# findLinearCombos(xRegTrain)
}
# For future
dataTsf<- stats::ts(tsAll$TotalSales, freq = period) # Create a time series object
decompTsf <- stats::stl(dataTsf, s.window = "periodic", robust = T)$time.series # Seasonal decomposition of time series by Loess. The time series portion returns a dataframe with the seasonal, trend, and remainder for each day.
if(!is.null(covs)){
xRegf <- xReg[1:rowTestEnd,]
nzc <- nearZeroVar(xRegf)
xRegf <- xRegf[,-c(nzc)]
}
# Get double or triple season fourier terms----
seasonalPeriods <- c(seasonalPeriods, period) %>% .[order(.)]
dataMsts <- forecast::msts(tsAll$TotalSales[1:rowTrainEnd], seasonal.periods = seasonalPeriods) # Create multiple seasonal objects. Use one for each period that is influential. E.g., 7 if a week before is influential, 364 if a year before is influential, etc.
# K is the number of fourier terms. More is better, but takes exponentially longer.
fuur <- fourier(dataMsts, K = rep(K, times = length(seasonalPeriods))) # If K is 2, then it made two pairs of sine and cosine signals for each seasonal.period. The resulting matrix will have the same number of rows as the training matrix, and # of seasonal.periods * 2 * K columns
# # For the future
dataMstsf <- forecast::msts(tsAll$TotalSales, seasonal.periods = seasonalPeriods) # Create multiple seasonal objects. Use one for each period that is influential. E.g., 7 if a week before is influential, 364 if a year before is influential, etc.
# K is the number of fourier terms. More is better, but takes exponentially longer.
fuurf <- forecast::fourier(dataMstsf, K = rep(K, times = length(seasonalPeriods))) # If K is 2, then it made two pairs of sine and cosine signals for each seasonal.period. The resulting matrix will have the same number of rows as the training matrix, and # of seasonal.periods * 2 * K columns
# Forecast the trend part of the time series----
trendPart <- stats::ts(decompTs[,2]) # The second column is the trend column.
if(is.null(covs)){
trendFit <- auto.arima(trendPart)
trendFor <- forecast(trendFit, h = period)$mean # Forecasts the trend out the number of days as the period
}else{
trendFit <- auto.arima(trendPart, xreg = xRegTrain) # Fits arima model to the trend and also uses covariates.
trendFor <- forecast(trendFit, xreg = xReg[rowTestStart:rowTestEnd,colnames(xRegTrain)])$mean # Forecasts the trend using covariates.
}
# For the future
trendPartf <- stats::ts(decompTsf[,2]) # The second column is the trend column.
if(is.null(covs)){
trendFitf <- forecast::auto.arima(trendPartf) # Fits arima model to the trend.
trendForf <- forecast::forecast(trendFitf, period)$mean # Forecasts the trend out the number of days as the period
}else{
trendFitf <- forecast::auto.arima(trendPartf, xreg = xRegf) # Fits arima model to the trend.
trendForf <- forecast::forecast(trendFitf, xreg = xReg[(rowTestEnd+1):nrow(xReg),colnames(xRegf)])$mean # Forecasts the trend out the number of days as the period
}
# Make final feature and construct the training matrix----
newTotalSales <- rowSums(decompTs[,c(1,3)]) # Detrended Total sales by adding up the seasonal and the remainder (leaves out the trend) for each row of the training data.
lagSeas <- decompTs[1:(rowTrainEnd - period), 1] # Seasonal part of time series as lag feature.
if(is.null(covs)){
matrixTrain <- data.frame(TotalSales = newTotalSales[rowTrainStart:rowTrainEnd] # Gets the last window*period rows of the newTotalSales
, fuur[rowTrainStart:rowTrainEnd,] # Gets the last rows of the fourier transformations to be ivs
, Lag = lagSeas) # Gets the lagged seasonal part of the time series as an iv
}else{
matrixTrain <- data.frame(TotalSales = newTotalSales[rowTrainStart:rowTrainEnd] # Gets the last window*period rows of the newTotalSales
, fuur[rowTrainStart:rowTrainEnd,] # Gets the last rows of the fourier transformations to be ivs
, Lag = lagSeas # Gets the lagged seasonal part of the time series as an iv
, xReg[rowTrainStart:rowTrainEnd,]
)
}
# For the future
newTotalSalesf <- rowSums(decompTsf[,c(1,3)]) # Detrended Total sales by adding up the seasonal and the remainder (leaves out the trend) for each row of the training data.
lagSeasf <- decompTsf[1:rowTrainEnd, 1] # Seasonal part of time series as lag feature.
if(is.null(covs)){
matrixTrainf <- data.frame(TotalSales = newTotalSalesf[rowTrainStart:rowTestEnd]
, fuurf[rowTrainStart:rowTestEnd,] # Gets the last rows of the fourier transformations to be ivs
, Lag = lagSeasf) # Gets the lagged seasonal part of the time series as an iv
}else{
matrixTrainf <- data.frame(TotalSales = newTotalSalesf[rowTrainStart:rowTestEnd]
, fuurf[rowTrainStart:rowTestEnd,] # Gets the last rows of the fourier transformations to be ivs
, Lag = lagSeasf # Gets the lagged seasonal part of the time series as an iv
, xReg[rowTrainStart:rowTestEnd,]
)
}
# Function to measure accuracy: mape = mean absolue percentage error, mae = mean absolute error----
if(lossFunction == "mape"){
lossF <- function(real, pred){
real[real == 0] <- 1 # Replace 0s with ones or else one zero sets it to infinity
return(100* mean(abs((real - pred)/real)))
}
}else if(lossFunction == "mae"){
lossF <- function(real, pred){
return(mean(abs(real - pred)))
}
}
# Fit the model----
if(algo == "rpart"){
fit <- rpart::rpart(TotalSales ~ ., data = matrixTrain
, control = rpart.control(minsplit = 2 # Minimum number of observations before a tree can be split. 2 is the min.
, maxdepth = 30 # Max depth of the tree. Max is 30.
, cp = 0.000001 # Threshhold for deciding if each branch fulfills conditions for further processing
)
)
# For the future
fitf <- rpart::rpart(TotalSales ~ ., data = matrixTrainf
, control = rpart.control(minsplit = 2 # Minimum number of observations before a tree can be split. 2 is the min.
, maxdepth = 30 # Max depth of the tree. Max is 30.
, cp = 0.000001 # Threshhold for deciding if each branch fulfills conditions for further processing
)
)
}else if(algo == "randomForest"){
fit <- randomForest::randomForest(TotalSales ~ ., data = matrixTrain
, ntree = 1000
, mtry = 8 # Number of variables sampled in each split
, nodesize = 3 # Minimum size for end node. Larger values means fewer trees are grown. Default is 5 for regression
, importance = T
)
impd <- caret::varImp(fit) %>%
dplyr::mutate(
varName = row.names(.)
) %>%
dplyr::arrange(Overall)
if(nrow(impd) > 40){
impd <- impd[1:40,]
}
clustFit <- kmeans(impd$Overall, centers = floor(nrow(impd)/3), nstart = 20)
impd$cluster <- clustFit$cluster %>% as.factor()
impd$varName <- factor(impd$varName, levels = impd$varName)
outputList$viPlot <- ggplot(impd, aes(x = varName, y = Overall, fill = cluster)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(x = "Variable", y = "Overall Importance", title = "Variable Importance of Random Forest Model") +
guides(fill = F) +
theme_minimal()
# For the future
fitf <- randomForest::randomForest(TotalSales ~ ., data = matrixTrainf
, ntree = 2000
, mtry = 8 # Number of variables sampled in each split
, nodesize = 3 # Minimum size for end node. Larger values means fewer trees are grown. Default is 5 for regression
, importance = T
)
}
# Create forecasts----
# For the test period to evaluate of the model
testLag <- decompTs[(rowTrainEnd - period + 1):rowTrainEnd, 1] # Get lagged seasonal values from the decomposed training data for the test data
fuurTest <- forecast::fourier(dataMsts
, K = rep(K, times = length(seasonalPeriods))
, h = period # The number of periods out to forecast
)
if(is.null(covs)){
matrixTest <- data.frame(fuurTest
, Lag = testLag)
}else{
matrixTest <- data.frame(fuurTest
, Lag = testLag
, xReg[(rowTrainEnd - period + 1):rowTrainEnd,]
)
}
forecastedData <- stats::predict(fit, matrixTest) + trendFor # Predict detrended time series part with tree2 model and then add the trend part of the time series forecasted by ARIMA
# For the future, the part we really want
testLagf <- decompTsf[rowTestStart:rowTestEnd, 1] # Get lagged seasonal values from the decomposed training data for the test data
fuurTestf <- forecast::fourier(dataMstsf
, K = rep(K, times = length(seasonalPeriods))
, h = period # The number of periods out to forecast
)
if(is.null(covs)){
matrixTestf <- data.frame(fuurTestf
, Lag = testLagf)
}else{
matrixTestf <- data.frame(fuurTestf
, Lag = testLagf
, xReg[rowTestStart:rowTestEnd,]
)
}
forecastedDataf <- stats::predict(fitf, matrixTestf) + trendForf # Predict detrended time series part with tree2 model and then add the trend part of the time series forecasted by ARIMA
# Plot the results and compare it with real values----
# Training, test, and forecast data
dataFor <- data.frame(TotalSales = c(tsAll$TotalSales
, forecastedData
, forecastedDataf)
, begDay = c(tsAll$begDay
, tsAll$begDay[rowTestStart:rowTestEnd]
, tsAll$begDay[rowTestStart:rowTestEnd] + lubridate::hours(period))
, type = c(rep("Train Data", rowTrainEnd)
, rep("Test Data", rowTestEnd - rowTestStart + 1)
, rep("Forecast", rowTestEnd - rowTestStart + period + 1)
)
)
dataFor$TotalSales <- as.numeric(dataFor$TotalSales) %>% round(0)
algoLoss <- lossF(tsAll$TotalSales[rowTestStart:rowTestEnd], forecastedData) %>% round(2)
outputList$loss <- algoLoss
if(lossFunction == "mape"){
plotSubtitle <- paste0("Mean absolute percentage error for ", toupper(algo), " = ", algoLoss, "%.")
}else if(lossFunction == "mae"){
plotSubtitle <- paste0("Mean absolute error for ", toupper(algo), " = ", algoLoss, ".")
}
plotTitle <- paste0("Forecast from ", toupper(algo))
plotCaption <- ""
if(!is.null(covs)){
plotTitle <- paste0(plotTitle, " with Covariates")
plotCaption <- paste0("Covariates: ", paste(setdiff(colnames(covs), "begDay"), collapse = ", "))
}
if(returnMePlot == T){
modelEvalPlot <- ggplot2::ggplot(dataFor, aes(x = begDay, y = TotalSales, color = type)) +
geom_line(alpha = 0.75) +
ggforce::facet_zoom(x = begDay %in% c(tsAll$begDay[rowTestStart:rowTestEnd], tsAll$begDay[rowTestStart:rowTestEnd] + lubridate::hours(period)), zoom.size = 1.2) +
labs(x = dateColumn, y = valueColumn, title = plotTitle, subtitle = plotSubtitle, caption = plotCaption)
outputList$modelEvalPlot <- modelEvalPlot
}
if(returnYoyPlot == T){
yoySales <- dataFor %>%
dplyr::arrange(begDay, type) %>%
dplyr::mutate(
keep = ifelse(begDay == lead(begDay), 0, 1)
, Year = lubridate::year(begDay)
, Year = ifelse(type == "Forecast", paste(Year, type, sep = " "), Year) %>% factor()
, begDayYoy = as.character(begDay)
, begDayYoy = paste0(lubridate::year(Sys.Date()), "-", substr(begDayYoy, 6,7), "-", substr(begDayYoy, 9, 10), substr(begDayYoy, 11, nchar(begDayYoy))) %>% ymd_hms()
) %>%
dplyr::filter(keep == 1) %>%
dplyr::select(-keep)
outputList$yoySales <- yoySales
}
# Data to return----
colnames(dataFor) <- c(valueColumn, dateColumn, "type")
outputList$dataFor <- dataFor
# Only the forecast data
dataForOnly <- data.frame(begDay = tsAll$begDay[rowTestStart:rowTestEnd] + lubridate::hours(period)
, TotalSales = forecastedDataf
, stringsAsFactors = F)
dataForOnly$TotalSales <- as.numeric(dataForOnly$TotalSales) %>% round(0)
colnames(dataForOnly) <- c(dateColumn, valueColumn)
outputList$dataForOnly <- dataForOnly
return(outputList)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.