#' determines if a time-series dataset contains a single variable (as opposed to multiple variables)
#'
#' @param x a time series dataset
#'
#' library(fpp2)
#' rt_ts_is_single_variable(a10)
#'
#' @export
rt_ts_is_single_variable <- function(x) {
classes <- class(x)
return (length(classes) == 1 && classes == 'ts')
}
#' determines if a time-series dataset contains a single variable (as opposed to multiple variables)
#'
#' @param x a time series dataset
#'
#' library(fpp2)
#' rt_ts_is_multi_variable(melsyd)
#'
#' @export
rt_ts_is_multi_variable <- function(x) {
classes <- class(x)
return (any(classes == 'mts') || any(classes == 'msts'))
}
#' rather than e.g. 2007.333 for May of 2007 that `time()` returns, this returns `2007.01`
#'
#' @param ts_values a time series dataset
#'
#' library(fpp2)
#' get_time_period(a10)
#'
#' @export
rt_ts_get_time_period <- function(ts_values) {
values <- time(ts_values) %>% as.character()
splits <- str_split(values, pattern='\\.', simplify = TRUE)
if(ncol(splits) == 2) {
value_periods <- as.numeric(as.factor(splits[,2]))
max_freq_width <- max(nchar(as.character(value_periods)))
values <- paste0(splits[, 1],
'.',
str_pad(string=as.numeric(as.factor(splits[,2])), width=max_freq_width, side='left', pad='0'))
}
return (as.numeric(values))
}
#' returns a forumla string to pass `tslm()``
#'
#' @param dependent_variable name of the depedent variable in the dataset
#' @param independent_variables name of the indepedent variables in the dataset (including "trend" & "season" if applicable)
#' @param interaction_variables not supported yet
#' @param ex_ante_forecast_horizon is used when the regression model will be used to forecast future values, `ex_ante_forecast_horizon` specifies the number periods (i.e. horizon) that will be forecasted.
#'
#' Ex ante means the forecast will be a "true" forecast and, in this case, will only use lagged values that are lagged far enough back, that we can use them to predict into the future. (and/or trend/season variables)
#' For example, if we want to lag 4 periods (e.g. quarters) ahead, we must use values that are lagged 4 periods behind.
#'
#' Lagged variables must be in the for of `x_lag_y`, where `x` is the original variable name and `y` is the lag number.
#'
#' If `ex_ante_forecast_horizon` is set, only variables that are in the form of `x_lag_y` will be included, and additionally, only lagged variables where `y` >= `ex_ante_forecast_horizon` will be included.
#'
#' All independent variables are included of `ex_ante_forecast_horizon` is `NULL`
#' @examples
#'
#' rt_ts_lm_build_formula(dependent_variable = 'dataset',
#' independent_variables = c('trend', 'season'))
#'
#' @importFrom purrr map_chr
#' @importFrom stringr str_split
#' @export
rt_ts_lm_build_formula <- function(dependent_variable,
independent_variables=NULL,
interaction_variables=NULL,
ex_ante_forecast_horizon=NULL) {
# ex_ante_forecast_horizon specifies the number periods (i.e. horizon) we will be forecasting
# this ex_ante (i.e. true forecast) implementation requires the lags included in the regression model
# to be >= the ex_ante_forecast_horizon.... e.g. cannot have lag 5 included if we are predicting 10 periods ahead, or lag 9
# all independent variables are included of ex_ante_forecast_horizon is NULL
if(!is.null(ex_ante_forecast_horizon)) {
splits <- str_split(independent_variables, '_lag_', simplify=TRUE)
if(ncol(splits) == 1) { # no matches/lags found
valid_lags <- rep(FALSE, length(independent_variables)) # all invalid
} else {
lag_number <- as.numeric(splits[, 2])
lag_number[is.na(lag_number)] <- 0 # NA means not a lag
valid_lags <- lag_number >= ex_ante_forecast_horizon
}
keep_vars <- independent_variables %in% c('trend', 'season') | valid_lags
independent_variables <- independent_variables[keep_vars]
}
if(is.null(interaction_variables)) {
interaction_variables_formula <- ''
} else {
interaction_variables_formula <- paste(map_chr(interaction_variables, ~ paste(., collapse =' * ')),
collapse = ' + ')
}
if(is.null(independent_variables) || length(independent_variables) == 0) {
independent_variables_formula <- interaction_variables_formula
} else if(is.null(interaction_variables) || length(interaction_variables) == 0) {
independent_variables_formula <- paste(independent_variables, collapse =' + ')
} else {
independent_variables_formula <- paste(interaction_variables_formula,
'+',
paste(independent_variables, collapse =' + '))
}
return (paste(dependent_variable, '~', independent_variables_formula))
}
#' returns a new dataset containing the lagged values for specified variables
#'
#' @param dataset a time-series dataset (single- or multi-variable)
#' @param num_lags the number of lags to include (e.g. `3` will include 3 lagged variables for each applicable variable, `x_lag_1`, `x_lag_2`, `x_lag_3`)
#' @param lag_variables the variables (i.e. columns) to include lagged values for (the original column will be retained). A value of `NULL` is used for single-variable datasets (i.e. ts datasets that have no columns) or, for multi-variable datasets, will create lagged variables for *all* variables in the dataset.
#'
#' All variables that are not specified in `lag_variables` are removed.
#'
#' @param keep_variables the variables to retain in the dataset which are not specified in `lag_variables`. A value of `NULL` is used for single-variable datasets.
#'
#' @importFrom stats lag
#' @export
rt_ts_create_lagged_dataset <- function(dataset, num_lags=1, lag_variables=NULL, keep_variables=NULL) {
lagged_dataset <- NULL
new_columns <- NULL
if(rt_ts_is_single_variable(dataset)) {
lagged_dataset <- dataset
new_columns <- 'original_data'
for(lag_index in 1:num_lags) {
lagged_dataset <- cbind(lagged_dataset, stats::lag(dataset, lag_index * -1))
new_columns <- c(new_columns, paste0('data_lag_', lag_index))
}
} else {
if(is.null(lag_variables)) {
lag_variables <- colnames(dataset)
}
new_columns <- NULL
if(!is.null(keep_variables)) {
# add variables/columns we want to keep
for(column in keep_variables) {
lagged_dataset <- cbind(lagged_dataset, dataset[, column])
new_columns <- c(new_columns, column)
}
}
for(column in lag_variables) {
# add original column
lagged_dataset <- cbind(lagged_dataset, dataset[, column])
new_columns <- c(new_columns, column)
# add lags
for(lag_index in 1:num_lags) {
lagged_dataset <- cbind(lagged_dataset, stats::lag(dataset[, column], lag_index * -1))
new_columns <- c(new_columns, paste0(column, '_lag_', lag_index))
}
}
}
if(rt_ts_is_multi_variable(lagged_dataset)) {
colnames(lagged_dataset) <- new_columns
}
return (lagged_dataset)
}
#' adds lag variables, performs regrssion & forecasting, produces a plot
#'
#' @param dataset a time-series dataset (single- or multi-variable)
#' @param dependent_variable the dependent variable; can be null for single-var datasets
#' @param independent_variables the independent variables;
#'
#' `independent_variables` can be null for single-var datasets; for multi-variable datasets, a value of `NULL` will include all variables
#'
#' single-variable & multi-variable datasets can also include `trend` and/or `season` values
#'
#' @param ignore_last_x_periods exclude last x periods from model
#' @param lambda (simply passes value to `tslm`); according to their docs: "Box-Cox transformation parameter. If lambda="auto", then a transformation is automatically selected using BoxCox.lambda. The transformation is ignored if NULL. Otherwise, data transformed before model is estimated."
#' @param num_lags if specified, adds lag variables for all the independent_variables
#' @param include_dependent_variable_lag if num_lags >= 1 *and* a multi-variable dataset, then this indicates whether or not we should lag the dependent_variable and include it in the model (if the data is single-variable, then this is ignored, because the only thing to lag is the dependent_variable)
#' @param ex_ante_forecast_horizon if specified, indicates how far into the future we should forecast. All original variables (except for `trend`/`season`) and all lag values that have a lag less than `ex_ante_forecast_horizon` will be removed.
#' @param build_graphs if TRUE, will build various graphs and return those ggplot objects in the returned list
#' @param show_dataset_labels whether or not to show the values of the original dataaset
#' @param show_forecast_labels whether or not to show the forecast values
#'
#' @importFrom stringr str_split
#' @importFrom ggplot2 aes geom_point geom_text labs geom_boxplot
#' @importFrom dplyr mutate
#' @importFrom tidyr gather
#' @export
rt_ts_auto_regression <- function(dataset,
dependent_variable=NULL,
independent_variables=NULL,
ignore_last_x_periods=NULL,
lambda=NULL,
num_lags=NULL,
include_dependent_variable_lag=TRUE,
ex_ante_forecast_horizon=NULL,
build_graphs=TRUE,
show_dataset_labels=FALSE,
show_forecast_labels=TRUE) {
if(rt_ts_is_multi_variable(dataset)) {
if(is.null(independent_variables)) {
independent_variables <- colnames(dataset) %>% rt_remove_val(dependent_variable)
}
dataset <- dataset[, unique(c(dependent_variable, independent_variables) %>% rt_remove_val(c('trend', 'season')))]
# might have filtered dataset so it is now only a single-var; in which case we have to change the dependent variable
if(rt_ts_is_single_variable(dataset)) {
dependent_variable <- NULL
}
}
if(!is.null(num_lags)) {
stopifnot(num_lags >= 1)
}
if(rt_ts_is_single_variable(dataset)) {
# single variable dataset has to have independent variables to regress on,
# meaning that it has to have either lags-vars or trend/season vars
stopifnot(!is.null(num_lags) ||
(!is.null(independent_variables) && all(independent_variables %in% c('trend', 'season'))))
} else {
# dependent_variable cannot be null for multi-variable dataset
# if independent_variables is null for multi-variable dataset, set it to all columns except dependent_variable
stopifnot(!is.null(dependent_variable))
if(is.null(independent_variables)) {
column_names <- colnames(dataset)
independent_variables <- column_names[! column_names %in% dependent_variable]
}
}
# ex_ante_forecast_horizon means we are going to do a *true* forecast, and
# *either* we need lags (at least enough lags for the corresponding horizon), or we need `trend` and/or `season`
if(!is.null(ex_ante_forecast_horizon) && !any(independent_variables %in% c('trend', 'season'))) {
stopifnot(num_lags >= ex_ante_forecast_horizon)
}
original_dependent_variable <- dependent_variable
if(is.null(dependent_variable)) {
dependent_variable <- 'data'
}
# add lags to dataset
if(!is.null(num_lags)) {
# if the *original* dataset is single-var, then we'll need to rename the dependent_variable to the name that rt_ts_create_lagged_dataset gives
if(rt_ts_is_single_variable(dataset)) {
dependent_variable = 'original_data' # this is what rt_ts_create_lagged_dataset names the column for single-var
}
keep_variables <- original_dependent_variable
lag_variables <- independent_variables[! independent_variables %in% c('trend', 'season')]
if(include_dependent_variable_lag && rt_ts_is_multi_variable(dataset)) {
# if we are going to lag on the dependent vairable as well, then we have to include that in
# lag_variables and remove it from keep_variables (keep_variables keeps the specified variables
# but doesn't lag them)
lag_variables <- c(dependent_variable, lag_variables)
keep_variables <- NULL
}
dataset <- rt_ts_create_lagged_dataset(dataset,
num_lags=num_lags,
lag_variables=lag_variables,
keep_variables=keep_variables) # for single-var we will have changed it from NULL
# now, the column names of the new dataset (other than the dependent_variable) contains all of the
# original and new independent_variables
# but, it won't have trend/season, so if that was in the independent variables, we need to add it back
trend_season <- independent_variables[independent_variables %in% c('trend', 'season')] # get trend and/or season if they are in the list of independent variables
column_names <- colnames(dataset)
independent_variables <- c(trend_season, column_names[! column_names %in% dependent_variable])
}
reg_formula <- rt_ts_lm_build_formula(dependent_variable=dependent_variable,
independent_variables=independent_variables,
ex_ante_forecast_horizon=ex_ante_forecast_horizon)
# extract all variables used in the regression formula (and thus the regression model)
independent_vars_used <- str_split(str_split(reg_formula, pattern=' ~ ', simplify=TRUE )[2], ' \\+ ')[[1]]
stopifnot(!is.null(independent_vars_used) && length(independent_vars_used) != 0 && !any(independent_vars_used == ""))
# now figure out which vars were used in the regression model, aside from trend/season (i.e. which variables we need to subset)
independent_vars_used_from_dataset <- independent_vars_used[! independent_vars_used %in% c('trend', 'season')]
# if any variables are used beside trend/season, we want the dataset containing only those values
# because, for example, if we do an na.omit on variables that have a lot of missing data but aren't even
# used in the model, we'll be needlessly restricting the data avaialable to the model
if(length(independent_vars_used_from_dataset) > 0) {
# we just want the dataset that is actually used by the regression model
dataset <- dataset[, c(dependent_variable, independent_vars_used_from_dataset)]
}
# for LAGGED DATA na.omit will remove the NAs at the beginning of the dataset (because a column that lags x
# periods doesn't have values for the first x periods)
# but will also remove the NAs at the end of the dataset which is needed to we restrict the training to the
# original time horizon i.e. ending period
training_data <- na.omit(dataset)
if(!is.null(ignore_last_x_periods)) {
if(rt_ts_is_single_variable(training_data)) {
training_data <- head(training_data, length(training_data) - ignore_last_x_periods)
} else {
training_data <- head(training_data, nrow(training_data) - ignore_last_x_periods)
}
}
ts_model <- tslm(formula=reg_formula, data=training_data, lambda=lambda)
ts_forecast <- NULL
# we can only forecast for ex_ante regressions, otherwise we don't have the required data
if(!is.null(ex_ante_forecast_horizon)) {
# if we only forecast trend and/or season (i.e. 0 variables from the original dataset)
# then we don't need to extract other data, just forecast however many periods
if(length(independent_vars_used_from_dataset) == 0) {
ts_forecast <- forecast(ts_model, h=ex_ante_forecast_horizon)
} else { # `dataset` has 1 or more variables (including the dependent_variable)
# now we want the dataset of just the independent variables with the NAs removed, because the last
# `ex_ante_forecast_horizon` number of periods of this dataset will be the first
# `ex_ante_forecast_horizon` number of periods after the last available period in the original training set
# i.e. will be the `ex_ante_forecast_horizon` periods we can predict
# it is possible that we stripped out all but one variable i.e. it is now single-var dataset
temp <- na.omit(dataset[, independent_vars_used_from_dataset])
# i want to verify that the end() of temp is exactly `ex_ante_forecast_horizon` periods after the
# last period we trained on
# period_add_duration <- function(period, data_frequency, periods_to_add) {
#
# years_to_add <- floor((period[2] + periods_to_add - 1) / data_frequency)
# ending_period <- (period[2] + periods_to_add) %% data_frequency
#
# if(ending_period == 0) {
# ending_period <- data_frequency
# }
#
# period[1] <- period[1] + years_to_add
# period[2] <- ending_period
#
# return (period)
# }
#
# # period_add_duration(period=c(2008, 6), data_frequency=12, periods_to_add=5)
# # period_add_duration(period=c(2008, 6), data_frequency=12, periods_to_add=6)
# # period_add_duration(period=c(2008, 6), data_frequency=12, periods_to_add=7)
# # period_add_duration(period=c(2008, 6), data_frequency=12, periods_to_add=24)
# # period_add_duration(period=c(2008, 6), data_frequency=12, periods_to_add=24 + 5)
# # period_add_duration(period=c(2008, 6), data_frequency=12, periods_to_add=24 + 6)
# # period_add_duration(period=c(2008, 6), data_frequency=12, periods_to_add=24 + 7)
#
# # need to "add" a period plus a "duration"
# expected_end <- period_add_duration(period=end(training_data),
# data_frequency=frequency(training_data),
# periods_to_add=ex_ante_forecast_horizon)
#stopifnot(all(expected_end == end(temp)))
if(rt_ts_is_single_variable(temp)) {
# rt_ts_is_single_variable == TRUE should mean that we only have one independent variable, confirm logic
stopifnot(length(independent_vars_used_from_dataset) == 1)
num_periods <- length(temp)
new_data <- subset(temp, start=num_periods - ex_ante_forecast_horizon + 1)
# even though it is a single-var dataset, the tmls was given a data.frame, with a particular column
new_data <- as.data.frame(new_data)
colnames(new_data) <- independent_vars_used_from_dataset
} else {
num_periods <- nrow(temp)
# new_data should now contain a ts dataset of the periods we want to predict and the (lagged)
# values necessary to predict them
start_index <- num_periods - ex_ante_forecast_horizon + 1
new_data <- as.data.frame(subset(temp, start=start_index))
# if ex_ante_forecast_horizon is 1 (i.e. we're only subsetting one period), then subsetting the
# data when there are multiple variables and then turning that data into a data.frame results
# in a data frame that has the columns as rows...
if(ex_ante_forecast_horizon == 1 && length(independent_vars_used_from_dataset) > 1) {
new_data <- as.data.frame(t(new_data))
}
}
# the number of rows in new_data should match the number of periods we are forecasting
stopifnot(nrow(new_data) == ex_ante_forecast_horizon)
ts_forecast <- forecast(ts_model, newdata=new_data)
}
}
ggplot_fit <- NULL
ggplot_actual_vs_fit <- NULL
ggplot_residual_vs_fit <- NULL
ggplot_residual_vs_predictors <- NULL
ggplot_residual_vs_season <- NULL
ggplot_residual_vs_period <- NULL
if(build_graphs) {
######################################################################################################
# datasets the graphs will share
######################################################################################################
if(rt_ts_is_single_variable(dataset)) {
dependent_values <- dataset
} else {
dependent_values <- dataset[, dependent_variable]
}
fitted_values <- fitted(ts_model)
residual_values <- residuals(ts_model)
# df_fit_data contains original dataset with fitted/residuals/etc.
df_fit_data <- NULL
if(length(independent_vars_used_from_dataset) > 0) {
df_fit_data <- cbind(dataset[, independent_vars_used_from_dataset],
Actual=dependent_values,
Fitted=fitted_values,
Season=cycle(dependent_values),
Time=rt_ts_get_time_period(dependent_values),
Residuals=residual_values) %>%
na.omit() %>%
as.data.frame() %>%
mutate(Season=as.factor(Season))
colnames(df_fit_data) <- c(independent_vars_used_from_dataset,
'Actual', 'Fitted', 'Season', 'Time', 'Residuals')
} else {
df_fit_data <- cbind(Actual=dependent_values,
Fitted=fitted_values,
Season=cycle(dependent_values),
Time=rt_ts_get_time_period(dependent_values),
Residuals=residual_values) %>%
na.omit() %>%
as.data.frame() %>%
mutate(Season=as.factor(Season))
}
######################################################################################################
# FIT GRAPH (AND FORECAST IF APPLICABLE)
######################################################################################################
# build plot; use data before it was striped of NAs (`dataset`) so when we plot the regression points,
# we can see what the model used or did not use
forecasted_dataset <- NULL
if(!is.null(ts_forecast)) {
forecasted_dataset <- as.ts(ts_forecast)
}
ggplot_fit <- rt_ts_plot_time_series(dataset=dependent_values,
fitted_dataset=ts_model$fitted.values,
forecasted_dataset=forecasted_dataset,
show_values=show_dataset_labels,
show_points=show_dataset_labels,
show_fit_points_values = FALSE,
show_forecasted_points_values = show_forecast_labels)
ggplot_fit <- ggplot_fit + labs(y=dependent_variable)
######################################################################################################
# Actual vs Fitted
######################################################################################################
data_freq <- frequency(dependent_values)
if(data_freq == 1 || data_freq > 12) {
ggplot_actual_vs_fit <- ggplot(df_fit_data, aes(x=Fitted, y=Actual))
} else {
ggplot_actual_vs_fit <- ggplot(df_fit_data, aes(x=Fitted, y=Actual, color=Season))
}
label_threshold <- as.numeric(quantile(abs(residual_values), .97)) # get value at 97th percentile of data
ggplot_actual_vs_fit <- ggplot_actual_vs_fit +
geom_point() +
geom_abline(intercept=0, slope=1) +
geom_text(aes(label=ifelse(abs(Residuals) > label_threshold, Time, '')), # show extreme values
size=3,
vjust=-0.2,
hjust=-0.2) +
geom_smooth(method='loess', group=1, se=FALSE, size=0.5, colour='red') +
labs(title='Actual vs. Fitted Values',
x='Fitted',
y='Actual Values',
caption='\nBlack line shows perfect alignment between `fitted` and `actual` values.\nRed line shows smoothed trend between `fitted` and `actual`.\nData points with large residuals are labed.')
######################################################################################################
# Residuals vs Fitted
######################################################################################################
data_freq <- frequency(dependent_values)
if(data_freq == 1 || data_freq > 12) {
ggplot_residual_vs_fit <- ggplot(df_fit_data, aes(x=Fitted, y=Residuals))
} else {
ggplot_residual_vs_fit <- ggplot(df_fit_data, aes(x=Fitted, y=Residuals, color=Season))
}
label_threshold <- as.numeric(quantile(abs(residual_values), .97)) # get value at 97th percentile of data
ggplot_residual_vs_fit <- ggplot_residual_vs_fit +
geom_point() +
geom_abline(intercept=0, slope=0) +
geom_text(aes(label=ifelse(abs(Residuals) > label_threshold, Time, '')), # show extreme values
size=3,
vjust=-0.2,
hjust=-0.2) +
geom_smooth(method='loess', group=1, se=FALSE, size=0.5, colour='red') +
labs(title='Residuals vs. Fitted Values',
x='Fitted',
y='Residuals Values',
caption='\nBlack line is reference for 0 residual/error.\nRed line shows smoothed trend between `residuals` and `fitted` values.\nData points with large residuals are labed.')
######################################################################################################
# Residuals vs Predictors
######################################################################################################
if(length(independent_vars_used_from_dataset) > 0) {
ggplot_residual_vs_predictors <- df_fit_data[, c('Residuals', independent_vars_used_from_dataset)] %>%
gather(key, value, -Residuals) %>%
mutate(key=factor(key, levels = independent_vars_used_from_dataset)) %>%
ggplot(aes(x=value, y=Residuals, group=key)) +
geom_point() +
geom_smooth(method='loess') +
facet_wrap( ~ key,
scales='free_x',
ncol=min(2, length(independent_vars_used_from_dataset))) +
labs(title='Residuals vs. Predictors',
x='Predictor Values')
}
######################################################################################################
# Residuals vs Period
######################################################################################################
floor_times <- floor(df_fit_data$Time)
if(length(unique(floor_times)) <= 50) {
ggplot_residual_vs_period <- ggplot(df_fit_data %>% mutate(Floor_Times=as.factor(floor_times)),
aes(x=Floor_Times, y=Residuals)) +
geom_boxplot() +
labs(title = 'Residuals vs. Period',
x='Period')
}
######################################################################################################
# Residuals vs Season
######################################################################################################
if(frequency(dataset) > 1 && frequency(dataset) <= 52) {
ggplot_residual_vs_season <- ggplot(df_fit_data, aes(x=Season, y=Residuals)) +
geom_boxplot() +
labs(title = 'Residuals vs. Season')
}
}
return (list(formula=reg_formula,
model=ts_model,
forecast=ts_forecast,
plot_fit=ggplot_fit,
plot_actual_vs_fitted=ggplot_actual_vs_fit,
plot_residuals_vs_fitted=ggplot_residual_vs_fit,
plot_residuals_vs_predictors=ggplot_residual_vs_predictors,
plot_residuals_vs_period=ggplot_residual_vs_period,
plot_residuals_vs_season=ggplot_residual_vs_season))
}
#' Returns friendly axis ticks. e.g. for monthly it returns a factor with values c("2019-01", "2019-02").
#' If the frequency of the dataset is anything other than 52, 12, 4, it returns the raw time() values.
#'
#' @param dataset ts dataset
#' @importFrom stats time
#' @importFrom timeDate frequency
#' @importFrom stringr str_pad
#' @export
rt_ts_get_friendly_time_ticks <- function(dataset) {
time_ticks <- as.numeric(time(dataset))
num_periods <- frequency(dataset)
if(num_periods == 52 || num_periods == 12 || num_periods == 4) {
years <- floor(time_ticks)
time_minor <- time_ticks - years
friendly_minor <- (num_periods * time_minor) + 1
num_zeros_to_pad <- nchar(trunc(max(friendly_minor)))
friendly_ticks <- paste0(years,
'-',
str_pad(string=round(friendly_minor),
width=num_zeros_to_pad, pad="0", side='left'))
friendly_ticks <- factor(friendly_ticks, levels=friendly_ticks, ordered=TRUE)
return (friendly_ticks)
} else {
return (time_ticks)
}
}
#' plots a time-series dataset
#'
#' @param dataset ts dataset
#' @param show_values show text value above each point in time
#' @param show_points show points
#' @param show_dates show dates above each point in time
#' @param include_last_point default TRUE; if FALSE, removes the last (most recent) point; if float (0 < include_last_point < 1) then the value represents how far into the time period it is and projects accordingly; e.g. if the last point has a value of `100` and include_last_point is `0.25` (i.e. 25 percent of the way through the current period), then `100` will be adjusted to `400`.
#' @param fitted_dataset adds a fit line based on the dataset passed in
#' @param show_fit_points_values shows the fitted points and values if fitted_dataset is not null
#' @param forecasted_dataset adds forecasted line points and
#' @param show_forecasted_points_values shows the fitted points and values if forecasted_dataset is not null
#' @param y_zoom_min adjust (i.e. zoom in) to the y-axis; sets the minimum y-value for the adjustment
#' @param y_zoom_max adjust (i.e. zoom in) to the y-axis; sets the maximum y-value for the adjustment
#' @param facet_multi_variables if TRUE each variable gets it's own section
#' @param line_colors if NULL then black for single-var and rt_colors for multi-var
#' @param text_size sets the text size
#' @param base_size sets the base size
#'
#' @importFrom magrittr "%>%"
#' @importFrom tidyr gather
#' @importFrom dplyr filter mutate
#' @importFrom purrr map_chr
#' @importFrom ggplot2 ggplot aes geom_line expand_limits scale_y_continuous scale_color_manual theme_light labs geom_text geom_point theme element_text coord_cartesian facet_wrap
#' @importFrom stringr str_remove
#' @importFrom scales pretty_breaks format_format
#' @importFrom stats time
#' @importFrom timeDate frequency
#' @export
rt_ts_plot_time_series <- function(dataset,
show_values=FALSE,
show_points=FALSE,
show_dates=FALSE,
include_last_point=TRUE,
fitted_dataset=NULL,
show_fit_points_values=FALSE,
forecasted_dataset=NULL,
show_forecasted_points_values=TRUE,
y_zoom_min=NA,
y_zoom_max=NA,
facet_multi_variables=FALSE,
line_colors=NULL,
text_size=4,
base_size=11) {
if(include_last_point == FALSE) {
# remove last point
if(rt_ts_is_single_variable(dataset)) {
dataset <- head(dataset, length(dataset) - 1)
} else {
dataset <- head(dataset, nrow(dataset) - 1)
}
}
# if fitted_dataset and/or forecasted_dataset is NULL then they should be ignored in ts.union
if(!is.null(fitted_dataset) || !is.null(forecasted_dataset)) {
stopifnot(rt_ts_is_single_variable(dataset))
dataset <- ts.union(dataset, fitted_dataset, forecasted_dataset)
current_col_names <- colnames(dataset)
colnames(dataset) <- current_col_names %>%
str_remove(pattern = 'fitted_dataset.') %>%
str_remove(pattern = 'forecasted_dataset.')
}
df_dataset <- as.data.frame(dataset)
num_periods <- frequency(dataset)
friendly_ticks <- rt_ts_get_friendly_time_ticks(dataset)
# 36 to allow for 3 years of monthly
if(is.factor(friendly_ticks) && length(friendly_ticks) > 36) {
df_dataset$ticks <- as.numeric(time(dataset))
} else {
df_dataset$ticks <- friendly_ticks
}
df_dataset$friendly_ticks <- friendly_ticks
if(!is.null(fitted_dataset) || !is.null(forecasted_dataset)) {
ignore_columns <- c('ticks', 'friendly_ticks')
if(!is.null(fitted_dataset)) {
df_dataset <- df_dataset %>%
rename(Fitted=fitted_dataset)
ignore_columns <- c(ignore_columns, 'Fitted')
}
if(!is.null(forecasted_dataset)) {
df_dataset <- df_dataset %>%
rename(Forecast=`Point Forecast`)
ignore_columns <- c(ignore_columns,
"Forecast",
"Lo 80",
"Hi 80",
"Lo 95",
"Hi 95")
}
df_dataset <- df_dataset %>%
rename(Actual=dataset) %>%
gather(key, value, -ignore_columns)
if(!is.null(fitted_dataset)) {
df_dataset$pretty_fitted <- map_chr(df_dataset$Fitted,
~ prettyNum(., big.mark=",", preserve.width="none", digits=4, scientific=FALSE))
}
if(!is.null(forecasted_dataset)) {
df_dataset$pretty_forecast <- map_chr(df_dataset$Forecast,
~ prettyNum(., big.mark=",", preserve.width="none", digits=4, scientific=FALSE))
}
} else {
df_dataset <- df_dataset %>%
gather(key, value, -c(ticks, friendly_ticks))
}
df_dataset$pretty_value <- map_chr(df_dataset$value,
~ prettyNum(., big.mark=",", preserve.width="none", digits=4, scientific=FALSE))
ggplot_object <- df_dataset %>%
ggplot(aes(x=ticks, y=value, color=key, group=key)) +
geom_line(na.rm = TRUE) +
expand_limits(y=0) +
scale_y_continuous(breaks=pretty_breaks(10),
labels = format_format(big.mark=",", preserve.width="none", digits=4, scientific=FALSE)) +
theme_light(base_size = base_size) +
labs(y=NULL,
x=NULL,
color=NULL)
if(!is.null(fitted_dataset)) {
ggplot_object <- ggplot_object +
geom_line(aes(y=Fitted, color="Fitted"), na.rm = TRUE)
if(show_fit_points_values) {
ggplot_object <- ggplot_object +
geom_point(aes(y=Fitted, color="Fitted"), na.rm = TRUE, show.legend = FALSE) +
geom_text(aes(y=Fitted, label=pretty_fitted, color="Fitted"), na.rm = TRUE,
vjust=-0.5, size=text_size, check_overlap = TRUE, show.legend = FALSE)
}
}
if(!is.null(forecasted_dataset)) {
ggplot_object <- ggplot_object +
geom_line(aes(y=Forecast, color="Forecast"), na.rm = TRUE)
ggplot_object <- ggplot_object +
geom_ribbon(aes(ymin=`Lo 80`, ymax=`Hi 80`, color=NULL),
fill="red", alpha=0.25, show.legend=FALSE, na.rm = TRUE) +
geom_ribbon(aes(ymin=`Lo 95`, ymax=`Hi 95`, color=NULL),
fill="red", alpha=0.15, show.legend=FALSE, na.rm = TRUE)
if(show_forecasted_points_values) {
ggplot_object <- ggplot_object +
geom_point(aes(y=Forecast, color="Forecast"), na.rm = TRUE, show.legend = FALSE) +
geom_text(aes(y=Forecast, label=pretty_forecast, color="Forecast"), na.rm = TRUE,
vjust=-0.5, size=text_size, check_overlap = TRUE, show.legend = FALSE)
}
}
if (is.numeric(include_last_point)) {
# keep last point but project it.
rt_stopif(include_last_point <= 0)
rt_stopif(include_last_point >= 1)
projections <- df_dataset %>%
filter(ticks == max(ticks)) %>%
mutate(value = value / include_last_point)
projections$pretty_value <- map_chr(projections$value,
~ prettyNum(., big.mark=",", preserve.width="none", digits=4,
scientific=FALSE))
ggplot_object <- ggplot_object +
geom_point(data = projections, aes(x=ticks, y=value), color="red", size=2.2, show.legend = FALSE) +
geom_point(data = projections, aes(x=ticks, y=value), size=1.3, show.legend = FALSE) +
geom_text(data = projections,
aes(x=ticks, y=value, label=pretty_value),
color="red", vjust=-0.5, size=text_size, check_overlap = TRUE, show.legend = FALSE) +
labs(caption='Point with Red outline is projection based on how far into the time period we are.')
}
if(!is.null(fitted_dataset) || !is.null(forecasted_dataset)) {
line_colors <- 'black'
if(!is.null(fitted_dataset)) {
line_colors <- c(line_colors, 'blue')
}
if(!is.null(forecasted_dataset)) {
line_colors <- c(line_colors, 'red')
}
} else if(rt_ts_is_multi_variable(dataset) && is.null(line_colors)) {
line_colors <- c(rt_colors(), rt_colors())
} else if (is.null(line_colors)) {
line_colors <- 'black'
}
ggplot_object <- ggplot_object +
scale_color_manual(values=line_colors, na.value = '#2A3132')
if(show_values) {
ggplot_object <- ggplot_object +
geom_text(aes(label = pretty_value),
check_overlap = TRUE, vjust=-0.5, size=text_size, na.rm = TRUE, show.legend = FALSE)
}
if(show_dates) {
if(show_values) {
show_values_hjust <- -0.5
} else {
show_values_hjust <- -0.1
}
ggplot_object <- ggplot_object +
geom_text(aes(label = friendly_ticks),
check_overlap = TRUE, vjust=--0.5, hjust=show_values_hjust, size=text_size,
na.rm = TRUE, angle=90, show.legend = FALSE)
}
if(show_points) {
ggplot_object <- ggplot_object + geom_point(size=1, na.rm = TRUE, show.legend = FALSE)
}
if(is.factor(df_dataset$ticks)) {
ggplot_object <- ggplot_object + theme(axis.text.x = element_text(angle = 30, hjust = 1))
}
if(length(unique(df_dataset$key)) == 1 && is.null(fitted_dataset) && is.null(forecasted_dataset)) {
ggplot_object <- ggplot_object + theme(legend.position = "none")
}
if(!is.na(y_zoom_min) || !is.na(y_zoom_max)) {
# if one of the zooms is specified then we hae to provide both, so get corresponding min/max
if(is.na(y_zoom_min)) {
y_zoom_min <- min(dataset, na.rm = TRUE)
}
if(is.na(y_zoom_max)) {
y_zoom_max <- max(dataset, na.rm = TRUE)
}
ggplot_object <- ggplot_object + coord_cartesian(ylim = c(y_zoom_min, y_zoom_max))
}
if(facet_multi_variables && rt_ts_is_multi_variable(dataset)) {
ggplot_object <- ggplot_object + facet_wrap(~ key, ncol=1, scales = 'free_y')
}
return (ggplot_object)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.