#' ARIMA model choice automation
#'
#' This function allows to select by informational criteria and performance results one or a serie of different models allowing for best performance in and out of sample. The function is a wrapper for forecast and tseries packages.
#'
#' @param seriex A univariate time serie to be studied
#' @param order The maximum limit for p, d and q for ordinary ARIMA part
#' @param seasonallim The maximum limit for P, D and Q for SARIMA part
#' @param crossvalid The point for cross-validation
#' @param limit A maximum number of models to be selected by criteria (3 in sample and 3 out of sample criterias are used)
#' @param method The estimation method to be applied (as in the timeserie package)
#' @param adf.plim ADF selection probability limit
#' @param partlim Part of autocorrelated lags in lags
#'
#' @return A list of selected models estimated with SARIMA with additional performance information
#'
#' @keywords ARIMAanalyse
#' @export
#' @examples
#' ARIMAanalyse = function(seriex,
#' orderlim = c(1, 1, 1), seasonallim = c(1, 1, 1),
#' crossvalid = c(2015, 4), limit = 3,
#' method = "ML", adf.plim = 0.1, partlim = 0.3)
#'
#' @import tidyverse
#' @import tseries
#' @import ggfortify
#' @import gridExtra
#' @import forecast
#' @import car
#' @import lmtest
#'
ARIMAanalyse = function(seriex, # Time serie
orderlim = c(1, 1, 1), # Difference limit
seasonallim = c(1, 1, 1), # Seasonal part limit
crossvalid = c(2015, 4), # Validation division point
limit = 3, # Number of models selected by criteria
method = "ML", # Estimation method
adf.plim = 0.1, # ADF selection probability limit
partlim = 0.3 # Part of autocorrelated lags in lags
) {
#######+###################
# Tests+# Exploratory plots
#######+###################
# Lagorder
lago = crossing(d = 0:orderlim[2], D = 0:seasonallim[2])
# Frame creation
tests = data.frame(matrix(NA,
nrow = nrow(lago),
ncol = 5))
names(tests) = c("d", "D",
"ADF", "KPSS.L", "KPSS.T")
tests[,1] = lago[,1]
tests[,2] = lago[,2]
# Statistics frame creation
statistics = data.frame(matrix(NA,
nrow = nrow(lago),
ncol = 6))
names(statistics) = c("d", "D",
"ADF", "KPSS.L", "KPSS.T",
"DW")
statistics[,1] = lago[,1]
statistics[,2] = lago[,2]
# Correlation
acorrelation = list()
pcorrelation = list()
# Plots
cfplots = list()
# Loop
for (i in 0:(nrow(lago)-1)) {
if (statistics[i+1,1] != 0) {
ser = diff(seriex,
lag = attributes(seriex)$tsp[3],
dif = tests[i+1,1])
} else {ser = seriex}
if (statistics[i+1,2] != 0) {
ser = diff(ser,
lag = 1,
dif = tests[i+1,2])
} else {ser = ser}
# Tests P-VALUES dataframe
tests[i+1, 3] = adf.test(ser)$p.value
tests[i+1, 4] = kpss.test(ser,null = "Level")$p.value
tests[i+1, 5] = kpss.test(ser,null = "Trend")$p.value
statistics[i+1, 3] = adf.test(ser)$statistic
# Tests STATISTICS dataframe
statistics[i+1, 4] = kpss.test(ser,null = "Level")$statistic
statistics[i+1, 5] = kpss.test(ser,null = "Trend")$statistic
statistics[i+1, 6] = durbinWatsonTest(as.integer(ser))
# Values
s_level =
qnorm((1 + 0.95)/2)/sqrt(sum(!is.na(ser)))
acorrelation[[i+1]] =
sum(abs(acf(ser, plot = F)$acf)/s_level > 1)/as.integer(10*log10(length(ser)))
pcorrelation[[i+1]] =
sum(abs(pacf(ser, plot = F)$acf)/s_level > 1)/as.integer(10*log10(length(ser)))
# CFs
cf = ggAcf(ser, plot = F)
pcf = ggPacf(ser, plot = F)
# Plots
placf = autoplot(cf) +
ggtitle(paste("Differences :", tests[i+1,1], tests[i+1,2], sep = " "))
plpacf = autoplot(pcf) +
ggtitle(paste("Differences :", tests[i+1,1], tests[i+1,2], sep = " "))
cfplots[[i+1]] = grid.arrange(placf, plpacf, nrow = 1)
}
##############################
# Analyse and preselect models
##############################
# Selection by correlation
nn = intersect(which(acorrelation < partlim),
which(pcorrelation < partlim))
# Selection by test results
an = tests[nn,] %>%
filter(ADF < adf.plim &
(KPSS.L >= 0.1 | KPSS.T >= 0.1)) %>%
select(d, D)
#######################
# Combinations creation
#######################
dbin = as.integer(an$d)
Dbin = as.integer(an$D)
comb = crossing(p = c(0:orderlim[1]),
d = dbin,
q = c(0:orderlim[3]),
P = c(0:seasonallim[1]),
D = Dbin,
Q = c(0:seasonallim[3]))
if (nrow(comb) == 0) {
results = list(tests = list(pval = tests,
stat = statistics),
plots = cfplots)
return(results)
stop("No satisfying difference orders detected")
}
comb = comb %>%
mutate(n = 1:nrow(comb))
##############
# Modelisation
##############
# Auxilary list
model = list()
## seriex = ts_data[,2]
serie = window(seriex, end = crossvalid)
# Cycle
for (i in 1:nrow(comb)) {
# Order selection
yearly = as.numeric(comb[i, 1:3])
season = as.numeric(comb[i, 4:6])
# Model
model[[i]] = Arima(serie,
order = yearly,
seasonal = season,
method = method)
}
# Results extraction
# Auxilary frame
frame = data.frame(AIC = NA, BIC = NA,
loglik = NA, pseudoR2 = NA, sigma2 = NA, n = NA)[-1,]
# Model description
for (i in 1:nrow(comb)) {
frame[i,] = frame[i,] %>%
mutate(AIC = model[[i]]$aic,
BIC = model[[i]]$bic,
loglik = model[[i]]$loglik,
sigma2 = model[[i]]$sigma2,
pseudoR2 = 0,
n = i)
}
# Loglik normalisation
lago = crossing(d = dbin, D = Dbin)
# Loop
for (i in 1:nrow(lago)) {
sell = left_join(lago[i,], comb, by = c("d", "D")) %>%
mutate(ko = i,
kk = as.integer(p == 0 &
q == 0 &
P == 0 &
Q == 0)) %>%
select(n, ko, kk)
frame[sell$n[sell$kk == 0],] =
frame[sell$n[sell$kk == 0],] %>%
mutate(pseudoR2 =
1 - loglik/frame$loglik[sell$n[sell$kk == 1]])
frame[sell$n[sell$kk == 1],] =
frame[sell$n[sell$kk == 1],] %>%
mutate(pseudoR2 = 0)
}
# Reorganisation
frameX = frame %>%
dplyr::arrange(AIC) %>%
mutate(ak = 1:nrow(frame)) %>%
dplyr::arrange(BIC) %>%
mutate(bk = 1:nrow(frame)) %>%
dplyr::arrange(pseudoR2) %>%
mutate(lk = nrow(frame):1) %>%
dplyr::arrange(sigma2) %>%
mutate(sk = 1:nrow(frame)) %>%
mutate(CC4 = ak + bk + lk + sk,
CC3 = ak + bk + lk,
CC2 = ak + bk)
################
# Optimums study
################
# Criteria
cca = frameX %>%
dplyr::arrange(AIC) %>%
head(limit) %>%
select(n) %>%
mutate(TYPE = "AIC")
ccb = frameX %>%
dplyr::arrange(BIC) %>%
head(limit) %>%
select(n) %>%
mutate(TYPE = "BIC")
# ccl = frameX %>%
# dplyr::arrange(lk) %>%
# head(limit) %>%
# select(n) %>%
# mutate(TYPE = "Log-Lik")
ccs = frameX %>%
dplyr::arrange(sigma2) %>%
head(limit) %>%
select(n) %>%
mutate(TYPE = "Sigma2")
ccx = rbind(cca, ccb) %>%
rbind(ccs)
# Presentation results
cc = left_join(ccx, comb)
##########################
# Testing models forecasts
##########################
# Aux
forec_list = list()
res = data.frame(erm = NA, er2m = NA,
sigma2 = NA, somme = NA, n = NA)[-1,]
# Loop
for (i in 1:length(model)) {
# Forecasts
forec = forecast(model[[i]],
h = (length(seriex) - length(serie)),
level = c(50, 90))
# Forecasts organisation
forec_list[[i]] = data.frame(pred = forec$mean,
orig = seriex[(length(serie)+1):length(seriex)]) %>%
mutate(er = pred - orig,
er2 = er^2)
# Results
res[i, ] = data.frame(erm = sum(forec_list[[i]]$er),
er2m = sum(forec_list[[i]]$er2),
sigma2 = model[[i]]$sigma2,
somme = sum(forec_list[[i]]$er2) + model[[i]]$sigma2,
n = i)
}
##############
# New criteria
##############
## Square error
rsq = res %>%
dplyr::arrange(er2m) %>%
head(limit)
## Mean error
rm = res %>%
dplyr::arrange(pmax(erm, -erm)) %>%
head(limit)
## Overall square error
rosq = res %>%
dplyr::arrange(somme) %>%
head(limit)
# Combination
rtot = rbind(rsq, rm) %>%
rbind(rosq)
# Merge
rcc = left_join(rtot, comb)
#######
# Plots
#######
################
# Study criteria
# Aux
I = 1:length(unique(cc$n))
x = 1
# Ploting frame
toplot1 = data.frame(matrix(NA,
ncol = length(I)+1,
nrow = length(window(seriex, start = crossvalid))))
names(toplot1)[2:(length(I)+1)] =
paste0("n", unique(as.integer(cc$n)))
names(toplot1)[1] = "original"
# toplot1[,x] = seriex
toplot1[,x] = window(seriex, start = crossvalid)
# Loop
for (i in unique(as.integer(cc$n))) {
x = x + 1
# Forecasts
forec = forecast(model[[i]],
h = (length(seriex) - length(serie)),
level = c(50, 90))
# series = append(serie, forec$mean)
# toplot1[, x] = series
toplot1[, x] = append(serie[length(serie)], forec$mean)
}
# Ts conversion
toplot1 = ts(toplot1,
start = attributes(serie)$tsp[2],
freq = attributes(serie)$tsp[3])
###############
# Test criteria
# Aux
I = 1:length(unique(rcc$n))
x = 1
# Ploting frame
toplot2 = data.frame(matrix(NA,
ncol = length(I)+1,
nrow = length(window(seriex, start = crossvalid))))
names(toplot2)[2:(length(I)+1)] =
paste0("n", unique(as.integer(rcc$n)))
names(toplot2)[1] = "original"
# toplot2[,x] = seriex
toplot2[,x] = window(seriex, start = crossvalid)
# Loop
for (i in unique(as.integer(rcc$n))) {
x = x + 1
# Forecasts
forec = forecast(model[[i]],
h = (length(seriex) - length(serie)),
level = c(50, 90))
# series = append(serie, forec$mean)
# toplot2[, x] = series
toplot2[, x] = append(serie[length(serie)], forec$mean)
}
# Ts conversion
toplot2 = ts(toplot2,
start = attributes(serie)$tsp[2],
freq = attributes(serie)$tsp[3])
#########
# Results
#########
crit = list(study = cc,
test = rcc)
plotx = list(study = toplot1,
test = toplot2)
tests = list(stat = statistics,
pval = tests)
results = list(crit = crit,
predict = plotx,
tests = tests,
models = model,
cfplots = cfplots)
# Output
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.