# Prediction tools
# Setup --------------------------------------------------
# Packages
suppressMessages({
library(data.table)
library(tidyverse)
library(neuralnet)
library(lightgbm)
library(glmnet)
library(ggplot2)
library(mice)
library(caret)
library(caretEnsemble)
library(RANN) # KNN
library(forecast)
})
data = as.data.frame(fpp2::uschange)
# Hold out the last 5 values of Consumption as a pseudo test set (Yobs keeps the actuals)
Yobs = data[(nrow(data)-4):nrow(data),1]
data[(nrow(data)-4):nrow(data),1] = NA
load.dt = function(df){
# One hot encoding
dummy = dummyVars(~ ., data = df, fullRank = TRUE)
df = as.data.frame(predict(dummy, df))
# Preprocess the regressors only; the target keeps its original scale
# (character and date columns should be removed beforehand)
df.nn = preProcess(df[, -which(names(df) == "Consumption")], method = c("center", "scale", "YeoJohnson", "nzv", "knnImpute"))
# cbind names the target `df[, "Consumption"]`; janitor::clean_names() below
# turns it into `df_consumption`, the name used throughout the script
df = cbind(df[, "Consumption"], predict(df.nn, newdata = df[, -which(names(df) == "Consumption")]))
# Optionally log-transform the target (leftover from another dataset):
# df$SalePrice = log(df$SalePrice)
# Clean names
df = janitor::clean_names(df)
# Output
return(as.data.table(df))
}
# Import data
data = load.dt(data)
data = data %>% rownames_to_column("index") %>% as.data.table() # row index; TODO: remove
# Univariate time-series models on a decomposition object
# uschange is quarterly, so cast the observed part of the target to a ts
y_train = ts(data[!is.na(df_consumption)]$df_consumption, frequency = 4)
dec = forecast::mstl(y_train, lambda = 1)
dec[,"Trend"] %>% nnetar() %>% forecast(h = 90) %>% autoplot()
dec[,"Trend"] %>% tbats() %>% forecast(h = 90) %>% autoplot()
mdls = list()
mdls$tbats = tbats(y_train)
mdls$nn = nnetar(y_train)
mdls$arima = auto.arima(y_train)
mdls$ets = ets(y_train)
# Forecast the held-out horizon; these univariate models take h, not newdata
h = sum(is.na(data$df_consumption))
prd = list()
prd$tbats = forecast(mdls$tbats, h = h)
prd$nn = forecast(mdls$nn, h = h)
prd$arima = forecast(mdls$arima, h = h)
prd$ets = forecast(mdls$ets, h = h)
acc = list()
acc$tbats = accuracy(mdls$tbats)
acc$nn = accuracy(mdls$nn)
acc$arima = accuracy(mdls$arima)
acc$ets = accuracy(mdls$ets)
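# Collect the in-sample accuracy measures into a single comparison table
# (a small convenience step, not in the original):
do.call("rbind", lapply(acc, function(a) a["Training set", ]))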
res = do.call("cbind", lapply(prd, function(x) {x$mean})) %>%
as.data.frame() %>%
cbind(data[is.na(df_consumption), "index"]) %>%
full_join(data[!is.na(df_consumption), c("index", "df_consumption")],., by = "index") %>%
mutate_at(vars(index), as.numeric) %>%
arrange(index) %>%
gather("var","val", -1)
ggplot(res[res$index > 150, ], aes(x = index, y = val)) +
geom_point(aes(color = var)) +
geom_line(aes(group = var, color = var))
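# A simple combination (sketch): average the four univariate forecasts and
# score the blend against the held-out actuals in Yobs
avg_prd = rowMeans(do.call("cbind", lapply(prd, function(x) as.numeric(x$mean))))
forecast::accuracy(avg_prd, Yobs)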
# Target ("cible") = the remainder component of the decomposition
cible = data.frame("y" = dec[,"Remainder"])
dfcible = cbind(cible, data[!is.na(df_consumption),])
gdf = dfcible %>%
select(index, y, df_consumption) %>%
gather("var","val",-1)
ggplot(gdf, aes(x = index, y = val, group = var, fill = var, color = var)) +
geom_line() +
geom_point()
# Create lag values
# EXTEND ####
add.shift.revenu = function(df, lag, roll) {
df = copy(df) # work on a copy so the input data.table is not modified by reference
df[, (paste0("y_lag_", lag)) := shift(df_consumption, lag)]
df[, (paste0("y_lag_rollmean_", roll)) := frollmean(y_lag_1, roll, na.rm = TRUE, hasNA = TRUE)]
df[, (paste0("y_lag_rollsum_", roll)) := frollsum(y_lag_1, roll, na.rm = TRUE, hasNA = TRUE)]
df[, (paste0("y_lag_rollmax_", roll)) := frollapply(y_lag_1, roll, function(x) { max(x, na.rm = TRUE) })]
df[, (paste0("y_lag_rollmin_", roll)) := frollapply(y_lag_1, roll, function(x) { min(x, na.rm = TRUE) })]
df[, (paste0("y_lag_rollvar_", roll)) := frollapply(y_lag_1, roll, function(x) { var(x, na.rm = TRUE) })]
df[, y_trend := y_lag_1 / y_lag_rollmean_3] # last value relative to its short rolling mean
#df[, y_trend_12 := y_lag_1 / y_lag_12]
#df[, y_trend_norm := y_lag_1 / y_lag_rollmax_12]
return(df)
}
ex_data = add.shift.revenu(data, lag = c(1:3), roll = c(3,12))
# See it
head(data)
head(ex_data)
# Ensemble method ---------------------------------
trainData = data
extrainData = ex_data
trainData = mutate_all(trainData, as.numeric)
extrainData = mutate_all(extrainData, as.numeric)
# See available algorithms in caret
modelnames = paste(names(getModelInfo()), collapse = ', ')
modelnames
# Set the seed for reproducibility
set.seed(100)
# Stacking Algorithms - Run multiple algos in one call.
trainControl = trainControl(
method = "repeatedcv",
number = 5,
repeats = 3,
savePredictions = "final" # needed later by caretStack; preprocessing is already done in load.dt
)
algorithmList = c('lm','rf','gbm')
set.seed(100)
models = caretList(
df_consumption ~ .,
data = na.omit(trainData),
trControl = trainControl,
methodList = algorithmList
)
exmodels = caretList(
df_consumption ~ .,
data = na.omit(extrainData),
trControl = trainControl,
methodList = algorithmList
)
results = resamples(models)
results = summary(results)$statistics$RMSE
exresults = resamples(exmodels)
exresults = summary(exresults)$statistics$RMSE
rownames(exresults) = paste("Extend", rownames(exresults), sep = " - ")
results = do.call("rbind", list(results,exresults))
results = rownames_to_column(as.data.frame(results), "Model")
results = janitor::clean_names(results)
ggplot(results, aes(x = reorder(model,-mean), y = mean)) +
geom_boxplot(
aes(ymin = min, lower = x1st_qu,
middle = median, upper = x3rd_qu, # use the median, not the mean, as the middle bar
ymax = max),
stat = "identity"
) +
coord_flip() +
theme_minimal() +
labs(x = NULL, y = "RMSE")
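# Before stacking, check that the base learners' resampled predictions are not
# highly correlated; weakly correlated learners tend to ensemble better (sketch):
caret::modelCor(resamples(models))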
# Ensemble the predictions of `models` to form a new combined prediction based on glm
stack.glm = caretStack(
models,
method = "glm",
metric = "RMSE",
trControl = trainControl
)
print(stack.glm)
# Predict on the held-out rows (targets set to NA at the top; `submit` is only defined later)
stack_predicteds = predict(stack.glm, newdata = subset(trainData, is.na(df_consumption)))
head(stack_predicteds)
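# Sanity check (sketch): the target kept its original scale in load.dt, so the
# stacked predictions are comparable with Yobs; depending on the caretEnsemble
# version predict() returns a vector or a one-column table, hence unlist()
forecast::accuracy(as.numeric(unlist(stack_predicteds)), Yobs)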
# VAR
# Fit on the observed rows only and drop the row index from the endogenous variables
var_data = subset(trainData, !is.na(df_consumption), select = -index)
var_mod = vars::VAR(var_data, type = "none")
prd = predict(var_mod, n.ahead = 5)
# Compare with the held-out actuals (the target kept its original scale)
res = forecast::accuracy(prd$fcst$df_consumption[, "fcst"], Yobs)
print(res)
# Calculate impulse response
ir = vars::irf(
var_mod,
impulse = "income",          # variable names follow janitor::clean_names
response = "df_consumption",
n.ahead = 5,
ortho = FALSE,
cumulative = TRUE
)
# Plot
plot(ir)
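# A complementary diagnostic (sketch): forecast error variance decomposition
# shows how much of each variable's forecast variance comes from the others' shocks
plot(vars::fevd(var_mod, n.ahead = 5))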
# Autoregressive --------------------------------------------------------
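# Minimal sketch for this stub: a pure AR model on the training series,
# order chosen by AIC (y_train is defined in the univariate section above)
ar_fit = ar(y_train, order.max = 8)
forecast(ar_fit, h = 5)$mean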
# Linear model ----------------------------------------------------------------
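# Minimal sketch for this stub: OLS of the target on the preprocessed regressors
lm_fit = lm(df_consumption ~ income + production + savings + unemployment,
            data = na.omit(as.data.frame(data)))
summary(lm_fit)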
# Dynamic linear Models ------------------------------------------------------
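# Minimal sketch for this stub (assumes the dlm package is installed):
# a local-level state-space model estimated by maximum likelihood
if (requireNamespace("dlm", quietly = TRUE)) {
  build = function(par) dlm::dlmModPoly(order = 1, dV = exp(par[1]), dW = exp(par[2]))
  mle = dlm::dlmMLE(as.numeric(y_train), parm = c(0, 0), build = build)
  filt = dlm::dlmFilter(as.numeric(y_train), build(mle$par))
}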
# Neural Network -----------------------------------------------------------
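# Minimal sketch for this stub with the neuralnet package loaded in Setup;
# the hidden-layer size is an arbitrary choice
nn_df = na.omit(as.data.frame(data[, c("df_consumption", "income", "production", "savings", "unemployment"), with = FALSE]))
nn_fit = neuralnet(df_consumption ~ income + production + savings + unemployment,
                   data = nn_df, hidden = 3, linear.output = TRUE)
head(neuralnet::compute(nn_fit, nn_df[, -1])$net.result)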
# Gradient boosting method -------------------------------------------
# Remove empty col
rm = c(
names(which(sapply(data, function(col) var(col, na.rm = TRUE)) == 0)), # na.rm so NA-containing columns are checked too
names(which(colSums(is.na(data) | data == 0) > .75 * nrow(data)))
)
cols = setdiff(names(data), rm)
data = data[, ..cols]
# Inputs
y = "df_consumption"
x = setdiff(colnames(data), c(y))
#cats = c("item_id", "shop_id", "item_category_id", "month", "year", "quarter")
# Splitting stategy
nfold = 5
# Output
submit = data[is.na(df_consumption)]  # rows to predict (held-out targets)
data = data[!is.na(df_consumption)]   # rows with observed targets
# Initialize
train_index = list()
valid_index = list()
# replicability
set.seed(123)
# Index partition # TODO: switch to a rolling split weighted by time (see sketch below)
for ( i in seq_len(nfold)) {
tps = sample(unique(data$index), 0.8 * length(unique(data$index)))
train_index[[i]] = data[index %in% tps, which = TRUE]
valid_index[[i]] = data[!index %in% tps, which = TRUE]
}
# Clean
rm(tps)
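# Sketch of the time-ordered alternative flagged in the TODO above, using
# caret::createTimeSlices for rolling-origin windows (window sizes are arbitrary):
# slices = createTimeSlices(seq_len(nrow(data)), initialWindow = 120, horizon = 20, fixedWindow = FALSE)
# train_index = slices$train; valid_index = slices$test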
# Initialize
weight = c()  # per-fold best validation RMSE
fcst = list() # per-fold predictions (renamed from `forecast` to avoid shadowing forecast())
# Fit
for (i in seq_len(nfold)) {
valid = lgb.Dataset(data.matrix(data[valid_index[[i]], ..x]), label = data[valid_index[[i]],][[y]]) # categorical_feature = cats
dtrain = lgb.Dataset(data.matrix(data[train_index[[i]], ..x]), label = data[train_index[[i]],][[y]])
fit = lgb.train(
params = list(
objective = "regression",
metric = "rmse",
learning_rate = 0.1
),
data = dtrain,
nrounds = 100,
valids = list(val = valid), # lgb.train takes `valids`, not `valid`
early_stopping_rounds = 10,
verbose = 1
)
cat("Fold", i, "obtained Best score :", fit$best_score, "round :", fit$best_iter, " \n")
forecast[[i]] = predict(fit, data.matrix(submit[, ..x]))
weight[i] = fit$best_score
}
cat("Overall mean score :", mean(weight))
# Importance: top 20 features by gain
imp = head(lgb.importance(fit)[order(-Gain)], 20)
ggplot(imp, aes(reorder(Feature, Gain), Gain)) +
geom_col(fill = "steelblue") +
xlab("Feature") +
coord_flip() +
theme_minimal() +
theme(axis.text.y = element_text(size = 12))
# Stacked method
# -- Classification ------------------------------------------------------------
# Logistic regression ----------------------------------------------------------
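# This dataset has no categorical target; as a placeholder sketch, binarize the
# sign of consumption growth and fit a logistic regression (hypothetical example):
clf = as.data.frame(data)
clf$up = as.integer(clf$df_consumption > 0)
glm_fit = glm(up ~ income + production + savings + unemployment, data = clf, family = binomial())
summary(glm_fit)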