# R/benchmark_ts_models.R
# Defines functions: add.shift.revenu, load.dt

# Prediction tools

# Setup --------------------------------------------------
# Packages
suppressMessages({
   library(data.table)
   library(tidyverse)  
   library(neuralnet)
   library(lightgbm)
   library(glmnet)  
   library(mice)
   library(caret)  
   library(caretEnsemble) 
   library(RANN) # KNN
   library(forecast)
})

# US quarterly consumption data (fpp2): hold out the last five Consumption
# values as the test target and blank them in the working copy
data = as.data.frame(fpp2::uschange)

Yobs = data[(nrow(data) - 4):nrow(data), 1]
data[(nrow(data) - 4):nrow(data), 1] = NA
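
# Sanity check: these are the five held-out observations every model below
# will try to recover.
print(Yobs)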
   
load.dt = function(df){
   
   # One-hot encoding
   dummy = dummyVars(~ ., data = df, fullRank = TRUE)
   df = as.data.frame(predict(dummy, df))
   
   # Preprocess every column except the target
   df.nn = preProcess(df[, -which(names(df) == "Consumption")], method = c("center", "scale", "YeoJohnson", "nzv", "knnImpute"))
   df = cbind(df_consumption = df$Consumption, predict(df.nn, newdata = df[, -which(names(df) == "Consumption")]))
   
   # Optionally log-transform the target here (skipped: uschange growth rates can be negative)
   
   # Clean names (the target keeps the name df_consumption used throughout)
   df = janitor::clean_names(df) 
   
   # Output  
   return(as.data.table(df)) 
   
}

# Import data
data = load.dt(data)

data = as.data.table(rownames_to_column(data, "index")) # TODO: remove (temporary row index)


# Univariate time-series models on the decomposed series


# uschange is quarterly, so rebuild a ts with frequency 4 before decomposing
y_ts = ts(data[!is.na(df_consumption)]$df_consumption, frequency = 4, start = c(1970, 1))
dec = forecast::mstl(y_ts, lambda = 1)
dec[,"Trend"] %>% nnetar() %>% predict(h = 90) %>% autoplot()
dec[,"Trend"] %>% tbats() %>% predict(h = 90) %>% autoplot()

mdls = list()
mdls$tbats = tbats(y_ts)
mdls$nn = nnetar(y_ts)
mdls$arima = auto.arima(y_ts)
mdls$ets = ets(y_ts)

# Univariate models forecast h steps ahead; they take no newdata
h_out = nrow(data[is.na(df_consumption)])
prd = list()
prd$tbats = forecast(mdls$tbats, h = h_out)
prd$nn = forecast(mdls$nn, h = h_out)
prd$arima = forecast(mdls$arima, h = h_out)
prd$ets = forecast(mdls$ets, h = h_out)

acc = list()
acc$tbats = accuracy(prd$tbats)
acc$nn = accuracy(prd$nn)
acc$arima = accuracy(prd$arima)
acc$ets = accuracy(prd$ets)
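
# Optional: stack the in-sample error rows for a side-by-side comparison
# (without actuals, accuracy() on a forecast object reports training-set
# errors only).
round(do.call(rbind, lapply(acc, function(a) a["Training set", ])), 3)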

res = do.call("cbind", lapply(prd, function(x) {x$mean})) %>% 
   as.data.frame() %>% 
   cbind(data[is.na(df_consumption), "index"]) %>% 
   full_join(data[!is.na(df_consumption), c("index", "df_consumption")],., by = "index") %>% 
   mutate_at(vars(index), as.numeric) %>% 
   arrange(index) %>% 
   gather("var","val", -1)

ggplot(res[res$index>150,], aes(x = index, y = val)) +
   geom_point(aes(fill = var, color = var)) + 
   geom_line(aes(group = var, color = var))

# cible = the target: model the remainder left after removing trend/seasonality
cible = data.frame("y" = dec[, "Remainder"])

dfcible = cbind(cible, data[!is.na(df_consumption),])

gdf = dfcible %>% 
   select(index, y, df_consumption) %>% 
   gather("var","val",-1)

ggplot(gdf, aes(x = index, y = val, group = var, fill = var, color = var)) + 
   geom_line() + 
   geom_point()

# Create lagged features
# EXTEND ####

add.shift.revenu = function(df, lag, roll) {
   
   # Work on a copy: `:=` modifies by reference and would otherwise also
   # alter the caller's data.table (making data and ex_data identical)
   df = copy(df)
   
   # Lagged target, one column per lag
   df[, (paste0("y_lag_", lag)) := shift(df_consumption, lag)] 
   
   # Rolling statistics on the first lag, one column per window
   df[, (paste0("y_lag_rollmean_", roll)) := frollmean(y_lag_1, roll, na.rm = TRUE, hasNA = TRUE)]
   df[, (paste0("y_lag_rollsum_", roll)) := frollsum(y_lag_1, roll, na.rm = TRUE, hasNA = TRUE)]  
   df[, (paste0("y_lag_rollmax_", roll)) := frollapply(y_lag_1, roll, function(x) { max(x, na.rm = TRUE) })]
   df[, (paste0("y_lag_rollmin_", roll)) := frollapply(y_lag_1, roll, function(x) { min(x, na.rm = TRUE) })]    
   df[, (paste0("y_lag_rollvar_", roll)) := frollapply(y_lag_1, roll, function(x) { var(x, na.rm = TRUE) })]  
   
   # Short-term trend: last value relative to its recent mean
   df[, y_trend := y_lag_1 / y_lag_rollmean_3]                                                                 
   #df[, y_trend_12 := y_lag_1 / y_lag_12]        
   #df[, y_trend_norm := y_lag_1 / y_lag_rollmax_12]                                                                   
   
   return(df[])
   
}

ex_data = add.shift.revenu(data, lag = c(1:3), roll = c(3,12))

# See it 
head(data)
head(ex_data)
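
# Because add.shift.revenu() works on a copy(), `data` is untouched and the
# engineered columns live only in ex_data:
setdiff(names(ex_data), names(data))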

# Ensemble method ---------------------------------

trainData = mutate_all(data, as.numeric)
extrainData = mutate_all(ex_data, as.numeric)

# See available algorithms in caret
modelnames <- paste(names(getModelInfo()), collapse=',  ')
modelnames

# Set the seed for reproducibility
set.seed(100)

# Stacking Algorithms - Run multiple algos in one call.
# NB: preProcess belongs to train()/caretList(), not trainControl(); the
# predictors were already centered and scaled in load.dt()
trainControl = trainControl(
   method = "repeatedcv", 
   number = 5, 
   repeats = 3,
   savePredictions = TRUE
)

algorithmList = c('lm','rf','gbm')

set.seed(100)

models = caretList(
   df_consumption ~ ., 
   data = na.omit(trainData), 
   trControl = trainControl,
   methodList = algorithmList
) 

exmodels = caretList(
   df_consumption ~ ., 
   data = na.omit(extrainData), 
   trControl = trainControl,
   methodList = algorithmList
) 

results = resamples(models)
results = summary(results)$statistics$RMSE

exresults = resamples(exmodels)
exresults = summary(exresults)$statistics$RMSE
rownames(exresults) = paste("Extended", rownames(exresults), sep = " - ")

results = do.call("rbind", list(results, exresults))
results = rownames_to_column(as.data.frame(results), "Model")
results = janitor::clean_names(results)

ggplot(results, aes(x = reorder(model,-mean), y = mean)) + 
   geom_boxplot(
      aes(ymin = min, lower = x1st_qu, 
          middle = mean, upper = x3rd_qu,
          ymax = max),
      stat = "identity"
   ) + 
   coord_flip() + 
   theme_minimal() + 
   labs(x = NULL, y = "RMSE")
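
# caret can also run formal pairwise comparisons. Pooling the two caretList
# objects into a single resamples() call lets diff() contrast base vs
# extended RMSE (note: the two lists were resampled separately, so read the
# pairing loosely).
all_models = c(models, setNames(exmodels, paste0("ex_", names(exmodels))))
summary(diff(resamples(all_models)))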


# Ensemble the predictions of `models` into a combined prediction via GBM
stack.gbm = caretStack(
   models, 
   method = "gbm",
   metric = "RMSE", # caret metric names are case-sensitive
   trControl = trainControl
)

print(stack.gbm)

# Predict the held-out rows (those with a missing target)
stack_predicteds = predict(stack.gbm, newdata = trainData[is.na(trainData$df_consumption), ])
head(stack_predicteds)
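
# Hedged check (assumes the five held-out rows align with Yobs): score the
# stacked prediction against the held-out observations.
forecast::accuracy(as.numeric(stack_predicteds), Yobs)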

# VAR (drop the index column and the five held-out rows)
var_mdl = vars::VAR(trainData[-((nrow(trainData) - 4):nrow(trainData)), -1], type = "none") 

prd = predict(var_mdl, n.ahead = 5)
res = forecast::accuracy(prd$fcst$df_consumption[, 1], Yobs)

print(res)

# Calculate the impulse response of consumption to an income shock
ir = vars::irf(
   var_mdl, 
   impulse = "income",
   response = "df_consumption",
   n.ahead = 5,
   ortho = FALSE,
   cumulative = TRUE
)


# Plot
plot(ir)
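
# A quick complement to the IRF: test whether income Granger-causes the other
# series in the VAR (vars::causality ships with the same package).
vars::causality(var_mdl, cause = "income")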


# Autoregressive  --------------------------------------------------------
# Linear model ----------------------------------------------------------------
# Dynamic linear Models ------------------------------------------------------
# Neural Network -----------------------------------------------------------
# Gradient boosting method -------------------------------------------

# Remove constant columns and columns that are mostly missing/zero
drop_cols = c(
   names(which(sapply(data, function(v) is.numeric(v) && isTRUE(var(v, na.rm = TRUE) == 0)))),
   names(which(colSums(is.na(data) | data == 0) > .75 * nrow(data)))
)
cols = setdiff(names(data), drop_cols)
data = data[, ..cols]

# Inputs (keep the temporary index out of the feature set)
y = "df_consumption"
x = setdiff(colnames(data), c(y, "index"))
#cats = c("item_id", "shop_id", "item_category_id", "month", "year", "quarter")

# Splitting strategy
nfold = 5

# Output: rows with a missing target form the scoring set
submit = data[is.na(df_consumption)]
data = data[!is.na(df_consumption)]

# Initialize
train_index = list()
valid_index = list()

# replicability
set.seed(123)

# Index partition # TODO: switch to a sliding window weighted by time
for (i in seq_len(nfold)) {
   
   tps = sample(unique(data$index), floor(0.8 * length(unique(data$index))))  
   train_index[[i]] = data[index %in% tps, which = TRUE]
   valid_index[[i]] = data[!index %in% tps, which = TRUE]    
   
}

# Clean 
rm(tps)
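
# Sanity check: each fold should hold ~80% of the index values for training
sapply(train_index, length)
sapply(valid_index, length)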

# Initialize
weight = c()
preds = list() # renamed from `forecast` to avoid masking forecast::forecast

# Fit
for (i in seq_len(nfold)) {
   
   valid = lgb.Dataset(data.matrix(data[valid_index[[i]], ..x]), label = data[valid_index[[i]],][[y]]) # categorical_feature = cats
   dtrain = lgb.Dataset(data.matrix(data[train_index[[i]], ..x]), label = data[train_index[[i]],][[y]])
   
   fit = lgb.train(
      params = list(
         objective = "regression",
         metric = "rmse",
         learning_rate = 0.1
      ),
      data = dtrain, 
      nrounds = 100,
      valids = list(val = valid), # the argument is `valids`, not `valid`
      early_stopping_rounds = 10,
      verbose = 1 
   )
   
   cat("Fold", i, "best score:", fit$best_score, "at round:", fit$best_iter, "\n")  
   
   preds[[i]] = predict(fit, data.matrix(submit[, ..x])) 
   weight[i] = fit$best_score  
   
}

cat("Overall mean score :", mean(weight))

# Importance (from the last fold's model)
imp = lgb.importance(fit)
ggplot(head(imp[order(-Gain)], 20), aes(reorder(Feature, Gain), Gain)) +
   geom_col(fill = "steelblue") +
   xlab("Feature") +
   coord_flip() +
   theme_minimal() + 
   theme(axis.text.y = element_text(size = 12))

# Stacked method 
# -- Classification ------------------------------------------------------------

# Logistic regression ----------------------------------------------------------