knitr::opts_chunk$set(echo = TRUE, message = F, warning = F, fig.width = 13, fig.height = 6, comment = "")

Importing Needed packages

library(tidyverse)
library(lubridate)
library(e1071) 
library(gridExtra)
library(janitor)
library(data.table)
library(funModeling)
library(kableExtra)
library(htmltools)
library(ISOweek)
library(Metrics)
library(tidymodels)
library(timetk)

Helper Functions

minmax_scaler <- function(x) {


   return( ( x - min( x ) )  / ( max(x) - min(x) ) ) 
}


robust_scaler <- function(x){

  return( ( x - quantile( x , 0.5) )  / ( quantile(x ,0.75) - quantile(x, 0.25) ) )

}


ml_error <-  function(model_name = "Linear Regression Model",model_predictions){
  MAE <- model_predictions %>%
    yardstick::mae( actual,predictions)

  MAPE <- model_predictions %>%
    yardstick::mape( actual,predictions)

  RMSE <- model_predictions %>%
    yardstick::rmse( actual, predictions)

  data.frame(Model_Name = model_name, MAE= round(MAE$.estimate,3), MAPE = round(MAPE$.estimate,3), RMSE = round(RMSE$.estimate, 3))  

} 


# TimeSeries cross validation function
cross_validation <- function(data_training, kfold , Model_name, model){

mae_list  <- c()
mape_list <- c()
rmse_list <- c() 

for (k in seq(kfold ,1)){
  print(paste("kfolds number", k))

  validation_start_date <- max(data_train$date) - as.difftime(k*6*7, units = "days")

  validation_end_date <- max(data_train$date) - as.difftime((k-1)*6*7, units = "days")

  data_training <- data_train %>% 
    filter(date < validation_start_date)

  data_validation <- data_train %>% 
    filter(date >= validation_start_date & date <= validation_end_date)

  lm_fit_cv <- model %>% fit(sales ~ . , data = data_training)

  lm_pred_cv <- lm_fit_cv %>% predict(data_validation) %>% 
    bind_cols(data_validation$sales) %>% 
    rename(predictions = ".pred", actual = "...2")

  lm_result_cv <- ml_error("Linear Regression Model",lm_pred_cv) 


    # store performance of each kfold iteration
  mae_list [k] <- unlist( lm_result_cv['MAE']   , use.names=FALSE) 
  mape_list[k] <- unlist( lm_result_cv['MAPE']  , use.names=FALSE )
  rmse_list[k] <- unlist( lm_result_cv['RMSE']  , use.names=FALSE)



  }

 return( tibble( Model_name = Model_name,
                 MAE = paste( round(mean(mae_list),2)," +/- ",   round(sd(mae_list),  2)),
                 MAPE = paste( round(mean(mape_list),2)," +/- ", round(sd(mape_list), 2)),
                 RMSE = paste( round(mean(rmse_list),2)," +/- ", round(sd(rmse_list), 2))) )  

}

Reading the data

# Dataset of Sales
df5 <- read_csv("/home/renato/repos/Rossmann/inst/Data/data_modelling.csv")

Manual Feature Selection

df6 <-  as_tibble(df5) %>% 
  select(-X1)
# Boruta selected features
cols_selected_boruta_full <- c("store","store_type","promo", "assortment", "competition_distance", "competition_open_since_month",
    "competition_open_since_year","promo2","promo2since_week","promo2since_year", "competition_time_month",
    "promo_time_week", "day_of_week_sin", "day_of_week_cos", "month_sin","month_cos", "day_sin", "day_cos",
    "week_of_year_sin", "week_of_year_cos", "sales", "date")

Machine Learning Modelling

df6 <- df6 %>% 
  select(cols_selected_boruta_full) %>% 
  mutate(sales = expm1(sales))


# Splitthe set in training and testing
data_split <- df6 %>%  time_series_split(assess = "6 weeks", cumulative = T)

# Dataset Train
data_train <- training(data_split)

# Dataset Test
data_test <- testing(data_split)

rec <- recipe(sales~.,data_train) 

# Selected metrics 
mt <- metric_set(yardstick::rmse, yardstick::mae, yardstick::mape)

Average Model

aux1 <- data_test %>% 
  select(store, sales)

aux2 <- data_test %>% 
  group_by(store) %>% 
  summarise(predictions = mean(sales))

avg_model <- aux1 %>% 
  left_join(aux2, by= "store") %>% 
  select(sales, predictions) %>% 
  rename(actual = "sales")

baseline_result <- ml_error("Average Model",avg_model)

baseline_result

Linear Regression Model

# Create Linear Regresion Model
#lm <-
#  linear_reg() %>% 
#  set_engine("lm") %>% 
#  set_mode("regression")

# Training Model
#lm_fit <- lm %>%
#  fit(sales ~ . , data = data_train)

# Preditions
#lm_pred <- lm_fit %>% 
#  predict(data_test) %>% 
#  bind_cols(data_test$sales) %>% 
#  rename(predictions = ".pred", actual = "...2")

# Evaluate
#lm_result <- ml_error("Linear Regression Model",lm_pred)

# Save pickle of results
#saveRDS(lm_result,"Resultados_Modelos/lm_result.rds")

lm_result <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/lm_result.rds")

lm_result %>% 
  kable() %>% 
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condesend", "responsive"), html_font = "Cambria")

Linear Regression Model - Cross Validation

#lm_result_cv <- cross_validation(data_training , 5, "Linear Regression Model Cross Validation", lm)

# Save pickle of results
#saveRDS(lm_result_cv,"Resultados_Modelos/lm_result_cv.rds")

lm_result_cv <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/lm_result_cv.rds")

lm_result_cv

Random Forest Model

# Create Non-linear Model RandomForest
#rf <-
#  rand_forest(trees = 100) %>% 
#  set_engine("ranger") %>% 
#  set_mode("regression")

# Training Model
#rf_fit <- rf %>%
#  fit(sales ~ ., data = data_train)


# Preditions
#rf_pred <- rf_fit %>% 
#   predict(data_test) %>% 
#  bind_cols(data_test$sales) %>% 
#  rename(predictions = ".pred", actual = "...2")


#rf_result <- ml_error("Random Forest Model",rf_pred)  

# Save pickle of results
#saveRDS(rf_result,"Resultados_Modelos/rf_result.rds")

rf_result <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/rf_result.rds")

rf_result %>% 
  kable() %>% 
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condesend", "responsive"), html_font = "Cambria")

Random Forest Model - Cross Validation

#rf_result_cv <- cross_validation(data_training , 5, "Random Forest Model Cross Validation", rf)

# Save pickle of results
#saveRDS(rf_result_cv,"Resultados_Modelos/rf_result_cv.rds")

rf_result_cv <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/rf_result_cv.rds")

rf_result_cv

Xgboosting Model

# Create Model
#xg <-
#  boost_tree(trees = 100) %>% 
#  set_engine("xgboost") %>% 
#  set_mode("regression")

# Training Model
#xg_fit <- xg %>%
#  fit(sales ~ ., data = data_train)


# Preditions
#xg_pred <- xg_fit %>% 
#  predict(data_test) %>% 
#  bind_cols(data_test$sales) %>% 
#  rename(predictions = ".pred", actual = "...2")

xg_result <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/xg_result.rds")

# Evaluate
#xg_result <- ml_error("Xgboosting Model",xg_pred)

# Save pickle of results
#saveRDS(xg_result,"Resultados_Modelos/xg_result.rds")

xg_result %>% 
  kable() %>% 
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condesend", "responsive"), html_font = "Cambria")
result_xg_cv <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/result_xg_cv.rds")

#result_xg_cv <- cross_validation(data_training , 5, "Xgboosting Model Cross Validation", xg)

# Save pickle of results
#saveRDS(result_xg_cv,"Resultados_Modelos/result_xg_cv.rds")

result_xg_cv

Comparing Model Performances

Single Performances

bind_rows(baseline_result, lm_result, rf_result, xg_result) %>% 
  arrange(RMSE) %>% 
  kable() %>% 
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condesend", "responsive"), html_font = "Cambria")

Real Performance - Cross Validation

bind_rows( lm_result_cv, rf_result_cv, result_xg_cv) %>% 
  arrange(RMSE) %>% 
  kable() %>% 
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condesend", "responsive"), html_font = "Cambria")


rsprojeto/Rossmann documentation built on Feb. 7, 2021, 5:32 a.m.