# process-prediction-workflow.R
# In processpredictR: Process Prediction

## ---- include = FALSE---------------------------------------------------------
# Chunk defaults: chunks are not evaluated unless they opt in via `eval`.
knitr::opts_chunk$set(
  eval = FALSE,
  cache = FALSE,
  comment = "#>",
  collapse = TRUE
)

## ----setup, message = F, eval = T---------------------------------------------
library(processpredictR)
library(bupaR)
library(ggplot2)
library(dplyr)
library(keras)
library(purrr)

## ----echo = F, eval = T, out.width = "60%", fig.align = "center"--------------
# Embed the framework overview image.
knitr::include_graphics(path = "framework.PNG")

## ---- eval = T----------------------------------------------------------------
# Build the examples for the "outcome" task from traffic_fines, then print them.
df <- traffic_fines %>%
  prepare_examples(task = "outcome")
df

## ---- eval = T----------------------------------------------------------------
# Fix the RNG seed, split the examples with split = 0.8, and preview the
# first five rows of each part.
set.seed(123)
split <- split_train_test(df, split = 0.8)
head(split$train_df, 5)
head(split$test_df, 5)

## ---- eval = T----------------------------------------------------------------
# Share of rows, and share of distinct case ids, that landed in the training set.
train_part <- split$train_df
nrow(train_part) / nrow(df)
n_distinct(train_part$case_id) / n_distinct(df$case_id)

## -----------------------------------------------------------------------------
#  model <- split$train_df %>% create_model(name = "my_model")
#  # pass arguments as ... that are applicable to keras::keras_model()
#  
#  model # is a list

## -----------------------------------------------------------------------------
#  model %>% names() # objects from a returned list

## -----------------------------------------------------------------------------
#  model$model$name # get the name of a model

## -----------------------------------------------------------------------------
#  model$model$non_trainable_variables # list of non-trainable parameters of a model

## -----------------------------------------------------------------------------
#  model %>% compile() # model compilation

## -----------------------------------------------------------------------------
#  hist <- fit(object = model, train_data = split$train_df, epochs = 5)

## -----------------------------------------------------------------------------
#  hist$params

## -----------------------------------------------------------------------------
#  hist$metrics

## -----------------------------------------------------------------------------
#  predictions <- model %>% predict(test_data = split$test_df,
#                                   output = "append") # default
#  predictions %>% head(5)

## -----------------------------------------------------------------------------
#  predictions %>% class

## -----------------------------------------------------------------------------
#  confusion_matrix(predictions)

## ---- out.width="100%", fig.width = 7-----------------------------------------
#  plot(predictions) +
#    theme(axis.text.x = element_text(angle = 90))

## ---- out.width="100%", fig.width = 7-----------------------------------------
#  knitr::include_graphics("confusion_matrix.PNG")

## -----------------------------------------------------------------------------
#  model %>% evaluate(split$test_df)

## -----------------------------------------------------------------------------
#  # preprocessed dataset with one-hot encoded categorical features
#  df_next_time <- traffic_fines %>%
#    group_by_case() %>%
#    mutate(month = lubridate::month(min(timestamp), label = TRUE)) %>%
#    ungroup_eventlog() %>%
#    prepare_examples(task = "next_time", features = "month") %>% split_train_test()
#  
#  

## -----------------------------------------------------------------------------
#  # the attributes of df are added or changed accordingly
#  
#  df_next_time$train_df %>% attr("features")

## -----------------------------------------------------------------------------
#  df_next_time$train_df %>% attr("hot_encoded_categorical_features")

## -----------------------------------------------------------------------------
#  df <- prepare_examples(traffic_fines, task = "next_activity") %>% split_train_test()
#  custom_model <- df$train_df %>% create_model(custom = TRUE, name = "my_custom_model")
#  custom_model

## -----------------------------------------------------------------------------
#  custom_model <- custom_model %>%
#    stack_layers(layer_dropout(rate = 0.1)) %>%
#    stack_layers(layer_dense(units = 64, activation = 'relu'))
#  custom_model

## -----------------------------------------------------------------------------
#  # this works too
#  custom_model %>%
#    stack_layers(layer_dropout(rate = 0.1), layer_dense(units = 64, activation = 'relu'))

## -----------------------------------------------------------------------------
#  new_outputs <- custom_model$model$output %>% # custom_model$model to access a model and $output to access the outputs of that model
#    keras::layer_dropout(rate = 0.1) %>%
#    keras::layer_dense(units = custom_model$num_outputs, activation = 'softmax')
#  
#  custom_model <- keras::keras_model(inputs = custom_model$model$input, outputs = new_outputs, name = "new_custom_model")
#  custom_model
#  

## -----------------------------------------------------------------------------
#  # class of the model
#  custom_model %>% class

## -----------------------------------------------------------------------------
#  # compile
#  compile(object=custom_model, optimizer = "adam",
#          loss = loss_sparse_categorical_crossentropy(),
#          metrics = metric_sparse_categorical_crossentropy())

## -----------------------------------------------------------------------------
#  # the trace of activities must be tokenized
#  tokens_train <- df$train_df %>% tokenize()
#  map(tokens_train, head) # the output of tokenize() is a list
#  
#  

## -----------------------------------------------------------------------------
#  # make sequences of equal length
#  x <- tokens_train$token_x %>% pad_sequences(maxlen = max_case_length(df$train_df), value = 0)
#  y <- tokens_train$token_y

## ---- eval=F------------------------------------------------------------------
#  # train
#  fit(object = custom_model, x, y, epochs = 10, batch_size = 10) # see also ?keras::fit.keras.engine.training.Model
#  
#  # predict
#  tokens_test <- df$test_df %>% tokenize()
#  x <- tokens_test$token_x %>% pad_sequences(maxlen = max_case_length(df$train_df), value = 0)
#  predict(custom_model, x)
#  
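#  # evaluate
#  # NOTE(review): unlike the predict step above, x is not passed through
#  # pad_sequences() here -- confirm whether evaluate() needs padded input.
#  tokens_test <- df$test_df %>% tokenize()
#  x <- tokens_test$token_x
#  # normalize by dividing y_test by the standard deviation of y_train
#  # NOTE(review): df was prepared with task = "next_activity"; scaling y by
#  # a standard deviation looks intended for a time-based task -- confirm.
#  y <- tokens_test$token_y / sd(tokens_train$token_y)
#  evaluate(custom_model, x, y)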