# Global knitr chunk options for this vignette: prefix printed output with
# "#>" and collapse each chunk's source and output into a single block.
knitr::opts_chunk$set(
  comment = "#>",
  collapse = TRUE
)
This vignette shows how to replicate the experiments with the synthetic datasets that are described in the paper 'Detection of Batch Activities from Event Logs'. The synthetic datasets are included in this R package. First load the R package:
library(bamalog)
Then, we need to load the CSV file from the package's `inst/extdata` folder:
library(readr)

# Locate the synthetic log that ships with the package. The package name must
# match the one attached with library(bamalog); `mustWork = TRUE` makes a
# missing file fail here with a clear error instead of handing an empty path
# to the reader.
csv_path <- system.file(
  "extdata",
  "ANDGatewayVar1_conccasebased_processBatchActivityStats.csv",
  package = "bamalog",
  mustWork = TRUE
)

# Read the log with read_csv2(), as the file is expected to be
# semicolon-separated.
csv_log <- read_csv2(csv_path)
Some pre-processing is necessary because of the format in which the simulation tool provides the CSV files:
library(dplyr)

# Edit csv_log arrival timestamps (for technical reasons of the simulator):
# wherever a natural arrival time is available, it replaces the reported
# arrival time. Rows without a natural arrival time keep their arrival value.
if (!("PARALLEL" %in% unique(csv_log$Batch.Type))) {
  index <- !is.na(csv_log$Natural.Arrival.Time)
  csv_log$Arrival[index] <- csv_log$Natural.Arrival.Time[index]
  remove(index)
}
csv_log$Natural.Arrival.Time <- NULL

# Edit Batch.Number when different batch subprocesses are included in the
# process: batch numbers of "BatchSubProcess2" are shifted by 0.5.
# `%in%` is used instead of `==` so that missing Cluster.Id values yield FALSE
# rather than NA (NA subscripts are not allowed in subscripted assignments).
if ("BatchSubProcess2" %in% csv_log$Cluster.Id) {
  index <- csv_log$Cluster.Id %in% "BatchSubProcess2"
  csv_log$Batch.Number[index] <- csv_log$Batch.Number[index] + 0.5
  remove(index)
}

task_log <- csv_log
task_log$Cluster.Id <- NULL

# Rename columns (positional: relies on the column order of the CSV file)
names(task_log) <- c(
  "case_id", "activity", "arrival", "start", "complete", "resource",
  "input_batch_number", "input_batch_type"
)

# Remove "Batch Activity" activities
task_log <- task_log %>%
  filter(activity != "Batch Activity")

# Remove process instances included in the last batch as this batch might be
# incomplete (and, hence, the artificial data might inappropriately be marked
# as a batch)
instances_to_remove <- task_log %>%
  filter(input_batch_number == max(input_batch_number, na.rm = TRUE))
instances_to_remove <- unique(instances_to_remove$case_id)
task_log <- task_log %>%
  filter(!(case_id %in% instances_to_remove))
remove(instances_to_remove)

# Add instance_id (activity instance identifier) for comparison purposes later.
# seq_len() is safe for an empty log, where seq(1, 0) would yield c(1, 0).
task_log$instance_id <- seq_len(nrow(task_log))

# Save activity log with batch numbers and types for evaluation purposes
evaluation_log <- task_log

# Remove redundant columns for batch mining purposes
task_log <- task_log %>%
  select(case_id, activity, arrival, start, complete, resource, instance_id)
Now directly call the main batch mining algorithm:
# Create seq_tolerated_gap_list (gap of 0 seconds is allowed)
seq_tolerated_gap_list <- seq_tolerated_gap_list_generator(
  task_log = task_log,
  seq_tolerated_gap_value = 0
)

subsequence_list <- enumerate_subsequences(task_log, 0)
# Use the following line for using frequent sequence mining instead
# subsequence_list <- identify_frequent_sequences(task_log, 0)

# Detect batching behavior
result_log <- detect_batching(
  task_log = task_log,
  act_seq_tolerated_gap_list = seq_tolerated_gap_list,
  timestamp_format = "yyyy-mm-dd hh:mm:ss",
  numeric_timestamps = FALSE,
  log_and_model_based = TRUE,
  subsequence_list = subsequence_list,
  subsequence_type = "enum",
  # use `mine` to use frequent sequence mining
  # subsequence_type = "mine",
  within_case_seq_tolerated_gap = 0,
  between_cases_seq_tolerated_gap = 0,
  # Spelled out as FALSE: the shorthand `F` is an ordinary variable that can
  # be reassigned.
  show_progress = FALSE
)
Inspect the resulting event log with batch annotations:
library(kableExtra)

# Render the first 25 rows of the annotated event log as a table
result_log %>%
  head(n = 25) %>%
  kable()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.