R/archive/transys_branch_2.R

Defines functions .remove_eventlog_missing_values

# transys.R --------------------------------------------------------------------

# Header
# Description:   This module introduces a reference class named as TransitionSystem which only keeps the status changes.
#                Each case can have only one status at a time.
#                This model can also be used for processes which do not have concurrent activities.
# Author:        Nima Ramezani
# Email :        nima.ramezani@gmail.com
# Start Date:    08 November 2016
# Last Revision: 03 May 2022
# Version:       0.9.7

# Version   Date               Action
# -----------------------------------
# 0.0.1     08 November 2016   Initial issue
# 0.1.0     09 November 2016   Class TS.CASE modified: All methods defined within the class
# 0.2.0     15 November 2016   Queries added to SimpleBusinessProcess objects. case_IDs should be first set as a query, then call property methods to get various process properties
# 0.2.1     15 November 2016   Method getTimeAdjacency() added to class TS.CASE
# 0.3.0     15 November 2016   Major changes in class SimpleBusinessProcess: Property 'data' renamed to 'history', properties adjacency, visNetwork, igraph and all other process metrics moved to list property 'data'
# 0.3.1     15 November 2016   Method getTimeAdjacency() added to class SimpleBusinessProcess
# 0.4.0     18 November 2016   Thanks to dplyr, all the sorting, next status duration is computed in the first feed. Methods feed(), fillCases() modified!
# 0.4.1     08 February 2017   SimpleBusinessProcess renamed to TransitionSystem. TS.CASE renamed to TS.CASE
# 0.4.2     08 February 2017   Filename changed to transys.R
# 0.4.3     16 February 2017   Method getAdjacency() and getTimeAdjacency() removed from class TransitionSystem and replaced by method getAdjacencies()
# 0.4.4     16 February 2017   Method getStatusVolume() added. Returns an object of class TIME.SERIES
# 0.5.0     06 September 2017  Method feed.eventlog() modified: Argument add_ends added! If TRUE, statuses START and END will be added to the beginning and end of processes!
# 0.5.1     11 September 2017  Method feed.eventlog() modified: Columns paths and selected added
# 0.5.2     14 September 2017  Property query removed.
# 0.5.3     14 September 2017  Function computeFullAdjacencies() added
# 0.5.4     14 September 2017  Functions getOutlierCases() and applyCaseFilter() added
# 0.5.5     14 September 2017  Function getAdjacencies modified.
# 0.5.6     14 September 2017  Function getSimpleAdjacencies added.
# 0.6.0     30 June 2018       Fundamental changes: All methods removed and renamed.
# 0.6.1     02 July 2018       Function feed.eventlog() modified
# 0.6.3     03 July 2018       Function get.volumeIn(), get.volumeOut() and get.backlog() modified. Method compute.volumes.daily() removed
# 0.6.6     14 September 2018  Functions get.volumeIn(), get.volumeOut() and get.backlog() now can give with hourly timeseries.
# 0.6.7     12 October 2018    All plotting functions transferred to tsvis.R
# 0.7.0     23 October 2018    Properties 'modelStart' and 'modelEnd' added to class TransitionSystem, functions get.volumeIn() & get.volumeOut() modified accordingly.
# 0.7.2     26 October 2018    Method filter.case() modified: two filtering arguments added: startStatuses, endStatuses
# 0.7.3     06 November 2018   Generic function summary.TransitionSystem() modified: collapses multiple statuses in the summary string
# 0.7.4     24 February 2019   Function feed.eventlog() modified: convert dataset to data.frame to avoid causing error on tibble inputs
# 0.7.5     21 June 2019       Function TransitionSystem.summary() modified: A minor bug was fixed.
# 0.7.9     09 July 2019       Function TransitionSystem.simulate() added, methods get.mldata.tc(), get.time.status.volume() added, Some method names changed, property 'cases' embedded in 'tables' 
# 0.8.1     16 July 2019       Method filter.event() added, argument 'extra_col' added to feed.eventlog()
# 0.8.2     08 August 2019     Method simulate.TransitionSystem() modified
# 0.8.3     20 August 2019     Method simulate.TransitionSystem() modified: progressbar added.
# 0.8.4     20 August 2019     Method feed.eventlog() modified: remove_sst bug rectified: now updates startTime
# 0.8.5     09 September 2019  Method feed.eventlog() modified: adds new events to the existing history.
# 0.8.6     09 September 2019  Generic function simulate.TransitionSystem() transfered as method run.simulate().
# 0.8.7     26 August 2021     Argument freqThreshold renamed to freqRateCut.
# 0.8.9     17 March 2022      Method feed.eventlog() changed: uses internal function .remove_eventlog_messing_values() to remove missing values in critical columns
# 0.9.0     17 March 2022      Method reset() added.
# 0.9.1     18 March 2022      Method get.adjacency() renamed to get.transition_matrix().
# 0.9.2     18 March 2022      Method get.transition_matrix() modified: row associated with END status normalized.
# 0.9.3     18 March 2022      Method to_periodic() added
# 0.9.4     18 March 2022      Method random_walk() added
# 0.9.5     21 April 2022      TRANSYS renamed to TransitionSystem
# 0.9.6     26 April 2022      Minor bug in method to_periodic() fixed. (statuses were shuffled)
# 0.9.7     03 May 2022        Modifications in method run.simulation(): 1- Status "EXIT" is not generated, 2- filtering is transferred to histobj

# Good information for working with R functions:
# http://adv-r.had.co.nz/Functions.html

# fast Data Manipulation in R:
# https://www.analyticsvidhya.com/blog/2015/12/faster-data-manipulation-7-packages/
# fast Table Reshape:
# http://seananderson.ca/2013/10/19/reshape.html
# http://stackoverflow.com/questions/26536251/comparing-gather-tidyr-to-melt-reshape2

# Visualizing Markov-Chain Models:
# https://setosa.io/ev/markov-chains/


# Required Libraries:
library(dplyr)

empt = Sys.time()[-1]
empd = Sys.Date()[-1]


.remove_eventlog_missing_values = function(ds, column){
  tbd = c()
  if(column %in% colnames(ds)){
    tbd = is.na(ds[column]) %>% which
    warnif(length(tbd) > 0, sprintf('%s events removed due to missing values in column %s', length(tbd), column))
  }
  if(length(tbd) > 0){
    return(ds[-tbd, ])
  } else {
    return(ds)
  }
}

# This global variable is repeated in prom, can transfer to rutils
#' @export
timeUnitCoeff = c(hour = 3600, second = 1, minute = 60, day = 24*3600, week = 7*24*3600, year = 24*3600*365)

#' @title TransitionSystem: A reference class for modelling Transition Systems
#' @description Reference class containing some properties and methods required for analysing, modelling, simulating and visualising a Transition Sytem.
#' A Transistopn System is the more general case of a Markov Chain model descibing a system which can change status over time.
#'
#' @field history \code{data.frame} holding history data of status transitions
#' @field report \code{list} list of data.frames containing outputs of various query reports. These tables could be:
#' 
#' \code{nodes}: \code{data.frame} holding nodes of a transition system network. Nodes are equivalent to statuses.
#' \code{links}: \code{data.frame} holding links of a transition system network. Links(edges) are equivalent to transitions.
#'  
#' @export TransitionSystem
TransitionSystem = setRefClass('TransitionSystem',
                      fields = list(
                        modelStart    = 'POSIXct',
                        modelEnd      = 'POSIXct',
                        history       = 'data.frame',
                        settings      = 'list',
                        tables        = 'list',
                        report        = 'list',
                        metrics       = 'list',
                        plots         = 'list',
                        timeseries    = 'list',
                        timeseries.full  = 'list'
                      ),

                      methods = list(
                        
                        clear = function(){
                          report     <<- list()
                          plots      <<- list()
                          timeseries <<- list()
                          metrics    <<- list()
                        },

                        initialize = function(start = NULL, end = NULL, ...){
                          callSuper(...)
                          settings$include_case_measures <<- F
                          settings$filter <<- list(complete = NULL, minLoops = NULL, maxLoops = NULL, IDs = NULL, startStatuses = NULL, endStatuses = NULL, freqRateCut = NULL)

                          history <<- data.frame(caseID = character(), status = character(), nextStatus = character(), startTime = empt, endTime = empt, caseStart = logical(), caseEnd = logical(),
                                                 selected = logical(), startDate = empd, endDate = empd, creation = empt, duration = numeric(), eventAge = numeric(), path = character(),
                                                 stringsAsFactors = F)
                          tables$profile.case <<- data.frame(caseID = character(), firstStatus = character(), lastStatus = character(), startTime = empt, endTime = empt, caseStart = logical(), caseEnd = logical(), duration = numeric(),
                                                 stringsAsFactors = F)

                          modelStart <<- start %>% as.time
                          modelEnd   <<- end %>% as.time
                        },

                        # remove_sst: remove same status transitions
                        # caseStartFlag_col
                        # Since in most use cases, the sequence of transition events is a stream data, 
                        # the eventlog cannot cover the entire lifetime of all the cases.
                        # This means that every event-log data can cover a window in the timeline.
                        # There are always cases arrived/born/created before, within and after the window of coverage.
                        # So at the beginnig of the coverage window, we may have cases that have started before and already
                        # had some transitions. Similarly, there are cases that 
                        # will have transitions happening after the coverage window.
                        # Therefore, we need to provide information on how to introduce cases which 
                        # have started or ended within the coverage window.
                        # We can do this by flagging which cases start or end within the coverage window
                        # in columns of the input transition event log and 
                        # set the column name in arguments 'caseStartFlag_col' and 'caseEndFlag_col'

                        feed.eventlog = function(dataset, caseID_col = 'caseID', status_col = 'status', startTime_col = 'startTime', caseStartFlag_col = NULL, caseEndFlag_col = NULL, caseStartTags = NULL, caseEndTags = NULL, remove_sst = F, extra_col = NULL){
                          "
                          Feeds an eventlog as a data.frame or tibble to train the transition system model. \\cr

                          \\emph{Arguments:} \\cr \\cr
                          
                          \\code{dataset (data.frame)}: The input eventlog data table. \\cr

                          \\code{caseID_col (character)}: Header of the column in \\code{dataset} containing case IDs. \\cr

                          \\code{status_col (character)}: Header of the column in \\code{dataset} 
                            containing source status of the transition event. \\cr

                          \\code{startTime_col (character)}: Header of the column in \\code{dataset} 
                            containing time-stamp when transition starts. 
                            This is the time when the case has arrived into the source status. \\cr
                            
                          \\code{caseStartFlag_col (character)}: Header of the column specifying which cases 
                            are started/born/arrived within the Time-line Coverage Window (TCW). \\cr
                            
                          \\code{caseEndFlag_col (character)}: Header of the column specifying which cases 
                            are ended/dead/finished/completed within the Time-line Coverage Window (TCW). \\cr
                           
                          \\code{caseStartTags, caseEndTags (character)}: Alternatively, you can specify which statuses are starting/ending statuses.
                            If this is specified, cases starting/ending with statuses within the given lists, 
                            will be considered as cases started/ended within the timeline window of analysis.
                            If both \\code{caseStartFlag_col} and \\code{caseStartTags} are specified, \\code{caseStartTags} will be ignored 
                            (Similarly, for \\code{caseStartFlag_col} and \\code{caseStartTags}).  \\cr

                          \\code{remove_sst (logical)}: If set to TRUE, same status transitions will be removed.  \\cr
                          
                          \\code{extra_col (character)}: list of additional columns to be added to the transition history table \\cr
                          "
                          
                          # verifications
                          dataset %<>%
                            nameColumns(columns = list(caseID = caseID_col , status = status_col , startTime = startTime_col, caseStart = caseStartFlag_col, caseEnd = caseEndFlag_col),
                                        classes = list(caseID = 'character', status = 'character', startTime = 'POSIXct', caseStart = 'logical', caseEnd = 'logical')) %>%
                            mutate(selected = T)

                          # dataset$status = gsub("[[:space:]]", "", as.character(dataset$status))
                          for(column in c('caseID', 'status', 'startTime', 'caseStart', 'caseEnd')){
                            dataset %<>% .remove_eventlog_missing_values(column) 
                          }

                          support('dplyr')
                          dataset %<>% dplyr::mutate(nextStatus = status, endTime = startTime) %>%
                            # dplyr::distinct(caseID, startTime, .keep_all = TRUE) %>% # remove duplicated combination of caseID and startTime
                            dplyr::arrange(caseID, startTime) %>% column.shift.up(c('nextStatus', 'endTime'), keep.rows = T)

                          # Removing cases with no transition (all cases who remained in their first status until the end of the dataset are removed!)
                          dp = which(!duplicated(dataset$caseID))

                          if(is.null(dataset$caseStart)){
                            if(is.null(caseStartTags)){dataset$caseStart = T} else {
                              dataset %>% group_by(caseID) %>% summarise(starting_status = first(status)) %>% 
                                filter(starting_status %in% caseStartTags) %>% pull(caseID) -> startedcases
                              dataset$caseStart = dataset$caseID %in% startedcases
                            }
                          }

                          if(is.null(dataset$caseEnd)){
                            if(is.null(caseEndTags)){dataset$caseEnd = T} else {
                              dataset %>% group_by(caseID) %>% summarise(ending_status = last(status)) %>% 
                                filter(ending_status %in% caseEndTags) %>% pull(caseID) -> endedcases
                              dataset$caseEnd = dataset$caseID %in% endedcases}}
                          
                          dataset = dataset[c('caseID', 'status', 'nextStatus', 'startTime', 'endTime', 'caseStart', 'caseEnd', 'selected', extra_col)] %>% as.data.frame

                          # Add END/EXIT 
                          endindx = c(dp[-1] - 1, nrow(dataset))
                          dataset[endindx, 'nextStatus'] = ifelse(dataset[endindx, 'caseEnd'], 'END', 'EXIT')
                          dataset[endindx, 'endTime'] = dataset[endindx, 'startTime'] + 1

                          # Add START/ENTER 
                          rb           = dataset[dp,]
                          rb$endTime   = rb$startTime
                          rb$startTime = rb$startTime - 1
                          rb$nextStatus = rb$status
                          rb$status     = ifelse(rb$caseStart, 'START', 'ENTER')

                          dataset %<>% rbind(rb) %>% dplyr::arrange(caseID, startTime)
                          dp = which(!duplicated(dataset$caseID))
                          endindx = c(dp[-1] - 1, nrow(dataset))

                          dataset[dp, c('caseID', 'startTime', 'status', 'caseStart', 'caseEnd')] %>%
                            inner_join(dataset[endindx, c('caseID', 'endTime', 'nextStatus')], by = 'caseID') %>%
                            dplyr::select(caseID, firstStatus = status, lastStatus = nextStatus, startTime, endTime, caseStart, caseEnd) %>%
                            dplyr::mutate(duration = as.numeric(endTime - startTime), completed = caseStart & caseEnd, selected = T) %>% 
                            {rownames(.)<-.$caseID;.} -> tables.profile.case
                          
                          if(remove_sst){
                            dataset = dataset[dataset$status != dataset$nextStatus, ] 
                            dpinv   = duplicated(dataset$caseID) %>% which
                            dataset$startTime[dpinv] <- dataset$endTime[dpinv - 1]
                            # dp = which(!duplicated(dataset$caseID))
                          }

                          dataset %<>%
                            mutate(startDate = startTime %>% as_date,
                                   endDate   = endTime   %>% as_date)

                          tables.profile.status <- data.frame(status = as.character(dataset$status %U% dataset$nextStatus)) %>% {rownames(.)<-.$status;.}
                          report.caseIDs        <- tables.profile.case$caseID
                          report.statuses       <- tables.profile.status$status

                          cs = as.character(dataset$caseID)

                          dataset$creation   <- tables.profile.case[cs, 'startTime']
                          dataset$duration   <- as.numeric(dataset$endTime - dataset$startTime)
                          dataset$eventAge   <- as.numeric(dataset$startTime - dataset$creation)

                          dataset$selected = TRUE
                          dataset$path     = dataset$status %++% '-' %++% dataset$nextStatus
 
                          if(is.empty(history)){
                            history <<- dataset
                            tables$profile.case <<- tables.profile.case
                            report$caseIDs      <<- report.caseIDs
                            report$statuses     <<- report.statuses
                          } else {
                            history <<- dplyr::bind_rows(history, dataset)
                            tables$profile.case <<- dplyr::bind_rows(tables$profile.case, tables.profile.case) 
                            report$caseIDs      <<- c(report$caseIDs, report.caseIDs) %>% unique
                            report$statuses     <<- c(report$statuses, report.statuses) %>% unique
                          }
                          
                          if(is.empty(modelStart)){modelStart <<- min(history$startTime)} else {modelStart <<- min(history$startTime, modelStart)}
                          if(is.empty(modelEnd)){modelEnd     <<- max(history$endTime)} else {modelEnd <<- max(history$endTime, modelEnd)}

                          filter.case(complete = settings$filter$complete, minLoops = settings$filter$minLoops, maxLoops = settings$filter$maxLoops, IDs = settings$filter$IDs, startStatuses = settings$filter$startStatuses, endStatuses = settings$filter$endStatuses, freqRateCut = settings$filter$freqRateCut)
                        },

                        # tables in cases are always full and do not depend on the filtering
                        get.case.path = function(){
                          "Returns trace, path, DNA or status sequence of each case"
                          if(is.null(tables$profile.case$path)){
                            cat('\n', 'Aggregating cases to find paths ...')
                            history %>% dplyr::group_by(caseID) %>%
                              dplyr::summarise(path = paste(status, collapse = '-') %>% paste(last(nextStatus), sep = '-'), startStatus = status[2], endStatus = last(status), transCount = length(status), loops = sum(duplicated(status), na.rm = T)) %>%
                              # dplyr::summarise(minTime = min(startTime), maxTime = max(endTime), startFlag = caseStart[1], endFlag = caseEnd[1], ) %>%
                              # dplyr::mutate(duration2 = as.numeric(maxTime - minTime)) %>%
                              dplyr::ungroup() -> case.path
                            
                            tables$profile.case <<- tables$profile.case %>% left_join(case.path, by = 'caseID')
                            
                            cat('Done!', '\n')
                          }
                          return(tables$profile.case %>% select(caseID, path, loops, transCount, duration, completed))
                        },
                        
                        get.case.status.startTime = function(){
                          obj$history %>% reshape2::dcast(caseID ~ status, value.var = 'startTime', fun.aggregate = min) %>% column2Rownames('caseID') -> wide
                          for(i in sequence(ncol(wide))) {wide[,i] %<>% as.POSIXct(origin = '1970-01-01')}
                          return(wide)  
                        },

                        get.status.case.startTime = function(){
                          obj$history %>% reshape2::dcast(status ~ caseID, value.var = 'startTime', fun.aggregate = min) %>% column2Rownames('status') -> wide
                          for(i in sequence(ncol(wide))) {wide[,i] %<>% as.POSIXct(origin = '1970-01-01')}
                          return(wide)  
                        },
                        
                        # returns the Case Status Time (CST) matrix containing amount of time each case spent on each status. NA means the case never met the status
                        get.case.status.duration = function(){
                          "Returns a table containing duration of each case on each status"
                          if(is.null(report$case.status.duration)){
                            cat('\n', 'Aggregating cases to find status durations ...')
                            if(nrow(history) == 0){
                              report$case.status.duration <<- data.frame()
                            } else if (is.null(report$case.status.duration)){
                              history %>% dplyr::group_by(caseID, status) %>%
                                dplyr::summarise(duration = sum(duration, na.rm = T)) %>%
                                reshape2::dcast(caseID ~ status, value.var = 'duration') %>% column2Rownames('caseID') ->> report$case.status.duration
                            }
                            cat('Done!', '\n')
                          }
                          return(report$case.status.duration)
                        },

                        get.case.status.freq = function(){
                          if(nrow(history) == 0){
                            report$case.status.freq <<- data.frame()
                          } else if(is.null(report$case.status.freq)){
                            history %>% dplyr::filter(selected) %>% dplyr::group_by(caseID, status) %>%
                              dplyr::summarise(freq = length(duration)) %>%
                              reshape2::dcast(caseID ~ status, value.var = 'freq') %>% na2zero %>% column2Rownames('caseID') ->> report$case.status.freq
                          }
                          return(report$case.status.freq)
                        },

                        get.volume.in = function(period = c('daily', 'weekly', 'hourly', 'monthly', 'annual'), full = F, as_timeseries = F){
                          period = match.arg(period)
                          assert(period %in% c('daily', 'hourly')) # todo: write for other periods
                          switch(period,
                                 daily  = {
                                   if(full){null = is.null(timeseries.full$volin.daily)} else {null = is.null(timeseries$volin.daily)}
                                   if(null){
                                     # compute.volumes.daily(full = full)
                                     if(full){hist = history} else {hist = history %>% filter(selected)}
                                     if(is.empty(hist)){volin = new('TS.DAILY')} else {
                                       B.entr = hist %>% dplyr::group_by(startDate, status) %>% dplyr::summarise(length(caseID))

                                       names(B.entr) <- c('Date', 'Status', 'Entry')

                                       B.entr %<>% reshape2::dcast(Date ~ Status, sum, value.var = 'Entry')

                                       modelStartDate <- as.Date(modelStart %>% setTZ('GMT'))
                                       modelEndDate   <- as.Date(modelEnd %>% setTZ('GMT'))

                                       volin <- new('TS.DAILY', from = modelStartDate, until = modelEndDate)
                                       volin$feedData(B.entr, date_col = 'Date')
                                       volin$data %<>% na2zero
                                     }

                                     if(full) timeseries.full$volin.daily <<- volin
                                     else      timeseries$volin.daily     <<- volin
                                   }
                                   if(as_timeseries){if(full){return(timeseries.full$volin.daily)} else {return(timeseries$volin.daily)}}
                                   else             {if(full){return(timeseries.full$volin.daily$data)} else {return(timeseries$volin.daily$data)}}
                                 },
                                 hourly = {
                                   if(full){null = is.null(timeseries.full$volin.hourly)} else {null = is.null(timeseries$volin.hourly)}
                                   if(null){
                                     # compute.volumes.hourly(full = full)
                                     if(full){hist = history} else {hist = history %>% filter(selected)}
                                     if(is.empty(hist)){volin = new('TS.HOURLY')} else {
                                       B.entr = hist %>% mutate(startHour = cut(startTime, breaks = 'hour')) %>%
                                         dplyr::group_by(startHour, status) %>% dplyr::summarise(length(caseID)) %>% dplyr::ungroup()

                                       names(B.entr) <- c('Hour', 'Status', 'Entry')

                                       B.entr %<>% reshape2::dcast(Hour ~ Status, sum, value.var = 'Entry')

                                       B.entr %<>% dplyr::mutate(Hour = as.POSIXct(Hour))

                                       volin <- new('TS.HOURLY', from = modelStart, until = modelEnd)
                                       volin$feedData(B.entr, hour_col = 'Hour')
                                       volin$data %<>% na2zero
                                     }

                                     if(full) timeseries.full$volin.hourly <<- volin
                                     else      timeseries$volin.hourly     <<- volin
                                   }
                                   if(as_timeseries){if(full){return(timeseries.full$volin.hourly)} else {return(timeseries$volin.hourly)}}
                                   else             {if(full){return(timeseries.full$volin.hourly$data)} else {return(timeseries$volin.hourly$data)}}
                                 })
                        },

                        get.volume.out = function(period = c('daily', 'weekly', 'hourly', 'monthly', 'annual'), full = F, as_timeseries = F){
                          period = match.arg(period)
                          assert(period %in% c('daily', 'hourly')) # todo: write for other periods
                          switch(period,
                                 daily = {
                                   if(full){null = is.null(timeseries.full$volout.daily)} else {null = is.null(timeseries$volout.daily)}
                                   if(null){
                                     # compute.volumes.daily(full = full)
                                     if(full){hist = history} else {hist = history %>% filter(selected)}
                                     if(is.empty(hist)){volout <- new('TS.DAILY')} else {
                                       B.exit = hist %>% dplyr::group_by(endDate, status)   %>% dplyr::summarise(length(caseID))

                                       names(B.exit) <- c('Date', 'Status', 'Exit')

                                       B.exit %<>% reshape2::dcast(Date ~ Status, sum, value.var = 'Exit')

                                       modelStartDate <- as.Date(modelStart %>% setTZ('GMT'))
                                       modelEndDate   <- as.Date(modelEnd %>% setTZ('GMT'))

                                       volout <- new('TS.DAILY', from = modelStartDate, until = modelEndDate)
                                       volout$feedData(B.exit, date_col = 'Date')
                                       volout$data %<>% na2zero
                                     }

                                     if(full) timeseries.full$volout.daily <<- volout
                                     else      timeseries$volout.daily     <<- volout
                                   }
                                   if(as_timeseries){if(full){return(timeseries.full$volout.daily)} else {return(timeseries$volout.daily)}}
                                   else             {if(full){return(timeseries.full$volout.daily$data)} else {return(timeseries$volout.daily$data)}}
                                 },
                                 hourly = {
                                   if(full){null = is.null(timeseries.full$volout.hourly)} else {null = is.null(timeseries$volout.hourly)}
                                   if(null){
                                     # compute.volumes.hourly(full = full)
                                     if(full){hist = history} else {hist = history %>% filter(selected)}
                                     if(is.empty(hist)){volout <- new('TS.HOURLY')} else {
                                       B.exit = hist %>% dplyr::mutate(endHour = cut(endTime, breaks = 'hour')) %>%
                                         dplyr::group_by(endHour, status) %>% dplyr::summarise(length(caseID)) %>% dplyr::ungroup()

                                       names(B.exit) <- c('Hour', 'Status', 'Exit')

                                       B.exit %<>% reshape2::dcast(Hour ~ Status, sum, value.var = 'Exit')
                                       B.exit %<>% dplyr::mutate(Hour = as.POSIXct(Hour))

                                       volout <- new('TS.HOURLY', from = modelStart, until = modelEnd)
                                       volout$feedData(B.exit, hour_col = 'Hour')
                                       volout$data %<>% na2zero
                                     }

                                     if(full) timeseries.full$volout.hourly <<- volout
                                     else      timeseries$volout.hourly     <<- volout
                                   }
                                   if(as_timeseries){if(full){return(timeseries.full$volout.hourly)} else {return(timeseries$volout.hourly)}}
                                   else             {if(full){return(timeseries.full$volout.hourly$data)} else {return(timeseries$volout.hourly$data)}}
                                 })
                        },

                        get.backlog = function(period = c('daily', 'weekly', 'hourly', 'monthly', 'annual'), full = F, as_timeseries = F){
                          period = match.arg(period)
                          assert(period %in% c('daily', 'hourly')) # todo: write for other periods
                          switch(period,
                                 daily = {
                                   if(full){null = is.null(timeseries.full$backlog.daily)} else {null = is.null(timeseries$backlog.daily)}
                                   if(null){
                                     # compute.volumes.daily(full = full)

                                     volout = get.volumeOut(full = full) %>% as.data.frame %>% column2Rownames('date')
                                     volin  = get.volumeIn(full = full) %>% as.data.frame %>% column2Rownames('date')

                                     dt = rownames(volout) %U% rownames(volin)
                                     st = colnames(volout) %^% colnames(volin)

                                     BL = (volin[dt, st] %>% as.matrix) - (volout[dt, st] %>% na2zero %>% as.matrix)

                                     BL <- BL %>% cumulative

                                     # assert((sum(BL[nrow(BL),]) == 0) & (min(BL) >= 0), "Something goes wrong! Volume cannot be negative!", match.call()[[1]])

                                     BL %<>% as.data.frame %>% rownames2Column('Date')
                                     BL$Date %<>% as.Date
                                     firstDay <- min(BL$Date)
                                     lastDay  <- max(BL$Date)

                                     BL %<>% column.shift.up('Date', keep.rows = T)
                                     BL <- rbind(BL[1, ], BL)
                                     BL[1, -1] <- 0
                                     BL[1, 'Date'] <- firstDay
                                     BL[nrow(BL), 'Date'] <- lastDay + 1

                                     backlog <- new('TS.DAILY', from = min(BL$Date), until = max(BL$Date))
                                     backlog$feedData(BL, date_col = 'Date')

                                     if(full){
                                       timeseries.full$backlog.daily <<- backlog
                                     } else{
                                       timeseries$backlog.daily <<- backlog
                                     }

                                   }
                                   if(as_timeseries){if(full){return(timeseries.full$backlog.daily)} else {return(timeseries$backlog.daily)}}
                                   else             {if(full){return(timeseries.full$backlog.daily$data)} else {return(timeseries$backlog.daily$data)}}
                                 },
                                 hourly = {
                                   if(full){null = is.null(timeseries.full$backlog.hourly)} else {null = is.null(timeseries$backlog.hourly)}
                                   if(null){
                                     volout = get.volumeOut(full = full, period = period) %>% as.data.frame %>% column2Rownames('time')
                                     volin  = get.volumeIn(full = full, period = period) %>% as.data.frame %>% column2Rownames('time')

                                     dt = rownames(volout) %U% rownames(volin)
                                     st = colnames(volout) %^% colnames(volin)

                                     BL = (volin[dt, st] %>% as.matrix) - (volout[dt, st] %>% na2zero %>% as.matrix)

                                     BL <- BL %>% cumulative

                                     # assert((sum(BL[nrow(BL),]) == 0) & (min(BL) >= 0), "Something goes wrong! Volume cannot be negative!", match.call()[[1]])

                                     BL %<>% as.data.frame %>% rownames2Column('Hour')
                                     BL$Hour %<>% as.POSIXct

                                     firstHour <- min(BL$Hour)
                                     lastHour  <- max(BL$Hour)

                                     BL %<>% column.shift.up('Hour', keep.rows = T)
                                     BL <- rbind(BL[1, ], BL)
                                     BL[1, -1] <- 0
                                     BL[1, 'Hour'] <- firstHour
                                     BL[nrow(BL), 'Hour'] <- lastHour + 3600

                                     backlog <- new('TS.HOURLY', from = min(BL$Hour), until = max(BL$Hour))
                                     backlog$feedData(BL, hour_col = 'Hour')

                                     if(full){
                                       timeseries.full$backlog.hourly <<- backlog
                                     } else{
                                       timeseries$backlog.hourly <<- backlog
                                     }

                                   }
                                   if(as_timeseries){if(full){return(timeseries.full$backlog.hourly)} else {return(timeseries$backlog.hourly)}}
                                   else             {if(full){return(timeseries.full$backlog.hourly$data)} else {return(timeseries$backlog.hourly$data)}}
                                 })
                        },

                        get.metric = function(measure = c('freq', 'totComp', 'avgComp', 'totTT', 'avgTT', 'medTT', 'sdTT', 'totLoops', 'avgLoops', 'medLoops',  'totTrans', 'avgTrans'), time_unit = c('second', 'minute', 'hour', 'day', 'week', 'year')){
                          nontime   = c('freq', 'totLoops', 'avgLoops', 'medLoops', 'totComp', 'avgComp', 'totTrans', 'avgTrans')
                          measure   = match.arg(measure)
                          time_unit = match.arg(time_unit)
                          k         = chif(measure %in% nontime, as.integer(1), 1.0/timeUnitCoeff[time_unit])

                          if(is.null(metrics[[measure]])){
                            if(measure %in% c('totLoops', 'avgLoops', 'medLoops')){get.case.path()}
                            switch(measure,
                                   'freq' = {metricval = length(get.cases()) %>% as.integer},
                                   'totTT' = {metricval = tables$profile.case[get.cases(), 'duration'] %>% sum(na.rm = T)},
                                   'avgTT' = {metricval = tables$profile.case[get.cases(), 'duration'] %>% mean(na.rm = T)},
                                   'sdTT' = {metricval = tables$profile.case[get.cases(), 'duration'] %>% sd(na.rm = T)},
                                   'medTT' = {metricval = tables$profile.case[get.cases(), 'duration'] %>% median(na.rm = T)},
                                   'totLoops' = {metricval = tables$case.path[get.cases(), 'loops'] %>% sum(na.rm = T) %>% as.integer},
                                   'avgLoops' = {metricval = tables$case.path[get.cases(), 'loops'] %>% mean(na.rm = T)},
                                   'medLoops' = {metricval = tables$case.path[get.cases(), 'loops'] %>% median(na.rm = T) %>% as.integer},
                                   'totTrans' = {metricval = tables$case.path[get.cases(), 'transCount'] %>% sum(na.rm = T) %>% as.integer},
                                   'avgTrans' = {metricval = tables$case.path[get.cases(), 'transCount'] %>% mean(na.rm = T)},
                                   'totComp' = {metricval = (tables$profile.case[get.cases(), 'caseStart'] &  tables$profile.case[get.cases(), 'caseEnd']) %>% sum(na.rm = T) %>% as.integer},
                                   'avgComp' = {metricval = (tables$profile.case[get.cases(), 'caseStart'] &  tables$profile.case[get.cases(), 'caseEnd']) %>% mean(na.rm = T)}
                            )
                            metrics[[measure]] <<- metricval
                          }
                          return(round(k*metrics[[measure]], digits = 2))
                          # add: loops, completion percentage, most frequent longest status,    distribution of cases over their longest status, distribution of total time over status
                        },

                        # valid.link.weights = c('totFreq', 'meanCaseFreq', 'medCaseFreq', 'sdCaseFreq', 'totalTime', ...)
                        # totalFreq:    Total transition frequency
                        
                        get.nodes = function(full = F){
                          "
                          Returns the status profile which is a data frame containing features associated with the statuses of the transition system.
                          \\cr 
                          Each status stablish a node in the process map graph from which you can build a Markov-Chain (MC) 
                          model and use it to determine the steady state probabilities or run a memory-less process simulation.
                          \\cr 
                          If you run this method once, the output table will be accessible via reports$nodes.
                          If the function is called for the second time, it returns the same table as before if you do not reset the object.
                          \\cr \\cr 
                          \\emph{Arguments:} 
                          \\cr \\cr
                          * \\code{full (logical)}: If set to \\code{TRUE}, case filtering is ignored and all cases will be considered in computing the statistics.
                          \\cr  \\cr
                          \\emph{Output:} 
                          \\cr \\cr
                          \\code{(data.frame)}: Includes the following columns
                          (columns with keyword \\code{case} are generated only if \\code{settings$include_case_measures} is \\code{TRUE}):
                          \\cr   \\cr
                          \\code{totExitFreq}: Over all cases, total count of transitions from the status to other statuses.
                          \\cr 
                          \\code{totEntryFreq}: Over all cases, total count of transitions into the status from other statuses .
                          \\cr 
                          \\code{totalDuration}: Sum of duration(time) that cases spent in the status.
                          \\cr 
                          \\code{meanDuration}: Average duration(time) that each case spent in the status.
                          \\cr 
                          \\code{medDuration}: Median of duration(time) that cases spent in the status.
                          \\cr 
                          \\code{sdDuration}: Standard deviation of duration(time) that cases were in the status.
                          \\cr 
                          \\code{nCaseEntry}: How many cases have at least once entered into the status.
                          \\cr 
                          \\code{nCaseExit}: How many cases have at least once exited from the status.
                          \\cr 
                          \\code{meanExitCaseFrequency}: indicates how many times in average, each case has exited from the status
                          "
                          
                          if(full){
                            if(!is.empty(tables$nodes)){return(tables$nodes)} else {H = history}
                          } else {
                            if(!is.empty(report$nodes)){return(report$nodes)} else {H = history %>% dplyr::filter(selected)}
                          }

                          cat('\n', 'Aggregating nodes ...')

                          if(settings$include_case_measures){
                            # column "
                            # column "nCaseExit" ? 
                            # What percentage of cases have at least exited from this status?
                            A.node = H %>% dplyr::group_by(caseID, status) %>% 
                              dplyr::summarise(count = length(selected, na.rm = T), duration = sum(duration, na.rm = T))
                            B.node = A.node %>% dplyr::group_by(status) %>%
                              dplyr::summarise(nCaseExit = sum(count > 0), meanExitCaseFreq = mean(count, na.rm = T), medExitCaseFreq = median(count, na.rm = T)   , sdExitCaseFreq = sd(count, na.rm = T),
                                               meanCaseDuration = mean(duration, na.rm = T), medCaseDuration = median(duration, na.rm = T), sdCaseDuration = sd(duration, na.rm = T))
                          }

                          C.node = H %>% dplyr::group_by(status) %>%
                            dplyr::summarise(totalExitFreq = sum(selected, na.rm = T), totalDuration = sum(duration, na.rm = T),
                                             meanDuration = mean(duration, na.rm = T), medDuration = median(duration, na.rm = T), sdDuration = sd(duration, na.rm = T))

                          D.node = H %>% dplyr::group_by(nextStatus) %>% select(status = nextStatus, count = selected) %>%
                            dplyr::summarise(totalEntryFreq = sum(count, na.rm = T)) %>% full_join(C.node, by = 'status')

                          if(settings$include_case_measures){
                            D.node %<>% full_join(B.node, by = 'status')
                          }
                          nsel = sum(history$selected)
                          nrow = nrow(history)
                          if(full  | nsel == nrow){tables$nodes <<- D.node}
                          if(!full | nsel == nrow){report$nodes <<- D.node}
                          if(is.empty(D.node)){
                            data.frame(status = character(), totalEntryFreq = numeric(), totalExitFreq = numeric(), totalDuration = numeric(), meanDuration = numeric(), medDuration = numeric(), sdDuration = numeric(), stringsAsFactors = F) %>% as_tibble -> D.node
                          }
                          cat('Done!', '\n')
                          return(D.node)
                        },

                        get.links = function(full = F){
                          
                          "
                          Returns the transition profile which is a data frame containing features associated with the transitions of cases from one status to other.
                          \\cr 
                          Each transition stablish a link in the process map graph from which you can build a Markov-Chain (MC) 
                          model and use it to determine the steady state probabilities or run a memory-less process simulation.
                          \\cr 
                          If you run this method once, the output table will be accessible via \\code{reports$links}.
                          If the function is called for the second time, it returns the same table as before if you do not reset the object.
                          \\cr \\cr 
                          \\emph{Arguments:} \\cr \\cr
                          * \\code{full (logical)}: If set to \\code{TRUE}, case filtering is ignored and all cases will be considered in computing the statistics.
                          \\cr  \\cr
                          \\emph{Output:} 
                          \\cr \\cr
                          \\code{(data.frame)}: Includes the following columns
                          (columns with keyword \\code{case} are generated only if \\code{settings$include_case_measures} is \\code{TRUE}):
                          \\cr   \\cr

                          \\code{meanCaseFreq}: average transition count per case \\cr
                          \\code{medCaseFreq}:  median case transition frequency 
                          (half of cases have transition frequency higher than this value) \\cr
                          \\code{sdCaseFreq}: standard deviation of transition frequencies among cases \\cr
                          \\code{totalTime}: Total transition time \\cr
                          \\code{meanCaseTime}: average transition time per case  \\cr
                          \\code{medCaseTime}: median case transition time (half of cases have transition time higher than this value)  \\cr
                          \\code{sdCaseTime}: standard deviation of transition time among cases  \\cr
                          \\code{meanTime}: average: transition time per transition  \\cr
                          \\code{medTime}: median transition time (half of transitions have transition time higher than this value)  \\cr
                          \\code{sdTime}: standard deviation of transition time among transitions(half of transitions have transition time higher than this value)  \\cr
                          \\code{meanCaseEntryFreq}: how many times in average a case has entered this status  \\cr

                          "
                          
                          if(full){
                            if(!is.empty(tables$links)){return(tables$links)} else {H = history}
                          } else {
                            if(!is.empty(report$links)){return(report$links)} else {H = history %>% dplyr::filter(selected)}
                          }

                          cat('\n', 'Aggregating links ...')

                          if(settings$include_case_measures){
                            A.edge = H %>% dplyr::group_by(caseID, status, nextStatus) %>% dplyr::summarise(count = sum(selected, na.rm = T), duration = sum(duration, na.rm = T))
                            B.edge = A.edge %>% dplyr::group_by(status, nextStatus) %>%
                              dplyr::summarise(meanCaseFreq = mean(count, na.rm = T)   , medCaseFreq = median(count, na.rm = T)   , sdCaseFreq = sd(count, na.rm = T),
                                               meanCaseTime = mean(duration, na.rm = T), medCaseTime = median(duration, na.rm = T), sdCaseTime = sd(duration, na.rm = T))
                          }

                          C.edge = H %>% dplyr::group_by(status, nextStatus) %>%
                            dplyr::summarise(totalFreq = sum(selected, na.rm = T), totalTime = sum(duration, na.rm = T), meanTime = mean(duration, na.rm = T), medTime = median(duration, na.rm = T), sdTime = sd(duration, na.rm = T))

                          if(settings$include_case_measures){
                            C.edge %<>% full_join(B.edge, by = c('status', 'nextStatus'))
                          }
                          if(is.empty(C.edge)){
                            data.frame(status = character(), nextStatus = character(), totalFreq = numeric(), totalTime = numeric(), meanTime = numeric(), medTime = numeric(), sdTime = numeric(), stringsAsFactors = F) %>% as_tibble -> C.edge
                          }

                          nsel = sum(history$selected)
                          nrow = nrow(history)

                          if(full  | nsel == nrow){tables$links <<- C.edge}
                          if(!full | nsel == nrow){report$links <<- C.edge}
                          cat('Done!', '\n')
                          return(C.edge)
                        },

                        # Incomplete functions
                        get.metric.status = function(statusID, measure = c('freq', 'totTime', 'loopRate', 'caseRatio')){
                          measure = match.arg(measure)
                          switch(measure, 'freq' = {
                            CSF <- get.case.status.freq()
                            val = CSF[,statusID] %>% sum(na.rm = T) %>% as.integer
                          }, 'totTime' = {
                            CST <- get.case.status.duration()
                            val = CST[,statusID] %>% sum(na.rm = T)
                          }, 'loopRate' = {
                            CSF <- get.case.status.freq()
                            entry = CSF[, statusID] %>% na.omit %>% zero.omit
                            val = sum(entry > 1)/length(entry)
                          }, 'caseRatio' = {
                            CSF <- get.case.status.freq()
                            entry = CSF[, statusID]
                            val = sum(entry > 0)/length(entry)
                          })
                          return(val %>% round(digits = 2))
                        },

                        get.status.volumes = function(statusID, period = c('daily', 'hourly'), as_timeseries = T){
                          period = match.arg(period)
                          listnm = statusID %>% paste('volumes', period, sep = '.')
                          if(is.null(timeseries[[listnm]])){
                            vin   = get.volumeIn(as_timeseries = F, period = period) %>% pull(statusID)
                            vinc  = vin %>% cumulative
                            vout  = get.volumeOut(as_timeseries = F, period = period) %>% pull(statusID)
                            voutc = vout %>% cumulative
                            back  = get.backlog(as_timeseries = F, period = period) %>% pull(statusID)
                            dt    = get.backlog(as_timeseries = F, period = period) %>% pull(chif(period == 'daily', 'date', 'time'))

                            vol   = data.frame(time = dt, volumeIn = c(vin, NA), volumeOut = c(vout, NA), volumeInCum = c(vinc, NA), volumeOutCum = c(voutc, NA), backlog = back)
                            switch(period, 'daily' = {
                              volts = new('TS.DAILY', from = min(dt), until = max(dt))
                              volts$feedData(vol, date_col = 'time')
                            }, 'hourly' = {
                              volts = new('TS.HOURLY', from = min(dt), until = max(dt))
                              volts$feedData(vol, hour_col = 'time')
                            })
                            timeseries[[listnm]] <<- volts
                          }
                          if(as_timeseries){return(timeseries[[listnm]])} else {return(timeseries[[listnm]]$data)}
                        },

                        get.metric.case   = function(caseID, measure = c('freq', 'time'), aggregator = c('sum', 'mean', 'median', 'sd')){},
                        get.metric.trace  = function(traceID, measure = c('freq', 'time'), aggregator = c('sum', 'mean', 'median', 'sd')){},

                        get.time.status.volume = function(){
                          if(is.null(report$time.status.volume)){
                            hist = history %>% mutate(startTime = as.character(startTime), endTime = as.character(endTime))
                            hist %>% reshape2::dcast(startTime ~ status, fun.aggregate = length, value.var = 'caseID') %>% column2Rownames('startTime') %>% as.matrix -> vin
                            hist %>% reshape2::dcast(endTime ~ status,  fun.aggregate = length, value.var = 'caseID')  %>% column2Rownames('endTime') %>% as.matrix -> vout
                            times = sort(rownames(vin) %^% rownames(vout))
                            cols  = colnames(vin) %^% colnames(vout)
                            cumulative.sum(vin[times, cols] - vout[times, cols]) ->> report$time.status.volume
                          }
                          return(report$time.status.volume)
                        },
                        
                        get.transition_matrix = function(measure = c('rate', 'freq', 'time'), aggregator = c('sum', 'mean', 'median', 'sd'), remove_ends = F, full = F){
                          "
                          Returns the square \\href{https://en.wikipedia.org/wiki/Adjacency_matrix}{Adjecancy Matrix}
                          associated with the transition system. It returns the same information that method get.links() provides but in a square matrix. \\cr \\cr
                          
                          \\emph{Arguments:} \\cr \\cr
                          
                          \\code{measure (character)}: one of the three options: \\code{'rate'}, \\code{'freq'} or \\code{'time'}.
                          The default is \\code{'rate'}. Any other value will raise a value error. \\cr
                          - \\code{'rate'}: the returned matrix contains rates of transitions from one status to itself or another status as 
                          percentages of total cases in the origin status. \\cr 
                          For example, if \\emph{A} and \\emph{B} are two statuses in the transition system, 
                          the value of cell in row A and column B shows what fraction of cases transitioned from status A to B. \\cr
                          - \\code{'freq'}: the returned matrix contains raw frequencies of case transitions. \\cr
                          - \\code{'time'}: the returned matrix contains aggregated durations of case transitions. \\cr \\cr
                          
                          \\code{aggregator (character)}: Only used when \\code{measure} is set to \\code{'time'}.
                          Can be one of these options \\code{'sum', 'mean', 'median', 'sd'}
                          This specifies aggregator to aggregate transition times (durations). 
                          For example if \\code{measure = 'time'} and  \\code{aggregator = 'mean'}, 
                          the value of cell in row A and column B shows the average transition time from status A to B 
                          (average time cases spent in status A before migrating to status B) \\cr \\cr
                          
                          \\code{remove_ends (boolean)}: If set to \\code{TRUE}, then the rows and columns 
                          associated with the START, END, ENTER and EXIT statuses will be eliminated from the adjacency matrix.
                          "
                          
                          measure      = match.arg(measure)
                          aggregator   = match.arg(aggregator)
                          tabName      = 'adjacency' %>% paste(measure, aggregator, chif(full, 'full', ''), sep = '.')

                          if(is.null(report[[tabName]])){
                            timevar = c(sum = 'totalTime', mean = 'meanTime', median = 'medTime', sd = 'sdTime')
                            edges   = get.links(full = full)
                            if(is.empty(edges)){return(data.frame())}
                            E = edges %>%
                              reshape2::dcast(status ~ nextStatus, value.var = chif(measure %in% c('rate', 'freq'), 'totalFreq', timevar[aggregator])) %>%
                              column2Rownames('status') %>% na2zero

                            for (i in get.statuses(full = full) %-% colnames(E)){E[, i] <- 0}
                            for (i in get.statuses(full = full) %-% rownames(E)){E[i, ] <- 0}

                            to_remove = chif(remove_ends, c('START', 'END', 'ENTER', 'EXIT'), c())
                            E[get.statuses(full = full) %-% to_remove, get.statuses(full = full) %-% to_remove] ->> report[[tabName]]
                            if(measure == 'rate'){
                              rsms = rowSums(report[[tabName]])
                              report[[tabName]] <<- report[[tabName]] %>% apply(2, function(v) v/rsms)
                            }
                            report[[tabName]] <<- na2zero(report[[tabName]])
                            if('END' %in% rownames(report[[tabName]])){
                              report[[tabName]]['END', 'END'] <<- 1.0
                            }
                          }
                          return(report[[tabName]])
                        },

                        get.statuses = function(full = F){
                          if(full){return(tables$profile.status$status)}
                          else {
                            if(is.empty(report$statuses)){
                              report$statuses <<- history$status[history$selected] %U% history$nextStatus[history$selected]
                            }
                            return(report$statuses)
                          }
                        },

                        get.cases = function(full = F){
                          if(full){return(tables$profile.case$caseID)}
                          else {
                            if(is.empty(report$caseIDs)){
                              report$caseIDs <<- unique(history[history$selected, 'caseID'])
                            }
                            return(report$caseIDs)
                          }
                        },

                        get.case.status.time = function(){
                          "Returns entry time of each case to each status in a matrix"
                          if(is.null(report$case.status.time)){
                            history %>% filter(selected) %>% 
                              reshape2::dcast(caseID ~ status, value.var = 'startTime', fun.aggregate = min) %>% column2Rownames('caseID') -> wide
                            for(i in sequence(ncol(wide))) {wide[,i] %<>% as.POSIXct(origin = '1970-01-01')}
                            report$case.status.time <<- wide
                          }
                          return(report$case.status.time)
                        },
                        
                        get.status.case.time = function(){
                          "Returns entry time to each status for each case in a matrix (transpose of 'case.status.time')"
                          if(is.null(report$status.case.time)){
                            history %>% filter(selected) %>% 
                              reshape2::dcast(status ~ caseID, value.var = 'startTime', fun.aggregate = min) %>% 
                              column2Rownames('status') %>% numeric2time -> wide
                            # for(i in sequence(ncol(wide))) {wide[,i] %<>% as.POSIXct(origin = '1970-01-01')}
                            report$status.case.time <<- wide
                          }
                          return(report$status.case.time)
                        },
                          
                        get.traces = function(){
                          if(is.null(report$traces)){
                            casePath = get.case.path() %>% filter(caseID %in% get.cases())
                            cat('\n', 'Aggregating traces ...')
                            report$traces <<- casePath %>% dplyr::group_by(path) %>%
                              dplyr::summarise(freq = length(path), totTime = sum(duration, na.rm = T), avgTime = mean(duration, na.rm = T), medTime = median(duration, na.rm = T), sdTime = sd(duration, na.rm = T), complete = completed[1]) %>%
                              dplyr::ungroup() %>% dplyr::arrange(freq) %>%
                              dplyr::mutate(variation = 'Variation ' %++% rev(sequence(nrow(.))))
                            cat('Done!', '\n')
                          }
                          return(report$traces)
                        },
                        
                        get.longest_path = function(){
                          "
                          Returns the longest sequence observed
                          "
                          if(is.null(report$longest_path)){
                              get.traces() %>% pull(path) %>% strsplit('-') -> a
                              report$longest_path <<- a[[a %>% lapply(length) %>% unlist %>% order %>% tail(1)]]
                          }
                          return(report$longest_path)
                        },
                        
                        get.transition_probabilities = function(){
                          if(is.null(report$transition_probabilities)){
                            get.links() %>% 
                              select(status, nextStatus, totalFreq) %>%
                              arrange(status, nextStatus) %>%
                              group_by(status) %>%
                              mutate(cum_freq = cumsum(totalFreq)) %>%
                              mutate(prob = totalFreq/sum(totalFreq), cum_prob = cum_freq/sum(totalFreq)) %>% ungroup() %>%
                              select(status, nextStatus, prob, cum_prob) ->> report$transition_probabilities
                          }
                          return(report$transition_probabilities)
                        },
                        
                        feed.case_features = function(cf, caseID_col = 'caseID', features = NULL){
                          caseID_col %<>% verify('character', domain = colnames(cf), lengths = 1, default = 'caseID')
                          features %<>% verify('character', domain = colnames(cf), default = colnames(cf)) %-% c(caseID_col, 'caseID')
                          common_features = tables$profile.case %>% colnames %>% intersect(features)
                          warnif(length(common_features) > 0, 'These features are overwritten: ' %++% common_features %++% '\n')
                          tables$profile.case %>% column_drop(features) %>% 
                            left_join(cf[c(caseID_col, features)] %>% rename(caseID = caseID_col), by = 'caseID') ->> tables$profile.case
                        },

                        filter.reset = function(){
                          if(!is.empty(history)){history$selected <<- T}
                          if(!is.empty(tables$profile.transition)){tables$profile.transition$selected <<- T}
                          if(!is.empty(tables$profile.case)){tables$profile.case$selected <<- T}
                          clear()
                          report$caseIDs  <<- tables$profile.case$caseID
                          report$statuses <<- tables$profile.status$status
                          report$nodes    <<- tables$nodes
                          report$links    <<- tables$links
                          timeseries      <<- timeseries.full
                          settings$filter <<- list()
                        },
                        
                        filter.event = function(...){
                          keep = history$selected
                          history %>% select(-selected) %>% mutate(selected = ...) %>% 
                            {colnames(.)[ncol(.)]<-'selected';.} ->> history
                          
                          history$selected <<- history$selected & keep
                          
                          clear()
                        },

                        filter.case = function(complete = NULL, minLoops = NULL, maxLoops = NULL, statusDomain = NULL, startStatuses = NULL, endStatuses = NULL, IDs = NULL, freqRateCut = NULL, reset = T){
                          if(reset){filter.reset()}
                          
                          if(nrow(history) > 0){
                            if(!is.null(complete %>% verify('logical'))){
                              chosen = tables$profile.case$caseStart & tables$profile.case$caseEnd
                              if(!complete){chosen = !chosen}
                              report$caseIDs <<- get.cases() %^% rownames(tables$profile.case)[chosen]
                              history$selected <<- history$caseID %in% report$caseIDs
                              settings$filter$complete <<- complete
                            }

                            if(!is.null(minLoops) | !is.null(maxLoops)){
                              cp = get.case.path()
                              if(is.null(minLoops)){minLoops = min(cp$loops, na.rm = T)}
                              if(is.null(maxLoops)){maxLoops = max(cp$loops, na.rm = T)}
                              chosen = cp$loops >= minLoops & cp$loops <= maxLoops
                              report$caseIDs <<- get.cases() %^% rownames(cp)[chosen]
                              history$selected <<- history$caseID %in% report$caseIDs
                              settings$filter$minLoops <<- minLoops
                              settings$filter$maxLoops <<- maxLoops
                            }

                            if(!is.null(statusDomain)){
                              history$selected <<- history$selected & (history$status %in% statusDomain)
                              report$caseIDs   <<- history$caseID[history$selected] %>% unique
                              settings$filter$statusDomain <<- statusDomain
                            }

                            if(!is.null(startStatuses %>% verify('character'))){
                              cp      = get.case.path()
                              chosen  = cp$startStatus %in% startStatuses
                              report$caseIDs <<- get.cases() %^% rownames(cp)[chosen]
                              history$selected <<- history$caseID %in% report$caseIDs
                              settings$filter$startStatuses <<- startStatuses
                            }

                            if(!is.null(endStatuses %>% verify('character'))){
                              cp      = get.case.path()
                              chosen  = cp$endStatus %in% endStatuses
                              report$caseIDs <<- get.cases() %^% rownames(cp)[chosen]
                              history$selected <<- history$caseID %in% report$caseIDs
                              settings$filter$endStatuses <<- endStatuses
                            }

                            if(!is.null(IDs %>% verify('character'))){
                              report$caseIDs <<- report$caseIDs %^% IDs
                              settings$filter$IDs <<- IDs
                            }

                            history$selected <<- history$caseID %in% report$caseIDs
                            

                            if(!is.null(freqRateCut %>% verify(c('numeric','integer'), domain = c(0, 1), lengths = 1))){
                              adjcy = get.transition_matrix(measure = 'freq')
                              if(!is.empty(adjcy)){
                                adjcy %>% apply(1, vect.normalise) %>% t %>% as.data.frame %>% rownames2Column('status') %>%
                                  reshape2::melt(id.vars = 'status', variable.name = "nextStatus") %>% filter(value < 1 - freqRateCut) -> paths

                                histsel  <- history[history$selected, ]
                                histsel  <- histsel[histsel$path %in% (paths$status %++% '-' %++% paths$nextStatus),]
                                outliers <- histsel$caseID %>% unique
                                report$caseIDs <<- report$caseIDs %-% outliers
                                history$selected <<- history$caseID %in% report$caseIDs
                                settings$filter$freqRateCut <<- freqRateCut
                              }
                            }
                            
                            tables$profile.case$selected <<- tables$profile.case$caseID %in% report$caseIDs
                            if(!is.empty(tables$profile.transition)){tables$profile.transition$selected <<- tables$profile.transition$caseID %in% report$caseIDs}
                            
                            clear()
                          }
                        },
                        
                        filter.time = function(from = modelStart, to = modelEnd){
                          history$selected <<- history$selected & (history$startTime >= from) & (history$startTime <= to)
                        },
                        
                        reset = function(){
                          tables        <<- list()
                          report        <<- list()
                          metrics       <<- list()
                          plots         <<- list()
                          timeseries    <<- list()
                          timeseries.full  <<- list()
                        },
                        
                        random_walk = function(starting_status = 'START', num_steps = 1){
                          "
                          Runs a random walk analysis starting from a given status and returns a dataframe
                          containing probability distributions over statuses at each step
                          
                          \\emph{Arguments:} \\cr \\cr
                          
                          \\code{starting_status (character)}: must be one of statuses defined in the transition system.
                          The default is \\code{'START'}. \\cr

                          \\code{num_steps (integer)}: Number of steps in the random walk.

                          "
                          TM  = get.transition_matrix()
                          p0  = rep(0, nrow(TM))
                          assert(starting_status %in% rownames(TM), "Given 'starting_status' does not exist in the list of statuses")
                          p0[which(rownames(TM) == starting_status)] = 1.0

                          p = list(p0 %*% TM)
                          if (num_steps > 1){
                            for (i in 2:num_steps){
                              p[[i]] = p[[i-1]] %*% TM
                            }
                          }
                          
                          purrr::reduce(p, rbind) %>% as.data.frame
                        },
                        
                        to_periodic = function(period = 'day'){
                          defaultW <- getOption("warn")
                          options(warn = -1)
                          
                          status_decode = get.statuses()
                          status_code   = status_decode %>% length %>% sequence
                          names(status_code) <- get.statuses()
                          
                          history %>% 
                            filter(selected) %>% 
                            mutate(st_code = status_code[status], eventType = 'TRANS', attribute = 'status') %>% 
                            select(caseID, eventTime = startTime, eventType, attribute, value = st_code) %>% 
                            dfg_pack(period = period, sequential = T, event_funs = c(), var_funs = 'last') -> pack
                          
                          tsp = new("TransitionSystem")
                          pack$var_last %>% 
                            mutate(status = status_decode[TRANS_status_last]) %>% 
                            select(caseID, eventTime, status) %>% 
                            tsp$feed.eventlog(startTime_col = 'eventTime')
                          
                          options(warn = defaultW)
                          return(tsp)
                        },
                        
                        # Extracts a TransitionSystem object containing subset of the history within a specified time-line window 
                        extract_subset = function(time_from, time_to){
                          
                          cases_started_before = history %>% filter(startTime < time_from, caseStart) %>% pull(caseID)
                          cases_ended_after    = history %>% filter(startTime > time_to, caseEnd) %>% pull(caseID)
                          new_history = history %>% 
                            filter(startTime >= time_from, startTime <= time_to) %>% 
                            mutate(caseStart = caseStart & !(caseID %in% cases_started_before)) %>% 
                            mutate(caseEnd   = caseEnd & !(caseID %in% cases_ended_after))
                          new_obj  = new('TransitionSystem')
                          new_obj$settings = settings
                          new_obj$feed.eventlog(new_history, caseStartFlag_col = 'caseStart', caseEndFlag_col = 'caseEnd')
                          return(new_obj)
                        },
                        
                        

                        # Among the cases in the case list, returns IDs of cases who have ever been in the given 'status'
                        casesInStatus = function(status){
                          casedf  = history %>% filter(selected) %>% filter(status == status) %>% extract('caseID') %>% unique
                          return(casedf$caseID)
                        },

                        casesInTransition = function(source, target){
                          casedf  = history %>% filter(selected) %>% filter(path == source %++% '-' %++% target) %>% extract('caseID') %>% unique
                          return(casedf$caseID)
                        },
                        
                        # Argument 'time_generator' must be class function. This function should have three and only three inputs:
                        # 'histobj': an object of class TransitionSystem. 'histobj' will be passed to this function directly
                        # The output of this function must be a dataframe with added column 'pred_duration' containing 
                        # the estimated/predicted durations.
                        # all argument in ... will be passed to both event_generator and time_generator.
                        run.simulation = function(
                          start_dt, target_dt, new_starts = NULL, 
                          event_generator = markovchain_transition_classifier, 
                          time_generator = markovchain_transition_time_estimator, silent = T, ...){
                          
                          start_dt     <- lubridate::as_datetime(start_dt)
                          target_dt    <- lubridate::as_datetime(target_dt)
                          final_events <- tibble(caseID = character(), status = character(), startTime = empt, nextStatus = character(), nxtTrTime = empt)

                          if(is.null(new_starts)){
                            # keep the latest transition of each case started earlier than start_dt 
                            # which is not led to END. Simulation engine will create future transitions for these cases. 
                            current_backlog = history %>% filter(selected) %>% filter(startTime < start_dt) %>% as.tbl() %>%  
                              group_by(caseID) %>% filter(startTime == max(startTime)) %>% ungroup %>% distinct(caseID, .keep_all = T) %>% 
                              filter(nextStatus != 'END') %>% 
                              select(caseID, status, startTime) %>% arrange(caseID, startTime)
                            new_starts = history %>% filter(selected) %>% filter(startTime > start_dt, startTime < target_dt, status == 'START') %>% select(caseID, status, startTime) %>% as.tbl()
                          } else {
                            ## todo: why current_backlog becomes null?
                            ## todo: generate new_starts
                            current_backlog = NULL
                          }
                          
                          # historical_data <- history %>% filter(selected) %>% filter(startTime < start_dt) %>% as.tbl() %>%  select(caseID, status, startTime) %>% unique
                          # if(is.empty(historical_data)) return(final_events)
                          
                          histobj = .self$extract_subset(time_from = start_dt, time_to = target_dt)

                          # (why should we filter?)
                          # histobj$filter.reset()
                          histobj$clear()
                          
                          # todo: if new_starts is specified, should not exit the function if filtered history is empty
                          if(is.empty(histobj$history)) return(final_events)
                          
                          # start simulation: 
                          tracking <- rbind(current_backlog, new_starts)
                          if(!silent){cnt = 1}
                          
                          while(nrow(tracking) > 0){
                            tracking <- event_generator(tracking, histobj = histobj, ...) %>% 
                              time_generator(histobj = histobj, current_time = start_dt, new_events = final_events, ...) %>% 
                              mutate(nxtTrTime = startTime + pred_duration)
                            
                            final_events <- rbind(final_events, tracking)
                            # extract completed events and those that wont be completed before target_dt
                            # remove those rows from tracking and update
                            tracking %<>% 
                              filter(!(nextStatus == "END" | nxtTrTime > target_dt)) %>%
                              transmute(caseID, status = nextStatus, startTime = nxtTrTime)
                            
                            if(!silent){
                              if(nrow(tracking) > 0)
                                cat('iter: ', cnt, ' Time range from ', as.character(min(tracking$startTime)), ' to ', as.character(max(tracking$startTime)), '\n')
                              cnt = cnt + 1
                            }
                          }
                          if(!silent){cat('No more transitions (End of simulation)', '\n')}
                          
                          final_events %>% arrange(caseID, startTime)
                        },
                        
                        plot.process.map = function(obj, measure = c('rate', 'freq', 'time'), time_unit = c('second', 'minute', 'hour', 'day', 'week', 'year'), plotter = c('grviz', 'visNetwork'), config = NULL, remove_ends = F, ...){
                          plotter   = match.arg(plotter)
                          measure   = match.arg(measure)
                          time_unit = match.arg(time_unit)
                          nontime   = c('freq', 'rate')
                          plotname  = 'process_map' %>% paste(measure, plotter, chif(remove_ends, 'RE', ''), sep = '_')
                          if(!measure %in% nontime){plotname %<>% paste(time_unit, sep = '_')}
                          
                          if(is.null(plots[[plotname]])){
                            plots[[plotname]] <<- plot_process_map(.self, measure = measure, time_unit = time_unit, plotter = plotter, config = config, remove_ends = remove_ends, ...)
                          }
                          return(plots[[plotname]])
                        }
                      )
)


#' @export
summary.TransitionSystem = function(obj){
  
  ff = obj$settings$filter %>% lapply(function(x) paste(x, collapse = ',')) %>% unlist
  "Prcoess time range from: " %>% paste0(
    obj$modelStart %>% as.character,
    " until: ",
    obj$modelEnd %>% as.character,
    " with ",
    nrow(obj$history),
    " events (",
    sum(obj$history$selected, na.rm = T),
    " filtered) and ",
    obj$get.statuses(full = T) %>% length,
    " stasuses (",
    obj$get.statuses(full = F) %>% length,
    " filtered) impacting ",
    obj$get.cases(full = T) %>% length,
    " cases (",
    obj$get.cases(full = F) %>% length,
    " filtered). Filter settings: ",
    chif(is.empty(ff), 'No filtering applied', names(ff) %>% paste(":") %>% paste(ff, collapse = ' , ')))
  # "32,400 filtered) and 24 statuses (13 filtered) impacting 2,450 cases (83 filtered). Filter on completed cases with relative frequency threshold of 0.25 and loops range between 0 and 12"
}
genpack/promer documentation built on Jan. 26, 2025, 11:30 p.m.