tslabeler: Labeling Tool For Time Series Data

Documented in checkprep_dt_from_env default_tag_values get_dt_from_env

# Make sure data.table knows we know we're using it
.datatable.aware <- TRUE

# Prevent R CMD check from complaining about the use of pipe expressions
# standard data.table variables
if (getRversion() >= "2.15.1") {
        utils::globalVariables(c(":=", ".", ".I", ".N", ".SD"), utils::packageName())
        utils::globalVariables(c(".grp", ".anomaly", "ds", "value", ".tag"))
}

#' Get dataframe/datatable list from calling environment
#'
#' This returns a list of all dataframes / datatables in the calling environment.
#'
#' (Internal function)
#'
#' @return char vector of dt list
get_dt_from_env <- function() {
        dt_list <- ls(envir = .GlobalEnv)[sapply(ls(envir = .GlobalEnv),
                                                 function(t)
                                                         is.data.frame(get(t)))]
        if (length(dt_list) == 0) {
                dt_list <- list("None")
        }
        dt_list
}

#' Process dataframe selected from environment
#'
#' This internal function is responsible for pre-processing a selected df from
#' the env. The shiny app relies on this func to check all req of the df (cols,
#' col types, data quality etc).
#'
#' (Internal function)
#'
#' @param df_full (data.table/data.frame/tibble) raw dataframe from env
#' @param picker_select_datecol (char) name of respective column selected by user
#' @param picker_select_grpcol (char) name of respective column selected by user
#' @param picker_select_valuecol (char) name of respective column selected by user
#' @param picker_select_tagcol (char) name of respective column selected by user
#' @param picker_select_anomalycol (char) name of respective column selected by user
#'
#' @return list containing preprocessed DT & info about preprocessing steps
checkprep_dt_from_env <- function(df_full,
                                  picker_select_datecol,
                                  picker_select_grpcol,
                                  picker_select_valuecol,
                                  picker_select_tagcol,
                                  picker_select_anomalycol) {
        preprocess_steps <- list(
                "df_class" = list(
                        "initial_class" = NULL,
                        "convert_to_datatable" = FALSE,
                        "msg" = NULL
                ),
                "datetime_col" = list(
                        "initial_data_type_check_successful" = FALSE,
                        # Date or POSIXct
                        "conversion_attempted" = FALSE,
                        "conversion_successful" = FALSE,
                        "msg" = NULL
                ),
                "tag_col" = list(
                        "add_tag_col" = FALSE,
                        "count_NA" = 0,
                        "NA_replaced" = FALSE,
                        "msg" = NULL
                ),
                "anomaly_col" = list(
                        "add_anomaly_col" = FALSE,
                        "only_contains_T_or_F" = TRUE,
                        "msg" = NULL
                ),
                "other_col" = list(
                        "other_cols_exist" = FALSE,
                        "msg" = NULL
                ),
                "grp_col" = list(
                        "initial_data_type_check_successful" = FALSE,
                        "converted_to_char" = FALSE,
                        "msg" = NULL
                ),
                "value_col" = list(
                        "initial_data_type_check_successful" = FALSE,
                        "msg" = NULL
                ),
                "quality_checks" = list(
                        "anomaly1_tag_empty" = FALSE,
                        "anomaly1_tag_empty_msg" = NULL,
                        "anomaly0_tag_nonempty" = FALSE,
                        "anomaly0_tag_nonempty_msg" = NULL
                ),
                "col_list" = list(
                        "datecol" = NULL,
                        "grpcol" = NULL,
                        "valuecol" = NULL,
                        "tagcol" = NULL,
                        "anomalycol" = NULL,
                        "othercols" = NULL
                ),
                "metadata" = list("custom_tags" = NULL),
                "sorted_by_key_index" = FALSE,
                "go_nogo" = FALSE
        )
        
        # Class of input dataframe
        preprocess_steps$df_class$initial_class <- class(df_full)[1]
        
        if (!data.table::is.data.table(df_full)) {
                preprocess_steps$df_class$convert_to_datatable <- TRUE
                preprocess_steps$df_class$msg <-
                        "Converting input dataframe to data.table. On saving back to your environment, you will be returned an object of your original class (data.frame or tibble)"
                df_full <- data.table::as.data.table(df_full)
        } else {
                preprocess_steps$df_class$initial_class <- "data.table"
        }
        
        # .columns
        if (picker_select_anomalycol == "No anomaly column") {
                preprocess_steps$anomaly_col$add_anomaly_col <- TRUE
                preprocess_steps$anomaly_col$msg <- "Adding .anomaly column"
                picker_select_anomalycol <- ".anomaly"
                df_full[, .anomaly := FALSE]
        }
        
        if (picker_select_tagcol == "No tag column") {
                preprocess_steps$tag_col$add_tag_col <- TRUE
                preprocess_steps$tag_col$msg <- "Adding .tag column"
                picker_select_tagcol <- ".tag"
                df_full[, .tag := ""]
        }
        
        # Col data types
        ## Is the datecol of class Date or POSIXct?
        if (!any(lubridate::is.Date(df_full[, get(picker_select_datecol)]),
                 lubridate::is.POSIXct(df_full[, get(picker_select_datecol)]))) {
                preprocess_steps$datetime_col$initial_data_type_check_successful <-
                        FALSE
                
                # Attempting conversion to POSIXct
                df_full[, c(picker_select_datecol) := lubridate::ymd_hms(get(picker_select_datecol),
                                                                         truncated = 3,
                                                                         tz = "UTC")]
                
                if (any(is.na(df_full[, get(picker_select_datecol)]))) {
                        preprocess_steps$datetime_col$conversion_attempted <- TRUE
                        preprocess_steps$datetime_col$conversion_successful <- FALSE
                        preprocess_steps$datetime_col$msg <-
                                paste0(
                                        "Could not parse date-time column `",
                                        picker_select_datecol,
                                        "`. Format expected - For date-time: %Y-%m-%dT%H:%M:%SZ or similar. For date: %Y-%m-%d."
                                )
                } else {
                        preprocess_steps$datetime_col$conversion_attempted <- TRUE
                        preprocess_steps$datetime_col$conversion_successful <- TRUE
                        preprocess_steps$datetime_col$msg <-
                                paste0(
                                        "Date-time column`",
                                        picker_select_datecol,
                                        "` converted to POSIXct."
                                )
                }
                
        } else {
                preprocess_steps$datetime_col$initial_data_type_check_successful <-
                        TRUE
                preprocess_steps$datetime_col$msg <-
                        paste0("Date/Time column `",
                               picker_select_datecol,
                               "` checked: OK")
        }
        ## Grp Column
        if (picker_select_grpcol != "No group column") {
                if (class(df_full[, get(picker_select_grpcol)]) %in% c("character", "factor")) {
                        preprocess_steps$grp_col$initial_data_type_check_successful <- TRUE
                        preprocess_steps$grp_col$msg <-
                                paste0(
                                        "Class of group column `",
                                        picker_select_grpcol,
                                        "` checked: OK"
                                )
                } else {
                        preprocess_steps$grp_col$initial_data_type_check_successful <- FALSE
                        preprocess_steps$grp_col$msg <-
                                paste0(
                                        "Converting group column `",
                                        picker_select_grpcol,
                                        "` to class: character"
                                )
                        df_full[, c(picker_select_grpcol) := as.character(get(picker_select_grpcol))]
                }
                total_grps <-
                        df_full[, length(unique(get(picker_select_grpcol)))]
                grp_unique_list <- df_full[, unique(get(picker_select_grpcol))]
        } else {
                total_grps <- 0
                grp_unique_list <- NA
        }
        ## Value Column
        if (class(df_full[, get(picker_select_valuecol)]) != "numeric") {
                preprocess_steps$value_col$initial_data_type_check_successful <-
                        FALSE
                preprocess_steps$value_col$msg <-
                        paste0(
                                "Value column `",
                                picker_select_valuecol,
                                "` needs to be of class: numeric or integer"
                        )
        } else {
                preprocess_steps$value_col$initial_data_type_check_successful <-
                        TRUE
                preprocess_steps$value_col$msg <-
                        paste0("Value column `",
                               picker_select_valuecol,
                               "` checked: OK")
        }
        ## Remaining Cols, if any
        standard_cols <- c(picker_select_datecol,
                           picker_select_valuecol,
                           picker_select_tagcol,
                           picker_select_anomalycol)
        if(picker_select_grpcol != "No group column")
                standard_cols <- c(standard_cols, picker_select_grpcol)
        other_user_cols <- colnames(df_full)[!(colnames(df_full) %in% standard_cols)]
        if(length(other_user_cols)==0){
                preprocess_steps$other_col$other_cols_exist <- FALSE
                preprocess_steps$other_col$msg <- "No user columns found"
        } else {
                preprocess_steps$other_col$other_cols_exist <- TRUE
                preprocess_steps$other_col$msg <- paste0(length(other_user_cols),
                                                         " other user columns found. These will be preserved.")
        }
        
        # Data Quality Checks
        if (!all(df_full[, unique(get(picker_select_anomalycol))] %in% c(FALSE, TRUE))) {
                preprocess_steps$anomaly_col$only_contains_T_or_F <- FALSE
                preprocess_steps$anomaly_col$msg <-
                        paste0(
                                "Anomaly column `",
                                picker_select_anomalycol,
                                "` contains values not [FALSE, TRUE]"
                        )
        }
        
        # Test for NA values in tag. If present, replace by ""
        if ((count_ <-
             df_full[is.na(get(picker_select_tagcol)), .N]) != 0) {
                preprocess_steps$tag_col$count_NA <- count_
                preprocess_steps$tag_col$NA_replaced <- TRUE
                preprocess_steps$tag_col$msg <-
                        paste0(
                                "Found ",
                                count_,
                                " NA values in tag column `",
                                picker_select_tagcol,
                                "`. Replacing by empty string ('')"
                        )
                df_full[is.na(get(picker_select_tagcol)), c(picker_select_tagcol) := ""]
        }
        
        # Any anomaly=1 where tag = ""?
        count_ <-
                df_full[get(picker_select_anomalycol) == 1 &
                                get(picker_select_tagcol) == "", .N]
        if (count_ != 0) {
                preprocess_steps$quality_checks$anomaly1_tag_empty <- TRUE
                preprocess_steps$quality_checks$anomaly1_tag_empty_msg <-
                        paste0(
                                "Warning: Found ",
                                count_,
                                " rows where anomaly is TRUE, yet tag is empty ('')"
                        )
        }
        
        # Any anomaly=0 where tag != ""?
        count_ <-
                df_full[get(picker_select_anomalycol) == 0 &
                                get(picker_select_tagcol) != "", .N]
        if (count_ != 0) {
                preprocess_steps$quality_checks$anomaly0_tag_nonempty <- TRUE
                preprocess_steps$quality_checks$anomaly0_tag_nonempty_msg <-
                        paste0(
                                "Warning: Found ",
                                count_,
                                " rows where anomaly is FALSE, yet tag is not empty"
                        )
        }
        
        
        #Sort by key-index
        if (picker_select_grpcol == "No group column") {
                data.table::setkeyv(df_full, picker_select_datecol)
                preprocess_steps$sorted_by_key_index <- TRUE
        } else {
                data.table::setkeyv(df_full,
                                    c(picker_select_datecol, picker_select_grpcol))
                preprocess_steps$sorted_by_key_index <- TRUE
        }
        
        return_list <- list()
        
        # Process tag values
        tag_choices <- tag_values <- default_tag_values()
        tag_choices[tag_choices == ""] <- "remove tag"
        
        tags_in_file <- df_full[get(picker_select_anomalycol) == TRUE, unique(get(picker_select_tagcol))]
        
        custom_tags <- tags_in_file[!(tags_in_file %in% tag_choices)]
        
        if (length(custom_tags) > 0) {
                tag_values <- c(tag_values[tag_values != ""], custom_tags, "")
                tag_choices <- c(tag_choices[tag_choices != "remove tag"], custom_tags, "remove tag")
        }
        
        # Save everything to return back to caller
        ## Metadata
        preprocess_steps$metadata$custom_tags <- custom_tags
        preprocess_steps$metadata$tag_values <- tag_values
        preprocess_steps$metadata$tag_choices <- tag_choices
        preprocess_steps$metadata$total_pts <- df_full[, .N]
        preprocess_steps$metadata$total_grps <- total_grps
        preprocess_steps$metadata$grp_unique_list <- grp_unique_list
        preprocess_steps$metadata$count_existing_anomalies <-
                df_full[get(picker_select_anomalycol) == TRUE, .N]
        
        ## Data
        return_list$preprocessed_df <- df_full
        
        ## Final Go-NoGo
        if (all(
                any(
                        preprocess_steps$datetime_col$initial_data_type_check_successful,
                        preprocess_steps$datetime_col$conversion_successful
                ),
                preprocess_steps$anomaly_col$only_contains_T_or_F,
                preprocess_steps$value_col$initial_data_type_check_successful
        )) {
                preprocess_steps$go_nogo <- TRUE
        }
        
        ## Cols
        preprocess_steps$col_list$datecol <- picker_select_datecol 
        preprocess_steps$col_list$grpcol <- picker_select_grpcol 
        preprocess_steps$col_list$valuecol <- picker_select_valuecol 
        preprocess_steps$col_list$tagcol <- picker_select_tagcol 
        preprocess_steps$col_list$anomalycol <- picker_select_anomalycol 
        preprocess_steps$col_list$othercols <- other_user_cols 
        
        ## Process Steps
        return_list$preprocess_steps <- preprocess_steps

        return_list
}

#' Default tag values
#'
#' @return char vector of default tag values
default_tag_values <- function() {
        c(
                "spike",
                "trend-change",
                "level-shift",
                "variance-shift",
                "outlier",
                ""
        )
}