R/helpers.R

Defines functions report.outliers report.no.outliers check.is.model.obj check.outliers.print check.nthreads discard.input.data split.types.new get.cols.ordered split.types check.is.df

check.is.df <- function(df) {
    if (inherits(df, c("tibble", "data.table")) || length(class(df)) > 1L)
        df <- as.data.frame(df) 
    if (!is.data.frame(df)) { stop("Input data must be a data.frame.") }
    if (!NCOL(df)) { stop("Input data frame has no columns.")}
    if (!NROW(df)) { stop("Input data frame has no rows.")   }
    return(df)
}

split.types <- function(df, cols_ignore = NULL, nthreads = 1) {
    
    ### validate input data
    df <- check.is.df(df)
    if (NROW(df) < 25)  { stop("Input data has too few row.") }
    supported_types <- c("numeric", "integer", "character", "factor", "ordered", "logical", "Date", "POSIXct")
    coltypes_df     <- Reduce(c, lapply(df, class))
    if (length(setdiff(unique(coltypes_df), c(supported_types, "POSIXt"))) > 0) {
        stop(paste0("Input data can only have the collowing column types: ",
                    paste(supported_types, collapse = ", "),
                    " - got passed the following: ",
                    paste(setdiff(unique(coltypes_df), supported_types), collapse = ", ")))
    }
    if (!is.null(cols_ignore)) {
        if ("character" %in% class(cols_ignore)) {
            if (length(setdiff(cols_ignore, names(df)))) {
                stop(paste0("'cols_ignore' contains names not present in 'df' - head: ",
                            paste(head(setdiff(cols_ignore, names(df)), 3), collapse = ", ")))
            }
        } else if ("logical" %in% class(cols_ignore)) {
            if (length(cols_ignore) != NCOL(df)) {
                stop("'cols_ignore' must have one entry per column of the input data frame.")
            }
        } else {
            stop(paste0("'cols_ignore' must be either a vector with column names to ignore, ",
                        "or a logical (boolean) vector indicating which columns to ignore"))
        }
    }
    
    ### initialize output object
    outp <- list()
    
    ### split by column type
    all_cols <- names(df)
    cols_numeric <- all_cols[sapply(df, function(x) "numeric"   %in% class(x))]
    cols_integer <- all_cols[sapply(df, function(x) "integer"   %in% class(x))]
    cols_boolean <- all_cols[sapply(df, function(x) "logical"   %in% class(x))]
    cols_factor  <- all_cols[sapply(df, function(x) "factor"    %in% class(x))]
    cols_ord     <- all_cols[sapply(df, function(x) "ordered"   %in% class(x))]
    cols_txt     <- all_cols[sapply(df, function(x) "character" %in% class(x))]
    cols_date    <- all_cols[sapply(df, function(x) "Date"      %in% class(x))]
    cols_ts      <- all_cols[sapply(df, function(x) "POSIXct"   %in% class(x))]
    
    outp$cols_num   <- c(cols_numeric, cols_integer)
    outp$cols_cat   <- c(cols_factor,  cols_txt)
    outp$cols_ord   <- cols_ord
    outp$cols_bool  <- cols_boolean
    outp$cols_date  <- cols_date
    outp$cols_ts    <- cols_ts
    outp$date_min   <- as.numeric()
    outp$ts_min     <- as.numeric()
    outp$cat_levels <- list()
    outp$ord_levels <- list()
    
    if (is.null(outp$cols_num))  { outp$cols_num  <- as.character() }
    if (is.null(outp$cols_cat))  { outp$cols_cat  <- as.character() }
    if (is.null(outp$cols_ord))  { outp$cols_ord  <- as.character() }
    if (is.null(outp$cols_bool)) { outp$cols_bool <- as.character() }
    if (is.null(outp$cols_date)) { outp$cols_date <- as.character() }
    if (is.null(outp$cols_ts))   { outp$cols_ts   <- as.character() }
    
    if (NROW(outp$cols_cat) && NROW(outp$cols_ord)) {
        outp$cols_cat <- setdiff(outp$cols_cat, outp$cols_ord)
    }
    
    if (NROW(cols_date)) {
        df[, cols_date] <- as.data.frame(lapply(df[, cols_date, drop = FALSE], as.numeric))
        outp$date_min   <- sapply(df[, cols_date, drop = FALSE], min, na.rm = TRUE)
        df[, cols_date] <- sweep(df[, cols_date, drop = FALSE], 2, outp$date_min - 1, "-")
        ## the extra 1 is for the way in which package applies log transforms
    }
    
    if (NROW(cols_ts)) {
        df[, cols_ts] <- as.data.frame(lapply(df[, cols_ts, drop = FALSE], as.numeric))
        outp$ts_min   <- sapply(df[, cols_ts, drop = FALSE], min, na.rm = TRUE)
        df[, cols_ts] <- sweep(df[, cols_ts, drop = FALSE], 2, outp$ts_min - 1, "-")
    }
    
    if (NROW(outp$cols_cat)) {
        df[, outp$cols_cat]  <- as.data.frame(lapply(df[, outp$cols_cat, drop = FALSE], factor))
        outp$cat_levels      <- lapply(df[, outp$cols_cat, drop = FALSE], levels)
        df[, outp$cols_cat]  <- as.data.frame(lapply(df[, outp$cols_cat, drop = FALSE],
                                                     function(x) ifelse(is.na(x), -1L, as.integer(x) - 1L))
                                              )
    }
    
    if (NROW(outp$cols_ord)) {
        outp$ord_levels     <- lapply(df[, outp$cols_ord, drop = FALSE], levels)
        df[, outp$cols_ord] <- as.data.frame(lapply(df[, outp$cols_ord, drop = FALSE],
                                                    function(x) ifelse(is.na(x), -1L, as.integer(x) - 1L))
                                             )
        ### check that they have at least 3 levels
        min_ord_levs = min(sapply(outp$ord_levels, length))
        if (min_ord_levs < 3) { stop("Ordinal columns must have at least 3 levels.") }
    }
    
    if (NROW(outp$cols_bool)) {
        df[, outp$cols_bool] <- as.data.frame(lapply(df[, outp$cols_bool, drop = FALSE],
                                                     function(x) ifelse(is.na(x), -1L, as.integer(x)))
                                              )
    }
    
    outp$arr_num  <- as.numeric()
    outp$arr_cat  <- as.integer()
    outp$arr_ord  <- as.integer()
    outp$ncol_num <- 0
    outp$ncol_cat <- 0
    outp$ncol_ord <- 0
    outp$nrow     <- NROW(df)
    outp$ncat     <- as.integer()
    outp$ncat_ord <- as.integer()
    outp$cols_ign <- as.logical()
    
    if (NROW(outp$cols_num) || NROW(outp$cols_date) || NROW(outp$cols_ts)) {
        outp$arr_num  <- as.numeric(as.matrix(df[, c(outp$cols_num, outp$cols_date, outp$cols_ts), drop = FALSE]))
        outp$ncol_num <- length(c(outp$cols_num, outp$cols_date, outp$cols_ts))
        
        ### check that they are not binary
        too_few_vals <- check_few_values(outp$arr_num, outp$nrow, outp$ncol_num, nthreads)
        if (any(too_few_vals)) {
            warning(paste0("Passed numeric columns with less than 3 different values - head: ",
                           paste(c(outp$cols_num, outp$cols_date, outp$cols_ts)[too_few_vals], collapse = ", ")))
        }

    }
    
    if (NROW(outp$cols_cat) || NROW(outp$cols_bool)) {
        outp$arr_cat  <- as.integer(as.matrix(df[, c(outp$cols_cat, outp$cols_bool), drop = FALSE]))
        outp$ncol_cat <- length(c(outp$cols_cat, outp$cols_bool))
        outp$ncat     <- sapply(df[, c(outp$cols_cat, outp$cols_bool), drop = FALSE], max) + 1L
    }
    
    if (NROW(outp$cols_ord)) {
        outp$arr_ord  <- as.integer(as.matrix(df[, outp$cols_ord, drop = FALSE]))
        outp$ncol_ord <- length(outp$cols_ord)
        outp$ncat_ord <- sapply(df[, outp$cols_ord, drop = FALSE], max) + 1L
    }
    
    if (!is.null(cols_ignore)) {
        cols_order <- get.cols.ordered(outp)
        if ("character" %in% class(cols_ignore)) {
            outp$cols_ign <- cols_order %in% cols_ignore
        } else if ("logical" %in% class(cols_ignore)) {
            outp$cols_ign <- as.logical(cols_ignore[match(names(df), cols_order)])
        }
    }
    
    return(outp)
}

get.cols.ordered <- function(model_data) {
    return(c(
                c(model_data$cols_num, model_data$cols_date, model_data$cols_ts),
                c(model_data$cols_cat, model_data$cols_bool),
                model_data$cols_ord
            ))
}

split.types.new <- function(df, model_data) {
    df <- check.is.df(df)
    if (length(setdiff(get.cols.ordered(model_data), names(df)))) {
        stop(paste0("Input data frame is missing some columns - head: ",
                    paste(head(setdiff(get.cols.ordered(model_data), names(df)), 3), collapse = ", ")))
    }
    throw_new_lev_warn <- FALSE
    outp = list(
        arr_num = as.numeric(),
        arr_cat = as.integer(),
        arr_ord = as.integer()
    )
    
    if (NROW(model_data$cols_num)) {
        df[, model_data$cols_num] <- as.data.frame(lapply(df[, model_data$cols_num, drop = FALSE], as.numeric))
    }
    if (NROW(model_data$cols_date)) {
        df[, model_data$cols_date] <- mapply(function(a, b) a - b + 1,
                                             as.data.frame(lapply(df[, model_data$cols_date, drop = FALSE], as.numeric)),
                                             model_data$date_min)
    }
    if (NROW(model_data$cols_ts)) {
        df[, model_data$cols_ts] <- mapply(function(a, b) a - b + 1,
                                           as.data.frame(lapply(df[, model_data$cols_ts, drop = FALSE], as.numeric)),
                                           model_data$ts_min)
    }
    if (NROW(model_data$cols_cat)) {
        for (cl in 1:NROW(model_data$cols_cat)) {
            new_levels <- !(df[[model_data$cols_cat[[cl]]]] %in% model_data$cat_levels[[cl]]) &
                          !is.na(df[[model_data$cols_cat[[cl]]]])
            df[[model_data$cols_cat[cl]]] <- factor(df[[model_data$cols_cat[[cl]]]],
                                                    unname(unlist(model_data$cat_levels[[cl]])))
            df[[model_data$cols_cat[cl]]] <- ifelse(is.na(df[[model_data$cols_cat[[cl]]]]),
                                                    -1L, as.integer(df[[model_data$cols_cat[[cl]]]]) - 1L)
            if (any(new_levels)) {
                df[[model_data$cols_cat[[cl]]]][new_levels] <- length(model_data$cat_levels[[cl]])
                throw_new_lev_warn <- TRUE
            }
        }
    }
    if (NROW(model_data$cols_bool)) {
        df[, model_data$cols_bool] <- as.data.frame(lapply(df[, model_data$cols_bool, drop = FALSE],
                                                           function(x) ifelse(is.na(x), -1L, as.integer(as.logical(x)))))
    }
    if (NROW(model_data$cols_ord)) {
        for (cl in 1:NROW(model_data$cols_ord)) {
            new_levels <- !(df[[model_data$cols_ord[[cl]]]] %in% model_data$ord_levels[[cl]]) &
                          !is.na(df[[model_data$cols_ord[[cl]]]])
            df[[model_data$cols_ord[[cl]]]] <- factor(df[[model_data$cols_ord[cl]]], unname(unlist(model_data$ord_levels[[cl]])))
            df[[model_data$cols_ord[[cl]]]] <- ifelse(is.na(df[[model_data$cols_ord[[cl]]]]),
                                                      -1L, as.integer(df[[model_data$cols_ord[[cl]]]]) - 1L)
            if (any(new_levels)) {
                df[[model_data$cols_ord[[cl]]]][new_levels] <- length(model_data$ord_levels[[cl]])
                throw_new_lev_warn <- TRUE
            }
        }
    }
    
    if (NROW(model_data$cols_num) || NROW(model_data$cols_date) || NROW(model_data$cols_ts)) {
        outp$arr_num  <- as.numeric(as.matrix(df[, c(model_data$cols_num,
                                                     model_data$cols_date,
                                                     model_data$cols_ts),
                                                 drop = FALSE]))
    }
    
    if (NROW(model_data$cols_cat) || NROW(model_data$cols_bool)) {
        outp$arr_cat  <- as.integer(as.matrix(df[, c(model_data$cols_cat, model_data$cols_bool), drop = FALSE]))
    }
    
    if (NROW(model_data$cols_ord)) {
        outp$arr_ord  <- as.integer(as.matrix(df[, model_data$cols_ord, drop = FALSE]))
    }
    
    if (throw_new_lev_warn) {
        warning("Some column(s) contain new factor levels, these will be ignored.")
    }
    
    return(outp)
}

discard.input.data <- function(model_data) {
    model_data$arr_num  <- NULL
    model_data$arr_cat  <- NULL
    model_data$arr_ord  <- NULL
    model_data$ncat     <- NULL
    model_data$ncat_ord <- NULL
    model_data$nrow     <- NULL
    model_data$cols_ign <- NULL
    return(model_data)
}

check.nthreads <- function(nthreads) {
    if (is.null(nthreads)) {
        nthreads <- 1L
    } else if (is.na(nthreads)) {
        nthreads <- 1L
    } else if (nthreads < 1L) {
        nthreads <- 1L
    }
    nthreads <- as.integer(nthreads)
    if (nthreads > 1L && !R_has_openmp()) {
        msg <- paste0("Attempting to use more than 1 thread, but ",
                      "package was compiled without OpenMP support.")
        if (tolower(Sys.info()[["sysname"]]) == "darwin")
            msg <- paste0(msg, " See https://github.com/david-cortes/installing-optimized-libraries#4-macos-install-and-enable-openmp")
        warning(msg)
    }
    return(nthreads)
}

check.outliers.print <- function(outliers_print) {
    if (NROW(outliers_print) > 1) { stop("Must pass a scalar value for 'outliers_print'.") }
    if (is.null(outliers_print) || is.na(outliers_print) || outliers_print <= 0) {
        return(as.integer(0))
    }
    if ("numeric"   %in% class(outliers_print))  { outliers_print <- as.integer(outliers_print) }
    if (!("integer" %in% class(outliers_print))) { stop("'outliers_print' must be a positive integer.") }
    if (outliers_print <= 0) { stop("'outliers_print' must be a positive integer.") }
    return(outliers_print)
}

check.is.model.obj <- function(model_obj) {
    if (!("outliertree" %in% class(model_obj))) {
        stop("Must pass an Outlier Tree model object as generated by function 'outlier.tree'.")
    }
    if (is.null(model_obj$obj_from_cpp$ptr_model$ptr) ||
        identical(model_obj$obj_from_cpp$ptr_model$ptr, methods::new("externalptr"))
    ) {
        stop("Outlier Tree model object has been corrupted.")
    }
}

report.no.outliers <- function() {
    cat("No outliers were found.\n")
}

report.outliers <- function(lst, rnames, outliers_print, min_decimals=2) {

    if (NROW(lst) == 0) { report.no.outliers(); return(invisible(NULL)); }
    
    suspicous_value  <- lst$suspicous_value
    group_statistics <- lst$group_statistics
    conditions       <- lst$conditions
    
    ### determine which ones to show
    df_outlierness <- data.frame(
        ix_num         = 1:NROW(lst$tree_depth),
        uses_NA_branch = lst$uses_NA_branch,
        tree_depth     = lst$tree_depth,
        outlier_score  = lst$outlier_score
    )
    ### https://stackoverflow.com/questions/1296646/how-to-sort-a-dataframe-by-multiple-columns
    df_outlierness <- df_outlierness[with(df_outlierness, order(uses_NA_branch, tree_depth, outlier_score)), ]
    
    ### if there are no outliers, stop at that
    if (is.na(df_outlierness[1, "tree_depth"]) || NROW(df_outlierness) == 0) {
        report.no.outliers()
        return(invisible(NULL));
    }
    
    ### otherwise, report only the most outlying ones
    df_outlierness <- df_outlierness[!is.na(df_outlierness$uses_NA_branch), ]
    cat(sprintf("Reporting top %d outliers [out of %d found]\n\n",
                min(outliers_print, NROW(df_outlierness)),
                NROW(df_outlierness)))
    df_outlierness <- df_outlierness[1:min(outliers_print, NROW(df_outlierness)), ]
    for (row in 1:NROW(df_outlierness)) {
        row_ix <- df_outlierness$ix_num[row]
        min_dec_this <- min_decimals
        
        ### print suspicious value
        cat(sprintf("row [%s] - suspicious column: [%s] - ", rnames[row_ix], suspicous_value[[row_ix]]$column))
        if ("numeric" %in% class(suspicous_value[[row_ix]]$value)) {
            if ("decimals" %in% names(suspicous_value[[row_ix]])) {
                min_dec_this <- pmax(min_dec_this, suspicous_value[[row_ix]]$decimals)
            }
            cat(sprintf(sprintf("suspicious value: [%%.%df]\n", min_dec_this),
                        suspicous_value[[row_ix]]$value))
        } else {
            cat(sprintf("suspicious value: [%s]\n", suspicous_value[[row_ix]]$value))
        }
        
        
        ### print distribution
        if ("mean" %in% names(group_statistics[[row_ix]])) {
            if ("upper_thr" %in% names(group_statistics[[row_ix]])) {
                if ("numeric" %in% class(group_statistics[[row_ix]]$upper_thr)) {
                    cat(sprintf(sprintf("\tdistribution: %%.%df%%%% <= %%.%df - [mean: %%.%df] - [sd: %%.%df] - [norm. obs: %%d]\n",
                                        3L, min_dec_this, min_dec_this, min_dec_this),
                                group_statistics[[row_ix]]$pct_below * 100.,
                                group_statistics[[row_ix]]$upper_thr,
                                group_statistics[[row_ix]]$mean,
                                group_statistics[[row_ix]]$sd,
                                group_statistics[[row_ix]]$n_obs))
                } else {
                    cat(sprintf("\tdistribution: %.3f%% <= [%s] - [mean: %s] - [norm. obs: %d]\n",
                                group_statistics[[row_ix]]$pct_below * 100.,
                                group_statistics[[row_ix]]$upper_thr,
                                group_statistics[[row_ix]]$mean,
                                group_statistics[[row_ix]]$n_obs))
                }
            } else {
                if ("numeric" %in% class(group_statistics[[row_ix]]$lower_thr)) {
                    cat(sprintf(sprintf("\tdistribution: %%.%df%%%% >= %%.%df - [mean: %%.%df] - [sd: %%.%df] - [norm. obs: %%d]\n",
                                        3L, min_dec_this, min_dec_this, min_dec_this),
                                group_statistics[[row_ix]]$pct_above * 100.,
                                group_statistics[[row_ix]]$lower_thr,
                                group_statistics[[row_ix]]$mean,
                                group_statistics[[row_ix]]$sd,
                                group_statistics[[row_ix]]$n_obs))
                } else {
                    cat(sprintf("\tdistribution: %.3f%% >= [%s] - [mean: %s] - [norm. obs: %d]\n",
                                group_statistics[[row_ix]]$pct_above * 100.,
                                group_statistics[[row_ix]]$lower_thr,
                                group_statistics[[row_ix]]$mean,
                                group_statistics[[row_ix]]$n_obs))
                }
            }
        } else if ("categs_common" %in% names(group_statistics[[row_ix]])) {
            if (NROW(group_statistics[[row_ix]]$categs_common) == 1) {
                cat(sprintf("\tdistribution: %.3f%% = [%s]\n",
                            group_statistics[[row_ix]]$pct_common * 100.,
                            group_statistics[[row_ix]]$categs_common))
            } else {
                cat(sprintf("\tdistribution: %.3f%% in [%s]\n",
                            group_statistics[[row_ix]]$pct_common * 100.,
                            paste(group_statistics[[row_ix]]$categs_common, collapse = ", ")))
            }
            if (NROW(conditions[[row_ix]])) {
                cat(sprintf("\t( [norm. obs: %d] - [prior_prob: %.3f%%] - [next smallest: %.3f%%] )\n",
                            group_statistics[[row_ix]]$n_obs,
                            group_statistics[[row_ix]]$prior_prob * 100.,
                            group_statistics[[row_ix]]$pct_next_most_comm * 100.))
            } else {
                cat(sprintf("\t( [norm. obs: %d] - [next smallest: %.3f%%] )\n",
                            group_statistics[[row_ix]]$n_obs,
                            group_statistics[[row_ix]]$pct_next_most_comm * 100.))
            }
        }  else if ("categ_maj" %in% names(group_statistics[[row_ix]])) {
            cat(sprintf("\tdistribution: %.3f%% = [%s]\n",
                        group_statistics[[row_ix]]$pct_common * 100.,
                        group_statistics[[row_ix]]$categ_maj))
            cat(sprintf("\t( [norm. obs: %d] - [prior_prob: %.3f%%] )\n",
                        group_statistics[[row_ix]]$n_obs,
                        group_statistics[[row_ix]]$prior_prob * 100))
        } else {
            cat(sprintf("\tdistribution: %.3f%% different [norm. obs: %d]",
                        group_statistics[[row_ix]]$pct_other * 100.,
                        group_statistics[[row_ix]]$n_obs))
            if (NROW(conditions[[row_ix]])) {
                cat(sprintf(" - [prior_prob: %.3f%%]", group_statistics[[row_ix]]$prior_prob * 100.))
            }
            cat("\n")
        }
        
        
        ### print conditions
        if (NROW(conditions[[row_ix]])) {
            cat("\tgiven:\n")
            conditions_this <- simplify.conditions(conditions[[row_ix]])
            for (cond in conditions_this) {
                
                min_dec_this <- pmax(min_decimals, cond$decimals)
                
                switch(cond$comparison,
                       "is NA" = {
                               cat(sprintf("\t\t[%s] is NA\n", cond$column))
                       },
                       "<=" = {
                               if ("numeric" %in% class(cond$value_this)) {
                                   cat(sprintf(sprintf("\t\t[%%s] <= [%%.%df] (value: %%.%df)\n",
                                                       min_dec_this, min_dec_this),
                                               cond$column, cond$value_comp, cond$value_this))
                               } else {
                                   cat(sprintf("\t\t[%s] <= [%s] (value: %s)\n",
                                               cond$column, cond$value_comp, cond$value_this))
                               }
                       },
                       ">" = {
                               if ("numeric" %in% class(cond$value_this)) {
                                   cat(sprintf(sprintf("\t\t[%%s] > [%%.%df] (value: %%.%df)\n",
                                                       min_dec_this, min_dec_this),
                                               cond$column, cond$value_comp, cond$value_this))
                               } else {
                                   cat(sprintf("\t\t[%s] > [%s] (value: %s)\n",
                                               cond$column, cond$value_comp, cond$value_this))
                               }
                       },
                       "between" = {
                               if ("numeric" %in% class(cond$value_this)) {
                                   cat(sprintf(sprintf("\t\t[%%s] between (%%.%df, %%.%df] (value: %%.%df)\n",
                                                       min_dec_this, min_dec_this, min_dec_this),
                                               cond$column, cond$value_comp[1], cond$value_comp[2], cond$value_this))
                               } else {
                                   cat(sprintf("\t\t[%s] between (%s, %s] (value: %s)\n",
                                               cond$column, cond$value_comp[1], cond$value_comp[2], cond$value_this))
                               }
                       },
                       "=" = {
                               cat(sprintf("\t\t[%s] = [%s]\n", cond$column, cond$value_comp))
                       },
                       "!=" = {
                               cat(sprintf("\t\t[%s] != [%s] (value: %s)\n",
                                           cond$column, cond$value_comp, cond$value_this))
                       },
                       "in" = {
                               cat(sprintf("\t\t[%s] in [%s] (value: %s)\n",
                                           cond$column, paste(cond$value_comp, collapse = ", "), cond$value_this))
                       })
            }
        }
        
        cat("\n\n")
    }
}

simplify.conditions <- function(conditions) {
    if (NROW(conditions) <= 1) { return(conditions) }
    cols_taken <- sapply(conditions, function(x) x$column)
    if (NROW(unique(cols_taken)) < NROW(cols_taken)) {
        repeated_cols <- table(cols_taken, useNA = "no")
        repeated_cols <- names(repeated_cols)[repeated_cols > 1]
        replacing_cond <- list()
        for (cl in repeated_cols) {
            
            n_le <- 0
            n_gt <- 0
            n_in <- 0
            n_eq <- 0
            n_neq       <- 0
            lowest_le   <-  Inf
            highest_gt  <- -Inf
            val_eq      <- NA
            val_neq     <- NA
            smallest_in <- NULL
            highest_dec <- 0
            
            for (cn in 1:NROW(conditions)) {
                if (conditions[[cn]]$column == cl) {
                    val_this <- conditions[[cn]]$value_this
                    switch(conditions[[cn]]$comparison,
                           "<=" = {
                                   n_le <- n_le + 1
                                   if (conditions[[cn]]$value_comp < lowest_le) {
                                       lowest_le <- conditions[[cn]]$value_comp
                                   }
                                   if (!is.null(conditions[[cn]]$decimals))
                                       highest_dec <- pmax(highest_dec, conditions[[cn]]$decimals)
                           },
                           ">" = {
                                   n_gt <- n_gt + 1
                                   if (conditions[[cn]]$value_comp > highest_gt) {
                                       highest_gt <- conditions[[cn]]$value_comp
                                   }
                                   if (!is.null(conditions[[cn]]$decimals))
                                       highest_dec <- pmax(highest_dec, conditions[[cn]]$decimals)
                           },
                           "in" = {
                                   n_in <- n_in + 1
                                   if (is.null(smallest_in)) {
                                       smallest_in <- conditions[[cn]]$value_comp
                                   } else {
                                       smallest_in <- intersect(smallest_in, conditions[[cn]]$value_comp)
                                   }
                           },
                           "=" = {
                                   n_eq   <- n_eq + 1
                                   val_eq <- conditions[[cn]]$value_comp
                           },
                           "!=" = {
                                   n_neq   <- n_neq + 1
                                   val_neq <- conditions[[cn]]$value_comp
                           }
                    )
                }
            }
            
            if        (n_le  > 0 & n_gt == 0) {
                replacing_cond[[NROW(replacing_cond) + 1]] <- list(column = cl, value_this = val_this,
                                                                   comparison = "<=", value_comp = lowest_le,
                                                                   decimals = highest_dec)
            } else if (n_gt  > 0 & n_le == 0) {
                replacing_cond[[NROW(replacing_cond) + 1]] <- list(column = cl, value_this = val_this,
                                                                   comparison = ">", value_comp = highest_gt,
                                                                   decimals = highest_dec)
            } else if (n_le  > 0 & n_gt  > 0) {
                replacing_cond[[NROW(replacing_cond) + 1]] <- list(column = cl, value_this = val_this,
                                                                   comparison = "between", value_comp = c(highest_gt, lowest_le),
                                                                   decimals = highest_dec)
            } else if (n_in  > 0 & n_eq == 0 & n_neq == 0) {
                replacing_cond[[NROW(replacing_cond) + 1]] <- list(column = cl, value_this = val_this,
                                                                   comparison = "in", value_comp = smallest_in)
            } else if (n_in  > 0 & n_eq  > 0) {
                replacing_cond[[NROW(replacing_cond) + 1]] <- list(column = cl, value_this = val_this,
                                                                   comparison = "=", value_comp = val_eq)
            } else if (n_in  > 0 & n_neq > 0) {
                replacing_cond[[NROW(replacing_cond) + 1]] <- list(column = cl, value_this = val_this,
                                                                   comparison = "!=", value_comp = val_neq)
            }
            
        }
        conditions <- append(conditions[sapply(conditions, function(x) !(x$column %in% repeated_cols))],
                             replacing_cond)
    }
    
    conditions <- lapply(conditions, function(x)
                                        if ((x$comparison == "in") && (NROW(x$value_comp) == 1)) {
                                            x$comparison <- "="; return(x);
                                        } else { return(x) })
    return(conditions[rev(1:NROW(conditions))])
}


outliers.to.list <- function(df, outliers_info) {
    outliers_data <- lapply(1:NROW(df),
                            function(row, lst) list(
                                suspicous_value  = lst$suspicous_value[[row]],
                                group_statistics = lst$group_statistics[[row]],
                                conditions       = lst$conditions[[row]],
                                tree_depth       = lst$tree_depth[[row]],
                                uses_NA_branch   = lst$uses_NA_branch[[row]],
                                outlier_score    = lst$outlier_score[[row]]
                                ),
                            outliers_info)
    names(outliers_data) <- row.names(df)
    class(outliers_data) <- c("outlieroutputs", class(outliers_data))
    return(outliers_data)
}


list.to.outliers <- function(outliers_data) {
    return(list(
        suspicous_value  = sapply(outliers_data, function(x) x$suspicous_value, simplify = FALSE),
        group_statistics = sapply(outliers_data, function(x) x$group_statistics, simplify = FALSE),
        conditions       = sapply(outliers_data, function(x) x$conditions, simplify = FALSE),
        tree_depth       = sapply(outliers_data, function(x) x$tree_depth, simplify = TRUE),
        uses_NA_branch   = sapply(outliers_data, function(x) x$uses_NA_branch, simplify = TRUE),
        outlier_score    = sapply(outliers_data, function(x) x$outlier_score, simplify = TRUE)
    ))
}

produce.empty.outliers <- function(row_names) {
    empty_lst        <- rep_len(list(list(suspicous_value = list(), group_statistics= list(), conditions = list(),
                                          tree_depth = NA, uses_NA_branch = NA, outlier_score = NA)),
                                NROW(row_names))
    names(empty_lst) <- row_names
    class(empty_lst) <- c("outlieroutputs", class(empty_lst))
    return(empty_lst)
}

Try the outliertree package in your browser

Any scripts or data that you put into this service are public.

outliertree documentation built on Nov. 22, 2023, 1:08 a.m.