# tests/testthat/test_discretization.R

# Label the tests of this file in the testthat reporter output.
context("test_discretization.R")
# Verbosity flag forwarded to the functions under test through their
# `verbose` argument.
verbose <- TRUE
## build_bins
# ---------------
test_that("build_bins: with cols set to auto if perform it on all numerics", {
    # Given: one numeric column and one character column.
    # Columns must be declared with `=`; writing `not_num_col <- ...` inside
    # data.table() passes an unnamed argument (so the column is not called
    # "not_num_col") and creates a stray local variable as a side effect.
    data_set <- data.table(
        num_col = c(1, 2, 3),
        not_num_col = c("a", "b", "c")
    )

    # When: columns are selected automatically
    bins <- build_bins(data_set, cols = "auto", verbose = verbose)

    # Then: only the numeric column has bins (actual value first, expected second)
    expect_equal(names(bins), "num_col")
})

test_that("build_bins: type doesn't affect result shape", {
    # Given: a single integer column holding 1..100
    data_set <- data.table(num_col = 1 : 100)

    # When: bins are built once per type
    bins_freq <- build_bins(data_set, cols = "num_col", type = "equal_freq", verbose = verbose)
    bins_width <- build_bins(data_set, cols = "num_col", type = "equal_width", verbose = verbose)

    # Then: same number of entries, same names, same number of thresholds
    # per entry.
    expect_equal(length(bins_width), length(bins_freq))
    expect_equal(names(bins_width), names(bins_freq))
    # lengths() always yields an integer vector, whereas sapply(x, length)
    # changes its return type (to list()) when the input is empty.
    expect_equal(lengths(bins_width), lengths(bins_freq))
})

test_that("build_bins: doesn't do anything on not numeric col", {
    # Given: a table holding only a character column
    char_only <- data.table(not_numeric_col = c("a", "b", "c"))

    # When: bins are requested for that column
    result <- build_bins(char_only, cols = "not_numeric_col", verbose = verbose)

    # Then: nothing was built
    no_bins <- list()
    expect_equal(no_bins, result)
})

test_that("build_bins: doesn't do anything on constant col", {
    # Given: a numeric column where every one of the 100 values is 1
    constant_only <- data.table(constant_col = rep(1, 100))

    # When: bins are requested for that column
    result <- build_bins(constant_only, cols = "constant_col", verbose = verbose)

    # Then: nothing was built
    no_bins <- list()
    expect_equal(no_bins, result)
})

test_that("build_bins: expect error when type is not correct", {
    # Given: a valid table and a type that is neither of the two named in
    # the expected error message
    int_table <- data.table(col = 1 : 10)
    bad_type <- "a"
    expected_message <- ": type should either be 'equal_width' or 'equal_freq'."

    # When and Then: the call fails with the expected message
    expect_error(
        build_bins(int_table, type = bad_type, verbose = verbose),
        expected_message
    )
})

# equal_width_splits
# ------------------
test_that("private function: equal_width_splits: Generate n_bins + 1 threshold to have n_bins bins", {
    # Given: a target of 9 bins over 100 uniform random draws
    requested_bins <- 9
    values <- runif(100)

    # When
    thresholds <- equal_width_splits(values, n_bins = requested_bins, verbose = verbose)

    # Then: there is exactly one more threshold than bins
    expect_equal(length(thresholds), requested_bins + 1)
})

# The description is assembled with paste() so it stays on one logical line:
# a string literal broken across source lines would embed a newline and the
# indentation in the test name shown by the reporter.
test_that(paste(
    "private function: equal_width_splits: Generate n_bins + 1 threshold to have",
    "n_bins bins even with less values than n_bins"
), {
    # Given: fewer values (5) than requested bins (9)
    a_column <- runif(5)
    n_bins <- 9

    # When
    bins <- equal_width_splits(a_column, n_bins = n_bins, verbose = verbose)

    # Then: still one more threshold than bins
    expect_equal(length(bins), n_bins + 1)
})


# The description names the scenario so a failure can be told apart from the
# other equal_width_splits tests.
test_that("private function: equal_width_splits: evenly spaced values", {
    # Given: three evenly spaced values split into two bins
    a_column <- c(1, 2, 3)
    n_bins <- 2

    # When
    bins <- equal_width_splits(a_column, n_bins = n_bins, verbose = verbose)

    # Then: thresholds are min, midpoint and max
    expect_equal(bins, c(1, 2, 3))
})

# The description names the scenario so a failure can be told apart from the
# other equal_width_splits tests.
test_that("private function: equal_width_splits: unevenly distributed values", {
    # Given: values ranging over [1, 3] but clustered around 2
    a_column <- c(1, 2, 2.1, 2.2, 3)
    n_bins <- 2

    # When
    bins <- equal_width_splits(a_column, n_bins = n_bins, verbose = verbose)

    # Then: thresholds are still 1, 2 and 3 despite the clustering
    expect_equal(bins, c(1, 2, 3))
})

# The description names the scenario, mirroring the matching
# equal_freq_splits test ("with more bins than values").
test_that("private function: equal_width_splits: with more bins than values", {
    # Given: a single value and ten requested bins
    a_column <- c(1)
    n_bins <- 10

    # When
    bins <- equal_width_splits(a_column, n_bins = n_bins, verbose = verbose)

    # Then: the only threshold is the value itself
    expect_equal(bins, c(1))
})

# equal_freq_splits
# ------------------
test_that("private function: equal_freq_splits: Generate n_bins + 1 threshold to have n_bins bins", {
    # Given: a target of 9 bins over 100 uniform random draws
    requested_bins <- 9
    values <- runif(100)

    # When
    thresholds <- equal_freq_splits(values, n_bins = requested_bins, verbose = verbose)

    # Then: there is exactly one more threshold than bins
    expect_equal(length(thresholds), requested_bins + 1)
})

# The description previously ended in a dangling ": "; it now names the
# scenario being tested.
test_that("private function: equal_freq_splits: evenly spaced values", {
    # Given: three evenly spaced values split into two bins
    a_column <- c(1, 2, 3)
    n_bins <- 2

    # When
    bins <- equal_freq_splits(a_column, n_bins = n_bins, verbose = verbose)

    # Then: outer thresholds are infinite and the inner one is 2
    expect_equal(bins, c(-Inf, 2, Inf))
})

# The description names the scenario so a failure can be told apart from the
# other equal_freq_splits tests.
test_that("private function: equal_freq_splits: unevenly distributed values", {
    # Given: values ranging over [1, 3] but clustered around 2
    a_column <- c(1, 2, 2.1, 2.2, 3)
    n_bins <- 2

    # When
    bins <- equal_freq_splits(a_column, n_bins = n_bins, verbose = verbose)

    # Then: outer thresholds are infinite and the inner one is 2.1
    expect_equal(bins, c(-Inf, 2.1, Inf))
})

test_that("private function: equal_freq_splits: with more bins than values", {
    # Given: ten requested bins for a column holding a single value
    requested_bins <- 10
    lone_value <- c(1)

    # When
    thresholds <- equal_freq_splits(lone_value, n_bins = requested_bins, verbose = verbose)

    # Then: the value sits between the two infinite outer thresholds
    expect_equal(thresholds, c(-Inf, 1, Inf))
})

# is.possible_to_split
# ------------------
test_that("private function: is.possible_to_split: control sanity check data_set should be a vector.", {
    # Given: a character scalar passed as data_set, with a valid n_bins
    n_bins <- 2
    not_numeric_input <- "something"
    expected_message <- "data_set should be a vector of numerics and n_bins a numeric."

    # When and Then: the input check rejects it
    expect_error(
        is.possible_to_split(data_set = not_numeric_input, n_bins = n_bins),
        expected_message
    )
})

test_that("private function: is.possible_to_split: control sanity check n_bins should be numeric.", {
    # Given: a valid numeric vector, with a character scalar passed as n_bins
    numeric_input <- c(1, 2)
    not_numeric_bins <- "something"
    expected_message <- "data_set should be a vector of numerics and n_bins a numeric."

    # When and Then: the input check rejects it
    expect_error(
        is.possible_to_split(data_set = numeric_input, n_bins = not_numeric_bins),
        expected_message
    )
})


## fast_discretization
# -------------------
test_that("fast_discretization: after discretisation there are no more numerics", {
    # Given: a numeric column containing one missing value
    data_set <- data.table(col = runif(10))
    data_set[["col"]][1] <- NA # add a NA

    # When: no precomputed bins are supplied (bins = NULL)
    discretized <- fast_discretization(data_set, bins = NULL, verbose = verbose)

    # Then: no column is numeric any more.
    # vapply() guarantees a logical vector for any(); sapply() is not
    # type-stable and returns list() when there is nothing to iterate over.
    expect_false(any(vapply(discretized, is.numeric, logical(1))))
})

## build_splits_names
# -------------------
test_that("private function: build_splits_names: without inf", {
    # Given: three finite thresholds, i.e. two intervals
    splits <- c(0, 1, 2)
    expected_split_names <- c("[0, 1[", "[1, 2]")

    # When
    split_names <- build_splits_names(splits)

    # Then: actual value first, expected second.
    # Only finite thresholds are checked here; infinite bounds belong to the
    # separate "with inf" test of this file.
    expect_identical(split_names, expected_split_names)
})

test_that("private function: build_splits_names: with inf", {
    # Given: thresholds that are infinite on both outer sides
    thresholds <- c(-Inf, 2, Inf)

    # When
    interval_labels <- build_splits_names(thresholds)

    # Then: both outer intervals are rendered as open towards infinity
    expect_identical(c("]-Inf, 2[", "[2, +Inf["), interval_labels)
})

# Try the dataPreparation package in your browser
#
# Any scripts or data that you put into this service are public.
#
# dataPreparation documentation built on July 9, 2023, 7:02 p.m.