tests/testthat/test-discretize.R

library(dplyr)
library(FSelectorRcpp)

iris_plus <- setNames(iris, gsub(
  pattern = "\\.",
  replacement = "+",
  x = colnames(iris)
))

test_that("Data frame output", {
  expect_s3_class(discretize(Species ~ ., iris),
                  class = "data.frame")
  expect_s3_class(discretize(Species ~ Sepal.Length, iris),
                  class = "data.frame")
  expect_s3_class(discretize(iris$Sepal.Length, iris$Species),
                  class = "data.frame")
})

if (require("FSelector") && require("RWeka")) {
  test_that("Discretization - basic", {
    dt <- lapply(1:5, function(xx) {
      x <- rnorm(1000, mean = 10 * xx)
      y <- rnorm(1000, mean = 0.5 * xx)
      z <- 10 * xx + 0.5 * sqrt(xx)
      data.frame(x, y, z)
    })

    dt <- do.call(bind_rows, dt)

    dt$z <- as.factor(as.integer(round(dt$z)))

    weka <- as.numeric(RWeka::Discretize(z ~ x, dt)[, 1])
    fs <- as.numeric(discretize(dt$x, dt$z)[[1]])

    expect_equal(weka, fs)

    weka <- RWeka::Discretize(z ~ x, dt)[, 1]
    fs <- discretize(dt$x, dt$z)[[1]]
    levels(weka)
    levels(fs)
  })


  test_that("Discretization - single NA (independent variable)", {
    iris$Sepal.Length[3] <- NA

    Weka <- as.numeric(RWeka::Discretize(Species ~ Sepal.Length,
                                         data = iris)[, 1])
    Weka <- c(Weka[1:2], NA, tail(Weka, -2))

    fs <- as.numeric(FSelectorRcpp::discretize(
      iris$Sepal.Length, iris$Species)[[2]])

    expect_equal(Weka, fs)
  })
}



test_that("Discretization - not supported data type - throw error.", {
  x <- "a"
  y <- "b"
  expect_error(discretize(x, y))
})

test_that("Discretization - formula works both ways.", {
  expect_equal(discretize(Species ~ ., iris), discretize(iris, Species ~ .))
})

test_that("Discretization - formula returns data.frame.", {
  expect_s3_class(discretize(Species ~ ., iris), "data.frame")
})

test_that("Discretization - expect warning when there is non numeric column in
          formula and all=FALSE.", {
  dt <- cbind(iris, b = "a")
  expect_warning(discretize(Species ~ ., dt, all = FALSE))
})

test_that("Discretization - not implemented for data.frame", {
  dt <- cbind(iris, b = "a")
  expect_error(discretize(dt))
})

test_that("Discretization - not supported method", {
  expect_error(discretize(Species ~ ., iris, control = list(method = "test")))
  control <- structure(list(method = "test"), class = "discretizationControl")
  expect_error(discretize(Species ~ ., iris, control = control))
})

test_that("Discretization - equalsize - ordered.", {
  x <- as.numeric(1:6)
  y <- 1:6

  d <- discretize(x, y, control = equalsizeControl(k = 2))[[2]]
  expect_equal(as.numeric(d), c(1, 1, 1, 2, 2, 2))

  d <- discretize(x, y, control = equalsizeControl(k = 3))[[2]]
  expect_equal(as.numeric(d), c(1, 1, 2, 2, 3, 3))

  d <- discretize(x, y, control = equalsizeControl(k = 4))[[2]]
  expect_equal(as.numeric(d), c(1, 1, 2, 2, 3, 4))

  d <- discretize(x, y, control = equalsizeControl(k = 5))[[2]]
  expect_equal(as.numeric(d), c(1, 1, 2, 3, 4, 5))
})

test_that("Discretization - equalsize - reverse order", {
  x <- as.numeric(6:1)
  y <- 1:6

  d <- discretize(x, y, control = equalsizeControl(k = 2))[[2]]
  expect_equal(as.numeric(d), c(1, 1, 1, 2, 2, 2) %>% rev)

  d <- discretize(x, y, control = equalsizeControl(k = 3))[[2]]
  expect_equal(as.numeric(d), c(1, 1, 2, 2, 3, 3) %>% rev)

  d <- discretize(x, y, control = equalsizeControl(k = 4))[[2]]
  expect_equal(as.numeric(d), c(1, 1, 2, 2, 3, 4) %>% rev)

  d <- discretize(x, y, control = equalsizeControl(k = 5))[[2]]
  expect_equal(as.numeric(d), c(1, 1, 2, 3, 4, 5) %>% rev)
})

test_that("Discretization - equalsize - pseudo-random order", {
  x <- c(6, 4, 2, 3, 1, 5)
  y <- 1:6

  d <- discretize(x, y, control = equalsizeControl(k = 2))[[2]]
  expect_equal(as.numeric(d), c(2, 2, 1, 1, 1, 2))

  d <- discretize(x, y, control = equalsizeControl(k = 3))[[2]]
  expect_equal(as.numeric(d), c(3, 2, 1, 2, 1, 3))

  d <- discretize(x, y, control = equalsizeControl(k = 4))[[2]]
  expect_equal(as.numeric(d), c(4, 2, 1, 2, 1, 3))

  d <- discretize(x, y, control = equalsizeControl(k = 5))[[2]]
  expect_equal(as.numeric(d), c(5, 3, 1, 2, 1, 4))
})

test_that("Zero split points", {
  set.seed(1)

  x <- rep(0, 10)
  y <- rep(0, 10)

  expect_warning(discretize(x, y))
})

test_that("List interface inside function", {
  fnc <- function(xx) {
    discretize(list(xx$"Sepal+Length", xx[[2]],
                    xx[["Petal+Length"]]), xx$Species)

  }

  expect_equal(
    colnames(fnc(iris_plus)),
    c("Species", "Sepal+Length", "Sepal+Width", "Petal+Length"))
})

test_that("Interfaces", {
  expect_equal(
    colnames(discretize(iris$Sepal.Length, iris[["Species"]])),
    c("Species", "Sepal.Length")
  )

  expect_equal(
    colnames(discretize(iris$Sepal.Length, iris[[5]])),
    c("Species", "Sepal.Length")
  )

  expect_equal(
    colnames(discretize(list(iris$Sepal.Length, iris[[2]],
                             iris[["Petal.Length"]]), iris$Species)),
    colnames(discretize(Species ~ . - Petal.Width, iris, all = FALSE))
  )

  expect_equal(
    colnames(discretize(list(iris_plus$"Sepal+Length", iris_plus[[2]],
                             iris_plus[["Petal+Length"]]), iris_plus$Species)),
    c("Species", "Sepal+Length", "Sepal+Width", "Petal+Length")
  )

  expect_s3_class(discretize(iris[-5], iris$Species), "data.frame")
})

test_that("Custom breaks", {
  breaks <- c(0, 2, 4, 6, 8, 20, Inf)
  disc <- discretize(iris, Species ~ Sepal.Length, customBreaksControl(breaks))

  cc <- cut(iris$Sepal.Length, breaks = breaks, ordered_result = TRUE)
  expect_true(all(disc$Sepal.Length == cc))

  expect_error(customBreaksControl(c("A")))
})

test_that("Throw error for duplicated columns", {
  x <- iris
  colnames(x)[1:2] <- "X"
  expect_error(discretize(Species ~ ., x))
  expect_error(discretize(iris, iris$Species))
})

test_that("Throw an error when there's no numeric columns", {
  expect_error(discretize(discretize(Species ~ ., iris), Species ~ .))
})


iris_num <- iris[, c(1, 5)]
iris_num[["SepLenInteger"]] <- as.integer(iris_num$Sepal.Length)
iris_num[["SepLenNumeric"]] <- as.numeric(as.integer(iris_num$Sepal.Length))
iris_num <- iris_num[, -1] # remove Sepal.Length column


test_that("to not discretize integer column set discIntegers = FALSE", {

  expect_equal(
    discretize(
      Species ~ ., iris_num,
      discIntegers = FALSE)[["SepLenInteger"]],
    iris_num[["SepLenInteger"]]
  )

  expect_equal(
    discretize(
      iris_num, Species ~ .,
      discIntegers = FALSE)[["SepLenInteger"]],
    iris_num[["SepLenInteger"]]
  )

  expect_equal(
    discretize(
      iris_num[, 2:3], iris_num$Species,
      discIntegers = FALSE)[["SepLenInteger"]],
    iris_num[["SepLenInteger"]]
  )
})

test_that("By default integer columns are discretized", {

  res <- discretize(Species ~ ., iris_num)
  expect_equal(
    res[["SepLenInteger"]],
    res[["SepLenNumeric"]]
  )

  res <- discretize(iris_num, Species ~ .)
  expect_equal(
    res[["SepLenInteger"]],
    res[["SepLenNumeric"]]
  )

  res <- discretize(iris_num[, 2:3], iris_num$Species)
  expect_equal(
    res[["SepLenInteger"]],
    res[["SepLenNumeric"]]
  )
})

test_that("no double values error when discIntegers = FALSE", {
  dt <- data.frame(y = c("A", "A"), x = c("A", "B"), z = 1:2)
  expect_error(discretize(y~., dt, discIntegers = FALSE))
})

Try the FSelectorRcpp package in your browser

Any scripts or data that you put into this service are public.

FSelectorRcpp documentation built on April 28, 2023, 5:07 p.m.