# tests/testthat/test_gpb.convert_with_rules.R

context("gpb.convert_with_rules()")

# Skip these long-running tests on CRAN: they only run when the environment
# variable GPBOOST_ALL_TESTS is set to "GPBOOST_ALL_TESTS"
if(Sys.getenv("GPBOOST_ALL_TESTS") == "GPBOOST_ALL_TESTS"){
  
  test_that("gpb.convert_with_rules() rejects inputs that are not a data.table or data.frame", {
    # None of these candidates is a data.frame or a data.table, so every call
    # below is expected to fail with the function's own error message.
    unsupported_inputs <- list(
      matrix(1.0:10.0, 2L, 5L)
      , TRUE
      , c("a", "b")
      , NA
      , 10L
      , gpb.Dataset(data = matrix(1.0:10.0, 2L, 5L), params = list())
    )
    for (idx in seq_along(unsupported_inputs)) {
      candidate <- unsupported_inputs[[idx]]
      # fixed = TRUE: the pattern is matched literally, not as a regex
      expect_error(
        gpb.convert_with_rules(candidate)
        , regexp = "gpb.convert_with_rules: you provided"
        , fixed = TRUE
      )
    }
  })
  
  test_that("gpb.convert_with_rules() should work correctly for a dataset with only character columns", {
    char_df <- data.frame(
      col1 = c("a", "b", "c")
      , col2 =  c("green", "green", "red")
      , stringsAsFactors = FALSE
    )
    char_dt <- data.table::as.data.table(char_df)

    # integer codes expected for each column after conversion
    expected_codes <- list(
      col1 = c(1L, 2L, 3L)
      , col2 = c(1L, 1L, 2L)
    )
    # value -> code mapping expected to be reported for each column
    expected_rules <- list(
      col1 = c("a" = 1L, "b" = 2L, "c" = 3L)
      , col2 = c("green" = 1L, "red" = 2L)
    )

    # the same expectations apply to data.frame and data.table inputs
    for (original in list(char_df, char_dt)) {
      result <- gpb.convert_with_rules(original)

      # the container class is preserved; every column becomes integer codes
      converted <- result[["data"]]
      expect_identical(class(original), class(converted))
      for (col in names(expected_codes)) {
        expect_identical(class(converted[[col]]), "integer")
        expect_identical(converted[[col]], expected_codes[[col]])
      }

      # one rule per input column, each mapping original values to codes
      rules <- result[["rules"]]
      expect_is(rules, "list")
      expect_length(rules, ncol(original))
      for (col in names(expected_rules)) {
        expect_identical(rules[[col]], expected_rules[[col]])
      }
    }
  })
  
  test_that("gpb.convert_with_rules() should work correctly for a dataset with only factor columns", {
    factor_df <- data.frame(
      col1 = as.factor(c("a", "b", "c"))
      , col2 =  as.factor(c("green", "green", "red"))
      , stringsAsFactors = FALSE
    )
    factor_dt <- data.table::as.data.table(factor_df)

    # Runs the conversion on one input and checks both returned pieces:
    # the converted data and the encoding rules.
    verify_factor_conversion <- function(original) {
      result <- gpb.convert_with_rules(original)
      converted <- result[["data"]]
      rules <- result[["rules"]]

      # same container class, factor columns replaced by integer codes
      expect_identical(class(original), class(converted))
      first_col <- converted[["col1"]]
      second_col <- converted[["col2"]]
      expect_identical(class(first_col), "integer")
      expect_identical(class(second_col), "integer")
      expect_identical(first_col, c(1L, 2L, 3L))
      expect_identical(second_col, c(1L, 1L, 2L))

      # one rule per column, mapping each level to its code
      expect_is(rules, "list")
      expect_length(rules, ncol(original))
      expect_identical(rules[["col1"]], c("a" = 1L, "b" = 2L, "c" = 3L))
      expect_identical(rules[["col2"]], c("green" = 1L, "red" = 2L))
    }

    # data.frame first, then data.table
    for (original in list(factor_df, factor_dt)) {
      verify_factor_conversion(original)
    }
  })
  
  test_that("gpb.convert_with_rules() should not change a dataset with only integer columns", {
    int_df <- data.frame(
      col1 = 11L:15L
      , col2 = 16L:20L
      , stringsAsFactors = FALSE
    )
    int_dt <- data.table::as.data.table(int_df)
    for (original in list(int_df, int_dt)) {
      result <- gpb.convert_with_rules(original)
      # integer columns need no encoding, so the data must come back unchanged
      expect_identical(result[["data"]], original)
      # ... and no rules are expected, since nothing was converted
      expect_identical(result[["rules"]], list())
    }
  })
  
  test_that("gpb.convert_with_rules() should work correctly for a dataset with numeric, factor, and character columns", {
    mixed_df <- data.frame(
      character_col = c("a", "b", "c")
      , numeric_col = c(1.0, 9.0, 10.0)
      , factor_col = as.factor(c("n", "n", "y"))
      , stringsAsFactors = FALSE
    )
    mixed_dt <- data.table::as.data.table(mixed_df)

    # only the character and factor columns are expected to be encoded
    expected_codes <- list(
      character_col = c(1L, 2L, 3L)
      , factor_col = c(1L, 1L, 2L)
    )

    for (original in list(mixed_df, mixed_dt)) {
      result <- gpb.convert_with_rules(original)
      converted <- result[["data"]]
      expect_identical(class(original), class(converted))

      # character / factor columns become integer codes
      for (col in names(expected_codes)) {
        expect_identical(class(converted[[col]]), "integer")
        expect_identical(converted[[col]], expected_codes[[col]])
      }

      # today, gpb.convert_with_rules() does not convert numeric columns
      numeric_values <- converted[["numeric_col"]]
      expect_identical(class(numeric_values), "numeric")
      expect_identical(numeric_values, c(1.0, 9.0, 10.0))

      # rules exist only for the two encoded columns
      rules <- result[["rules"]]
      expect_is(rules, "list")
      expect_length(rules, 2L)
      expect_identical(rules[["character_col"]], c("a" = 1L, "b" = 2L, "c" = 3L))
      expect_identical(rules[["factor_col"]], c("n" = 1L, "y" = 2L))
    }
  })
  
  test_that("gpb.convert_with_rules() should convert missing values to the expected value", {
    na_df <- data.frame(
      character_col = c("a", NA_character_, "c")
      , na_col = rep(NA, 3L)
      , na_real_col = rep(NA_real_, 3L)
      , na_int_col = rep(NA_integer_,  3L)
      , na_character_col = rep(NA_character_, 3L)
      , numeric_col = c(1.0, 9.0, NA_real_)
      , factor_col = as.factor(c("n", "n", "y"))
      , integer_col = c(1L, 9L, NA_integer_)
      , stringsAsFactors = FALSE
    )
    na_dt <- data.table::as.data.table(na_df)

    for (original in list(na_df, na_dt)) {
      result <- gpb.convert_with_rules(original)
      converted <- result[["data"]]
      n_rows <- nrow(converted)
      expect_identical(class(original), class(converted))

      # columns expected to hold integers after conversion
      expected_integer <- list(
        # NA in a character column is encoded as 0
        character_col = c(1L, 0L, 2L)
        # already-integer columns keep their NAs (no 0-filling)
        , integer_col = c(1L, 9L, NA_integer_)
        , na_int_col = rep(NA_integer_, n_rows)
        , factor_col = c(1L, 1L, 2L)
        # an all-NA character column becomes all 0
        , na_character_col = rep(0L, n_rows)
        # an all-NA logical column becomes integer, with NA encoded as -1
        , na_col = rep(-1L, 3L)
      )
      for (col in names(expected_integer)) {
        expect_identical(class(converted[[col]]), "integer")
        expect_identical(converted[[col]], expected_integer[[col]])
      }

      # numeric columns are not converted to integer and keep their NAs
      expected_numeric <- list(
        na_real_col = rep(NA_real_, n_rows)
        , numeric_col = c(1.0, 9.0, NA_real_)
      )
      for (col in names(expected_numeric)) {
        expect_identical(class(converted[[col]]), "numeric")
        expect_identical(converted[[col]], expected_numeric[[col]])
      }

      # rules are reported for exactly three columns; NA never appears as a key
      rules <- result[["rules"]]
      expect_is(rules, "list")
      expect_length(rules, 3L)
      expect_identical(rules[["character_col"]], c("a" = 1L, "c" = 2L))
      expect_identical(rules[["factor_col"]], c("n" = 1L, "y" = 2L))
      expect_identical(rules[["na_col"]], stats::setNames(c(0L, 1L), c(FALSE, TRUE)))
    }
  })
  
  test_that("gpb.convert_with_rules() should work correctly if you provide your own well-formed rules", {
    source_df <- data.frame(
      character_col = c("a", NA_character_, "c", "a", "a", "c")
      , na_col = rep(NA, 6L)
      , na_real_col = rep(NA_real_, 6L)
      , na_int_col = rep(NA_integer_, 6L)
      , na_character_col = rep(NA_character_, 6L)
      , numeric_col = c(1.0, 9.0, NA_real_, 10.0, 11.0, 12.0)
      , factor_col = as.factor(c("n", "n", "y", "y", "n", "n"))
      , integer_col = c(1L, 9L, NA_integer_, 1L, 1L, 1L)
      , stringsAsFactors = FALSE
    )
    source_dt <- data.table::as.data.table(source_df)

    # codes this test expects for values that have no entry in the rules:
    # 0 for character columns, -1 for logical columns
    unseen_category_code <- 0L
    unseen_logical_code <- -1L

    # caller-supplied encodings for two of the columns
    user_rules <- list(
      "character_col" = c("a" = 5L, "c" = -10L)
      , "factor_col" = c("n" = 65L, "y" = 66L)
    )
    # columns without a rule that are expected to come back unchanged
    passthrough_cols <- c("na_real_col", "na_int_col", "numeric_col", "integer_col")

    for (original in list(source_df, source_dt)) {
      result <- gpb.convert_with_rules(data = original, rules = user_rules)
      converted <- result[["data"]]
      expect_identical(class(original), class(converted))

      # columns covered by the rules use the supplied codes; the NA in
      # character_col has no rule and gets the unseen-category code
      encoded_character <- converted[["character_col"]]
      expect_identical(class(encoded_character), "integer")
      expect_identical(
        encoded_character
        , c(5L, unseen_category_code, -10L, 5L, 5L, -10L)
      )
      encoded_factor <- converted[["factor_col"]]
      expect_identical(class(encoded_factor), "integer")
      expect_identical(encoded_factor, c(65L, 65L, 66L, 66L, 65L, 65L))

      # numeric / integer columns without a rule are left alone
      for (col in passthrough_cols) {
        expect_identical(converted[[col]], original[[col]])
      }

      # all-NA columns that are neither numeric nor integer get filled in
      expect_identical(converted[["na_col"]], rep(unseen_logical_code, 6L))
      expect_identical(converted[["na_character_col"]], rep(unseen_category_code, 6L))

      # the rules passed in are handed back unchanged
      expect_identical(result[["rules"]], user_rules)
    }
  })
  
  test_that("gpb.convert_with_rules() should modify data.tables in-place", {
    testDT <- data.table::data.table(
      character_col = c("a", NA_character_, "c")
      , na_col = rep(NA, 3L)
      , na_real_col = rep(NA_real_, 3L)
      , na_int_col = rep(NA_integer_,  3L)
      , na_character_col = rep(NA_character_, 3L)
      , numeric_col = c(1.0, 9.0, NA_real_)
      , factor_col = as.factor(c("n", "n", "y"))
      , integer_col = c(1L, 9L, NA_integer_)
    )
    conversion_result <- gpb.convert_with_rules(testDT)
    resultDT <- conversion_result[["data"]]

    # the caller's object must match what the function returned
    expect_identical(resultDT, testDT)

    # Guard against a vacuous pass: a no-op implementation that returned its
    # input untouched would also satisfy the check above. Confirm that the
    # caller's own data.table now holds the encoded columns.
    expect_identical(class(testDT[["character_col"]]), "integer")
    expect_identical(testDT[["character_col"]], c(1L, 0L, 2L))
    expect_identical(class(testDT[["factor_col"]]), "integer")
    expect_identical(testDT[["factor_col"]], c(1L, 1L, 2L))
  })
  
}

# Try the gpboost package in your browser

# Any scripts or data that you put into this service are public.

# gpboost documentation built on Oct. 24, 2023, 9:09 a.m.