tests/testthat/test-multi_column_transformation.r

context('multi column transformations')

test_that('correctly transforms one column by multiplying by two', {
  iris2 <- iris
  doubler <- multi_column_transformation(function(x) 2 * x)
  doubler(iris2, 'Sepal.Length')
  expect_identical(iris2[[1]], 2 * iris[[1]],
               "multi_column_transformation must double first column of iris2")
})

test_that('correctly creates derivative column that is multiple by two', {
  iris2 <- iris
  doubler <- multi_column_transformation(`*`)
  doubler(iris2, 'Sepal.Length', 'Sepal.Length2', 2)
  expect_identical(iris2[['Sepal.Length2']], 2 * iris[[1]],
    paste0("multi_column_transformation must create new column 'Sepal.Length2'",
           " whose values are double those in the first column of iris2"))
})

test_that('correctly combines two columns into one new derivative column', {
  iris2 <- iris
  adder <- multi_column_transformation(`+`)
  adder(iris2, c('Sepal.Length', 'Sepal.Width'), 'sum')
  expect_equal(iris2[['sum']], with(iris2, Sepal.Length + Sepal.Width),
   info = paste("multi_column_transformation must create new column 'sum' ",
                "w/ values the sum of those in Sepal.Length and Sepal.Width"))
})

test_that('correctly swaps two columns', {
  iris2 <- iris
  swapper <- multi_column_transformation(function(x, y) list(y, x))
  swapper(iris2, c('Sepal.Length', 'Sepal.Width'))
  expect_equal(unname(iris2[c('Sepal.Length', 'Sepal.Width')]),
               unname(iris[c('Sepal.Width', 'Sepal.Length')]),
   info = paste("multi_column_transformation must swap first 2 columns"))
})

test_that('correctly removes columns', {
  iris2 <- iris
  remover <- multi_column_transformation(function(...) NULL)
  remover(iris2, 'Sepal.Length')
  expect_true(!'Sepal.Length' %in% colnames(iris2),
    info = paste("multi_column_transformation must remove first column"))
  remover(iris2, c(2, 3))
  expect_equal(sum(c('Petal.Length', 'Petal.Width') %in% colnames(iris2)), 0,
    info = paste("multi_column_transformation must remove first two columns"))
  expect_equal(length(iris2), 2,
    info = paste("multi_column_transformation must retain 2nd and 5th columns"))
})

test_that('correctly replaces NA', {
  iris2 <- iris
  replace_na <- function(...) {
    args <- list(...)
    val <- args[[length(args)]]
    args <- args[seq_len(length(args) - 1)]
    if (length(args) == 1) { args[[1]][is.na(args[[1]])] <- val; args[[1]] }
    else lapply(args, function(x) { x[is.na(x)] <- val; x })
  }
  iris2[1, ] <- NA
  na_replacer <- multi_column_transformation(replace_na)
  cols <- colnames(iris2)[1:1]
  replace_val <- 10
  na_replacer(iris2, cols, cols, replace_val)
  
  expect_equal(unname(unlist(iris2[1, cols])), rep(replace_val, length(cols)),
    info = paste(
      "multi_column_transformation",
      "na_replacer must replace NAs in first row w/", replace_val))
})

test_that('correctly transforms using numeric column indices', {
  iris2 <- iris
  doubler <- multi_column_transformation(function(x) 2 * x)
  doubler(iris2, 2)
  
  expect_equal(iris2[[2]], 2 * iris[[2]],
     info = paste("multi_column_transformation must be able to reference",
                  "columns using numeric indices",
                  "(e.g., doubler(iris2, 2)"))
})

test_that('correctly transforms using logical column indices', {
  iris2 <- iris
  doubler <- multi_column_transformation(function(x) 2 * x)
  doubler(iris2, 'Sepal.Width' == colnames(iris2))
  
  expect_equal(iris2[[2]], 2 * iris[[2]],
     info = paste("multi_column_transformation must be able to reference",
                  "columns using numeric indices",
                  "(e.g., doubler(iris2, c(F,T,F,F,F))"))
})

test_that('accepts transformation calls with missing arguments', {
  iris2 <- iris[, 1:4]
  scaler <- multi_column_transformation(function(...) {
    args <- list(...); const <- args[[length(args)]]; args <- head(args, -1)
    if (length(args) == 1) args[[1]] * const
    else lapply(args, function(col) col * const)
  })
  scaler(iris2, , , 2)
  expect_equal(iris2, 2 * iris[, 1:4],
    info = "multi_column_transformation must double first column of iris2")
})

test_that('transforms a partial data frame', {
  iris2 <- iris
  swapper <- multi_column_transformation(function(x, y) list(y, x))
  swapper(iris2[c(1, 2)])
  expect_equal(unname(iris2[, 1:2]), unname(iris[, 2:1]),
    info = "multi_column_transformation must swap values of first two columns")
})

test_that('correctly renames a column', {
  iris2 <- iris
  renamer <- multi_column_transformation(function(x) list(NULL, x))
  renamer(iris2, 'Sepal.Length', c('Sepal.Length', 'seplen'))
  
  expect_true('seplen' %in% colnames(iris2),
              !'Sepal.Length' %in% colnames(iris2),
             info = paste("multi_column_transformation must be able to reference",
                          "columns using numeric indices",
                          "(e.g., doubler(iris2, c(F,T,F,F,F))"))
  expect_equal(length(colnames(iris2)), 5,
    info = "multi_column_transformation must remove old column 'Sepal.Length'")
})

# This is technically a benchmark but I have no place to put it yet
test_that('it doubles a column no more than 5x as slow as a raw operation', {
  require(microbenchmark)
  iris2 <- iris
  # Beef up our data.frame
  iris2 <- do.call(rbind.data.frame, replicate(5, iris2))
  iris2 <- do.call(cbind.data.frame, replicate(5, iris2))
  raw_double <- function(dataframe, cols) {
    class(dataframe) <- 'list'
    for(col in cols) dataframe[[col]] <- 2 * dataframe[[col]]
    class(dataframe) <- 'data.frame'
    dataframe
  }
  numeric_cols <- names(which(vapply(iris2, is.numeric, logical(1))))
  doubler <- multi_column_transformation(
    function(...) lapply(list(...), function(x) 2 * x))
  speeds <- summary(microbenchmark(doubler(iris2, numeric_cols),
                                   raw_double(iris2, numeric_cols),
                                   times = 5L))
  multi_column_transformation_runtime <- speeds$median[[1]]
  apply_raw_function_runtime <- speeds$median[[2]]

  # The 3.5 is sort of a magic value here but it is almost always OK.
  expect_true(multi_column_transformation_runtime <
              3.5 * apply_raw_function_runtime,
    paste0("Execution of ",
     "multi_column_transformation took too long: \nFormer took ",
     paste0(multi_column_transformation_runtime, "ms"),
     " but latter took ",
     paste0(apply_raw_function_runtime, "ms"), ".\n",
     "You need to make sure the code for multi_column_transformation\n",
     "stays efficient relative to raw_double (see code for this unit test)"))
})
robertzk/mungebits documentation built on May 27, 2019, 10:35 a.m.