tests/testthat/test-standardize-data.R

# standardize.numeric -----------------------------------------------------
test_that("standardize.numeric", {
  x <- standardize(seq(0, 1, length.out = 100))
  expect_equal(mean(x), 0, tolerance = 0.01)

  x <- standardize(seq(0, 1, length.out = 100), two_sd = TRUE)
  expect_equal(sd(x), 0.5, tolerance = 0.01)

  x <- standardize(seq(0, 1, length.out = 100), robust = TRUE)
  expect_equal(median(x), 0, tolerance = 0.01)

  x <- standardize(seq(0, 1, length.out = 100), robust = TRUE, two_sd = TRUE)
  expect_equal(mad(x), 0.5, tolerance = 0.01)

  expect_message(standardize(c(0, 0, 0, 1, 1)))

  x <- standardize(c(-1, 0, 1), reference = seq(3, 4, length.out = 100))
  expect_equal(mean(x), -11.943, tolerance = 0.01)
})


# standardize factor / Date -----------------------------------------------
test_that("standardize.numeric", {
  f <- factor(c("c", "a", "b"))
  expect_identical(standardize(f), f)
  expect_equal(standardize(f, force = TRUE), c(1, -1, 0), ignore_attr = TRUE)

  d <- as.Date(c("1989/08/06", "1989/08/04", "1989/08/05"))
  expect_identical(standardize(d), d)
  expect_equal(standardize(d, force = TRUE), c(1, -1, 0), ignore_attr = TRUE)
})


# standardize.data.frame --------------------------------------------------

test_that("standardize.data.frame", {
  skip_if_not_installed("poorman")

  data(iris)
  x <- standardize(iris)
  expect_equal(mean(x$Sepal.Length), 0, tolerance = 0.01)
  expect_length(levels(x$Species), 3)
  expect_equal(mean(subset(x, Species == "virginica")$Sepal.Length), 0.90, tolerance = 0.01)

  # check class attributes
  expect_identical(
    vapply(x, class, character(1)),
    c(
      Sepal.Length = "numeric", Sepal.Width = "numeric", Petal.Length = "numeric",
      Petal.Width = "numeric", Species = "factor"
    )
  )

  x2 <- standardize(x = iris[1, ], reference = iris)
  expect_true(all(x2[1, ] == x[1, ]))


  x <- standardize(poorman::group_by(iris, Species))
  expect_equal(mean(x$Sepal.Length), 0, tolerance = 0.01)
  expect_length(levels(x$Species), 3)
  expect_equal(mean(subset(x, Species == "virginica")$Sepal.Length), 0, tolerance = 0.01)
})


test_that("standardize.data.frame, NAs", {
  skip_if_not_installed("poorman")

  data(iris)
  iris$Sepal.Width[c(148, 65, 33, 58, 54, 93, 114, 72, 32, 23)] <- NA
  iris$Sepal.Length[c(11, 30, 141, 146, 13, 149, 6, 8, 48, 101)] <- NA

  x <- standardize(iris)
  expect_equal(head(x$Sepal.Length), c(-0.9163, -1.1588, -1.4013, -1.5226, -1.0376, NA), tolerance = 0.01)
  expect_equal(head(x$Sepal.Width), c(1.0237, -0.151, 0.3189, 0.0839, 1.2586, 1.9635), tolerance = 0.01)
  expect_identical(mean(x$Sepal.Length), NA_real_)

  x <- standardize(iris, two_sd = TRUE)
  expect_equal(head(x$Sepal.Length), c(-0.4603, -0.5811, -0.7019, -0.7623, -0.5207, NA), tolerance = 0.01)
  expect_equal(head(x$Sepal.Width), c(0.5118, -0.0755, 0.1594, 0.042, 0.6293, 0.9817), tolerance = 0.01)
  expect_identical(mean(x$Sepal.Length), NA_real_)


  x <- standardize(poorman::group_by(iris, .data$Species))
  expect_equal(head(x$Sepal.Length), c(0.2547, -0.3057, -0.8661, -1.1463, -0.0255, NA), tolerance = 0.01)
  expect_equal(head(x$Sepal.Width), c(0.2369, -1.0887, -0.5584, -0.8235, 0.502, 1.2974), tolerance = 0.01)
  expect_identical(mean(x$Sepal.Length), NA_real_)
})


test_that("standardize.data.frame, apend", {
  skip_if_not_installed("poorman")

  data(iris)
  iris$Sepal.Width[c(26, 43, 56, 11, 66, 132, 23, 133, 131, 28)] <- NA
  iris$Sepal.Length[c(32, 12, 109, 92, 119, 49, 83, 113, 64, 30)] <- NA

  x <- standardize(iris, append = TRUE)
  expect_identical(colnames(x), c(
    "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width",
    "Species", "Sepal.Length_z", "Sepal.Width_z", "Petal.Length_z",
    "Petal.Width_z"
  ))
  expect_equal(head(x$Sepal.Length_z), c(-0.8953, -1.1385, -1.3816, -1.5032, -1.0169, -0.5306), tolerance = 0.01)
  expect_equal(head(x$Sepal.Width_z), c(1.04, -0.1029, 0.3543, 0.1257, 1.2685, 1.9542), tolerance = 0.01)
  expect_identical(mean(x$Sepal.Length_z), NA_real_)

  x <- standardize(iris, two_sd = TRUE, append = TRUE)
  expect_equal(head(x$Sepal.Length_z), c(-0.4477, -0.5692, -0.6908, -0.7516, -0.5084, -0.2653), tolerance = 0.01)
  expect_equal(head(x$Sepal.Width_z), c(0.52, -0.0514, 0.1771, 0.0629, 0.6343, 0.9771), tolerance = 0.01)
  expect_identical(mean(x$Sepal.Length_z), NA_real_)


  x <- standardize(poorman::group_by(iris, .data$Species), append = TRUE)
  expect_equal(head(x$Sepal.Length_z), c(0.2746, -0.2868, -0.8483, -1.129, -0.0061, 1.1168), tolerance = 0.01)
  expect_equal(head(x$Sepal.Width_z), c(0.1766, -1.1051, -0.5924, -0.8487, 0.4329, 1.2019), tolerance = 0.01)
  expect_identical(mean(x$Sepal.Length_z), NA_real_)
})


test_that("standardize.data.frame, weights", {
  skip_if_not_installed("poorman")

  x <- rexp(30)
  w <- rpois(30, 20) + 1

  expect_equal(
    sqrt(cov.wt(cbind(x, x), w)$cov[1, 1]),
    attr(standardize(x, weights = w), "scale"),
    tolerance = 1e-4
  )
  expect_equal(
    standardize(x, weights = w),
    standardize(data.frame(x), weights = w)$x,
    tolerance = 1e-4,
    ignore_attr = TRUE
  )

  # name and vector give same results
  expect_equal(
    standardize(mtcars, exclude = "cyl", weights = mtcars$cyl),
    standardize(mtcars, weights = "cyl"),
    tolerance = 1e-4
  )

  d <- poorman::group_by(mtcars, am)
  expect_warning(standardize(d, weights = d$cyl))
})


# Unstandardize -----------------------------------------------------------
test_that("unstandardize, numeric", {
  data(iris)
  x <- standardize(iris$Petal.Length)
  rez <- unstandardize(x)
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)

  rez <- unstandardize(x, reference = iris$Petal.Length)
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)

  rez <- unstandardize(x, center = mean(iris$Petal.Length), scale = stats::sd(iris$Petal.Length))
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)

  rez <- unstandardize(0, center = mean(iris$Petal.Length), scale = stats::sd(iris$Petal.Length))
  expect_equal(rez, mean(iris$Petal.Length), tolerance = 1e-3)

  x <- standardize(iris$Petal.Length, robust = TRUE, two_sd = TRUE)
  rez <- unstandardize(x, robust = TRUE, two_sd = TRUE)
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)

  x <- scale(iris$Petal.Length)
  rez <- unstandardize(x)
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)

  x <- scale(iris$Petal.Length, center = 3, scale = 2)
  rez <- unstandardize(x)
  expect_equal(rez, iris$Petal.Length, tolerance = 1e-3, ignore_attr = TRUE)
})

test_that("unstandardize, data frame", {
  skip_if_not_installed("poorman")

  data(iris)
  x <- standardize(iris)
  rez <- unstandardize(x)
  expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE)

  x <- standardize(iris, select = "Petal.Length")
  rez <- unstandardize(x)
  expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE)

  x <- standardize(iris, select = starts_with("Pet"))
  rez <- unstandardize(x, select = starts_with("Pet"))
  expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE)

  x <- standardize(iris, select = "Petal.Length")
  rez <- unstandardize(x,
    center = c(Petal.Length = mean(iris$Petal.Length)),
    scale = c(Petal.Length = stats::sd(iris$Petal.Length))
  )
  expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE)

  expect_error(unstandardize(x,
    center = mean(iris$Petal.Length),
    scale = stats::sd(iris$Petal.Length)
  ))

  x <- standardize(iris)
  rez <- unstandardize(x, center = rep(0, 4), scale = rep(1, 4))
  expect_equal(rez, x, tolerance = 0.1, ignore_attr = TRUE)

  data(iris)
  x <- standardize(iris, robust = TRUE, two_sd = TRUE)
  rez <- unstandardize(x, robust = TRUE, two_sd = TRUE)
  expect_equal(rez, iris, tolerance = 0.1, ignore_attr = TRUE)
})

test_that("un/standardize, matrix", {
  set.seed(4)
  x <- matrix(sample(8), nrow = 4)
  colnames(x) <- letters[1:2]
  rownames(x) <- LETTERS[1:4]

  z1 <- standardize(x)
  z2 <- scale(x)

  expect_equal(z1, z2, ignore_attr = TRUE)
  expect_equal(unstandardize(z1), x, ignore_attr = TRUE)
  expect_identical(unstandardize(z2), unstandardize(z1))
})

test_that("unstandardize with reference (data frame)", {
  x <- standardize(x = iris, reference = iris)
  x2 <- unstandardize(x, reference = iris)
  expect_equal(x2, iris, ignore_attr = TRUE)

  x <- standardize(x = iris, reference = iris, robust = TRUE)
  x2 <- unstandardize(x, reference = iris, robust = TRUE)
  expect_equal(x2, iris, ignore_attr = TRUE)
})

test_that("unstandardize does nothing with characters and factors", {
  expect_identical(
    unstandardise(c("a", "b")),
    c("a", "b")
  )
  expect_identical(
    unstandardise(factor(c(1, 2))),
    factor(c(1, 2))
  )
})

# select helpers ------------------------------
test_that("standardize regex", {
  expect_identical(
    standardize(mtcars, select = "pg", regex = TRUE),
    standardize(mtcars, select = "mpg")
  )
})

# standardize when only providing one of center/scale ---------------
test_that("standardize when only providing one of center/scale", {
  x <- 1:10
  expect_identical(
    as.vector(datawizard::standardize(x, center = FALSE)),
    x / sd(x)
  )
  expect_identical(
    as.vector(datawizard::standardize(x, center = 2)),
    (x - 2) / sd(x)
  )
  expect_identical(
    as.vector(datawizard::standardize(x, scale = FALSE)),
    as.vector(datawizard::center(x))
  )
  expect_identical(
    as.vector(datawizard::standardize(x, scale = 1.5)),
    (x - mean(x)) / 1.5
  )
})


# grouped data

test_that("unstandardize: grouped data", {
  skip_if_not_installed("poorman")

  # 1 group, 1 standardized var
  stand <- poorman::group_by(mtcars, cyl)
  stand <- standardize(stand, "mpg")
  unstand <- unstandardize(stand, select = "mpg")
  expect_identical(
    poorman::ungroup(unstand),
    mtcars,
    ignore_attr = TRUE
  )

  expect_s3_class(unstand, "grouped_df")

  # 2 groups, 1 standardized var
  set.seed(123)
  test <- iris
  test$grp <- sample(c("A", "B"), nrow(test), replace = TRUE)
  stand <- poorman::group_by(test, Species, grp)
  stand <- standardize(stand, "Sepal.Length")
  expect_identical(
    poorman::ungroup(unstandardize(stand, select = "Sepal.Length")),
    test
  )

  # 2 groups, 2 standardized vars
  set.seed(123)
  test <- iris
  test$grp <- sample(c("A", "B"), nrow(test), replace = TRUE)
  stand <- poorman::group_by(test, Species, grp)
  stand <- standardize(stand, c("Sepal.Length", "Petal.Length"))
  expect_identical(
    poorman::ungroup(unstandardize(stand, select = c("Sepal.Length", "Petal.Length"))),
    test
  )

  expect_s3_class(unstand, "grouped_df")

  # can't recover attributes
  stand <- poorman::group_by(iris, Species)
  stand <- standardize(stand, "Sepal.Length")
  attr(stand, "groups") <- NULL

  expect_error(
    unstandardize(stand, "Sepal.Length"),
    regexp = "Couldn't retrieve the necessary information"
  )

  # normalize applied on grouped data but unstandardize applied on ungrouped data
  stand <- poorman::group_by(mtcars, cyl)
  stand <- standardize(stand, "mpg")
  stand <- poorman::ungroup(stand)

  expect_error(
    unstandardize(stand, "mpg"),
    regexp = "must provide the arguments"
  )

  # standardize applied on grouped data but unstandardize applied different grouped
  # data
  stand <- poorman::group_by(stand, am)
  expect_error(
    unstandardize(stand, "mpg"),
    regexp = "Couldn't retrieve the necessary"
  )
})

Try the datawizard package in your browser

Any scripts or data that you put into this service are public.

datawizard documentation built on Sept. 15, 2023, 9:06 a.m.