library(cvms)
context("example functions")
## preprocess_functions()
test_that("the expected function definitions are returned in preprocess_functions()", {
testthat::skip("Fails in check - IMPROVE THESE TESTS")
expect_equal(
head(capture.output(preprocess_functions("standardize")), 22),
c(
"function(train_data, test_data, formula, hyperparameters){",
"",
" # Get centering and scaling parameters from the train_data",
" preprocess_params <- caret::preProcess(train_data,",
" method = c(\"scale\", \"center\"))",
"",
" # Apply standardization to all numeric variables in",
" # train_data and test_data",
" train_data <- stats::predict(preprocess_params, train_data)",
" test_data <- stats::predict(preprocess_params, test_data)",
"",
" # Extract parameters and add to tibble",
" tidy_parameters <- tibble::tibble(\"Measure\" = c(\"Mean\", \"SD\")) %>%",
" dplyr::bind_cols(",
" dplyr::bind_rows(preprocess_params$mean,",
" preprocess_params$std)",
" )",
"",
" list(\"train\" = train_data,",
" \"test\" = test_data,",
" \"parameters\" = tidy_parameters)",
" }"
)
)
expect_equal(
head(capture.output(preprocess_functions("normalize")), 22),
c(
"function(train_data, test_data, formula, hyperparameters){",
"",
" # Get normalization parameters from the train_data",
" preprocess_params <- caret::preProcess(train_data,",
" method = c(\"range\"),",
" rangeBounds = c(0,1))",
"",
" # Apply normalization to all numeric variables in",
" # train_data and test_data",
" train_data <- stats::predict(preprocess_params, train_data)",
" test_data <- stats::predict(preprocess_params, test_data)",
"",
" # Extract parameters and add to tibble",
" tidy_parameters <- tibble::tibble(\"Measure\" = c(\"Min\", \"Max\")) %>%",
" dplyr::bind_cols(",
" dplyr::as_tibble(preprocess_params$ranges)",
" )",
"",
" list(\"train\" = train_data,",
" \"test\" = test_data,",
" \"parameters\" = tidy_parameters)", " }"
)
)
})
test_that("the expected output is returned from preprocess_functions() functions", {
# Standardize
# Load data and fold it
xpectr::set_test_seed(1)
partitions <- groupdata2::partition(participant.scores,
p = 0.75,
cat_col = "diagnosis",
id_col = "participant",
list_out = FALSE
) %>%
dplyr::mutate(
diagnosis = as.numeric(diagnosis),
notAVar = 1
) %>%
dplyr::ungroup()
train_set <- partitions[partitions[[".partitions"]] == 1, ]
test_set <- partitions[partitions[[".partitions"]] == 2, ]
# Standardize with preprocess_fn
standardize_fn <- preprocess_functions("standardize")
standardized_by_fn <- standardize_fn(
train_data = train_set, test_data = test_set,
# notAVar is just to avoid changing the unit tests below
# when diagnosis wouldn't be standardized
formula = as.formula("notAVar ~ ."),
hyperparameters = NULL
)
# Standardize manually
train_mean_age <- mean(train_set$age)
train_sd_age <- sd(train_set$age)
train_mean_score <- mean(train_set$score)
train_sd_score <- sd(train_set$score)
train_mean_session <- mean(train_set$session)
train_sd_session <- sd(train_set$session)
train_mean_diagnosis <- mean(train_set$diagnosis)
train_sd_diagnosis <- sd(train_set$diagnosis)
standardized_train <- train_set
standardized_test <- test_set
standardized_train[["age"]] <- (standardized_train[["age"]] - train_mean_age) / train_sd_age
standardized_test[["age"]] <- (standardized_test[["age"]] - train_mean_age) / train_sd_age
standardized_train[["score"]] <- (standardized_train[["score"]] - train_mean_score) / train_sd_score
standardized_test[["score"]] <- (standardized_test[["score"]] - train_mean_score) / train_sd_score
standardized_train[["session"]] <- (standardized_train[["session"]] - train_mean_session) / train_sd_session
standardized_test[["session"]] <- (standardized_test[["session"]] - train_mean_session) / train_sd_session
standardized_train[["diagnosis"]] <- (standardized_train[["diagnosis"]] - train_mean_diagnosis) / train_sd_diagnosis
standardized_test[["diagnosis"]] <- (standardized_test[["diagnosis"]] - train_mean_diagnosis) / train_sd_diagnosis
# Test
expect_equal(standardized_by_fn[["train"]], standardized_train)
expect_equal(standardized_by_fn[["test"]], standardized_test)
expect_equal(
standardized_by_fn[["parameters"]]$Measure,
c("Mean", "SD")
)
expect_equal(standardized_by_fn[["parameters"]]$age,
c(train_mean_age, train_sd_age),
tolerance = 1e-4
)
expect_equal(standardized_by_fn[["parameters"]]$age,
c(29.2857142857143, 7.93185260290972),
tolerance = 1e-4
)
expect_equal(standardized_by_fn[["parameters"]]$diagnosis,
c(0.571428571428571, 0.50709255283711),
tolerance = 1e-4
)
expect_equal(standardized_by_fn[["parameters"]]$diagnosis,
c(train_mean_diagnosis, train_sd_diagnosis),
tolerance = 1e-4
)
expect_equal(standardized_by_fn[["parameters"]]$score,
c(40.2857142857143, 19.4220051929322),
tolerance = 1e-4
)
expect_equal(standardized_by_fn[["parameters"]]$score,
c(train_mean_score, train_sd_score),
tolerance = 1e-4
)
expect_equal(standardized_by_fn[["parameters"]]$session,
c(2, 0.836660026534076),
tolerance = 1e-4
)
expect_equal(standardized_by_fn[["parameters"]]$session,
c(train_mean_session, train_sd_session),
tolerance = 1e-4
)
# Normalize
# Normalize with preprocess_fn
normalize_fn <- preprocess_functions("range")
normalized_by_fn <- normalize_fn(
train_data = train_set, test_data = test_set,
formula = as.formula("notAVar ~ ."),
hyperparameters = NULL
)
# Normalize manually
train_min_age <- min(train_set[["age"]])
train_max_age <- max(train_set[["age"]])
train_min_diagnosis <- min(train_set[["diagnosis"]])
train_max_diagnosis <- max(train_set[["diagnosis"]])
train_min_score <- min(train_set[["score"]])
train_max_score <- max(train_set[["score"]])
train_min_session <- min(train_set[["session"]])
train_max_session <- max(train_set[["session"]])
minMaxScaler <- function(x, min__, max__) {
(x - min__) / (max__ - min__)
}
normalized_train_set <- train_set
normalized_test_set <- test_set
normalized_train_set[["age"]] <- minMaxScaler(
normalized_train_set[["age"]],
train_min_age, train_max_age
)
normalized_train_set[["diagnosis"]] <- minMaxScaler(
normalized_train_set[["diagnosis"]],
train_min_diagnosis, train_max_diagnosis
)
normalized_train_set[["score"]] <- minMaxScaler(
normalized_train_set[["score"]],
train_min_score, train_max_score
)
normalized_train_set[["session"]] <- minMaxScaler(
normalized_train_set[["session"]],
train_min_session, train_max_session
)
normalized_test_set[["age"]] <- minMaxScaler(
normalized_test_set[["age"]],
train_min_age, train_max_age
)
normalized_test_set[["diagnosis"]] <- minMaxScaler(
normalized_test_set[["diagnosis"]],
train_min_diagnosis, train_max_diagnosis
)
normalized_test_set[["score"]] <- minMaxScaler(
normalized_test_set[["score"]],
train_min_score, train_max_score
)
normalized_test_set[["session"]] <- minMaxScaler(
normalized_test_set[["session"]],
train_min_session, train_max_session
)
# Test
expect_identical(normalized_by_fn[["train"]], normalized_train_set)
expect_identical(normalized_by_fn[["test"]], normalized_test_set)
expect_equal(
normalized_by_fn[["parameters"]]$Measure,
c("Min", "Max")
)
expect_equal(normalized_by_fn[["parameters"]]$age,
c(20, 43),
tolerance = 1e-4
)
expect_equal(normalized_by_fn[["parameters"]]$diagnosis,
c(0, 1),
tolerance = 1e-4
)
expect_equal(normalized_by_fn[["parameters"]]$score,
c(10, 81),
tolerance = 1e-4
)
expect_equal(normalized_by_fn[["parameters"]]$session,
c(1, 3),
tolerance = 1e-4
)
})
# TODO test the other example function functions
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.