# add a test numerical feature with missing values of certain percentage
createMissingData <- function(trainData, missingness) {
missingData <- list(
folds = trainData$folds,
labels = trainData$labels
)
missingData$covariateData <- Andromeda::copyAndromeda(trainData$covariateData)
rowIds <- missingData$labels$rowId
nData <- floor(length(rowIds) * (1 - missingness))
covariateId <- rep(666, nData)
withr::with_seed(
1234,
covariateValue <- runif(n = nData)
)
Andromeda::appendToTable(
missingData$covariateData$covariates,
data.frame(
rowId = rowIds[1:nData],
covariateId = covariateId,
covariateValue = covariateValue
)
)
Andromeda::appendToTable(
(missingData$covariateData$covariateRef),
data.frame(
covariateId = 666,
covariateName = "fakeMissingVariable",
analysisId = 666,
conceptId = 666
)
)
Andromeda::appendToTable(
missingData$covariateData$analysisRef,
data.frame(
analysisId = 666,
analysisName = "missing",
domainId = "missing",
startDay = NA,
endDay = NA,
isBinary = "N",
missingMeansZero = "N"
)
)
missingData
}
test_that("createSimpleImputer works", {
imputer <- createSimpleImputer()
expect_equal(imputer$method, "mean")
expect_equal(imputer$missingThreshold, 0.3)
expect_equal(attr(imputer, "fun"), "simpleImpute")
expect_s3_class(imputer, "featureEngineeringSettings")
imputer <- createSimpleImputer(
method = "median",
missingThreshold = 0.5
)
expect_equal(imputer$method, "median")
expect_equal(imputer$missingThreshold, 0.5)
expect_s3_class(imputer, "featureEngineeringSettings")
expect_s3_class(imputer, "featureEngineeringSettings")
expect_error(createSimpleImputer(method = "mean", missingThreshold = -1))
expect_error(createSimpleImputer(method = "mean", missingThreshold = "0.5"))
expect_error(createSimpleImputer(method = "mean", missingThreshold = 1))
expect_error(createSimpleImputer(method = "notMean"))
})
test_that("createIterativeImputer works", {
skip_if_not_installed("glmnet")
imputer <- createIterativeImputer()
expect_equal(imputer$method, "pmm")
expect_error(createIterativeImputer(method = "notPmm"))
expect_equal(attr(imputer, "fun"), "iterativeImpute")
expect_s3_class(imputer, "featureEngineeringSettings")
expect_error(createIterativeImputer(method = "pmm", missingThreshold = -1))
expect_error(createIterativeImputer(method = "pmm", missingThreshold = "0.5"))
expect_error(createIterativeImputer(method = "pmm", missingThreshold = 1))
imputer <- createIterativeImputer(
method = "pmm",
missingThreshold = 0.5
)
expect_equal(imputer$missingThreshold, 0.5)
expect_s3_class(imputer, "featureEngineeringSettings")
})
test_that("simpleImpute works", {
skip_if_offline()
missingData <- createMissingData(tinyTrainData, 0.2)
imputer <- createSimpleImputer(method = "mean", missingThreshold = 0.3)
imputedData <- simpleImpute(missingData, imputer, done = FALSE)
newFeature <- imputedData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 666) %>%
dplyr::pull(.data$covariateValue)
originalFeature <- missingData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 666)
imputedFeature <- imputedData$covariateData$covariates %>%
dplyr::filter(
.data$covariateId == 666,
!.data$rowId %in% !!(originalFeature %>%
dplyr::pull(.data$rowId))
) %>%
dplyr::pull(.data$covariateValue)
originalFeature <- originalFeature %>%
dplyr::pull(.data$covariateValue)
expect_true(length(newFeature) > length(originalFeature))
expect_equal(length(newFeature), nrow(imputedData$labels))
expect_equal(mean(originalFeature), unique(imputedFeature))
missingTestData <- createMissingData(testData, 0.4)
# extract featureEngineeringSettings from imputedData
metaData <- attr(imputedData$covariateData, "metaData")
testSettings <- metaData$featureEngineering$simpleImputer$settings$featureEngineeringSettings
imputedTestData <- simpleImpute(missingTestData, testSettings, done = TRUE)
newFeatureTest <- imputedTestData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 666) %>%
dplyr::pull(.data$covariateValue)
originalFeatureTest <- missingTestData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 666)
imputedFeatureTest <- imputedTestData$covariateData$covariates %>%
dplyr::filter(
.data$covariateId == 666,
!.data$rowId %in% !!(originalFeatureTest %>%
dplyr::pull(.data$rowId))
) %>%
dplyr::pull(.data$covariateValue)
originalFeatureTest <- originalFeatureTest %>%
dplyr::pull(.data$covariateValue)
expect_true(length(newFeatureTest) > length(originalFeatureTest))
expect_equal(length(newFeatureTest), nrow(imputedTestData$labels))
# should use mean from training data
expect_equal(mean(originalFeature), unique(imputedFeatureTest))
imputer <- createSimpleImputer(method = "median", missingThreshold = 0.3)
imputedData <- simpleImpute(missingData, imputer, done = FALSE)
newFeature <- imputedData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 666) %>%
dplyr::pull(.data$covariateValue)
originalFeature <- missingData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 666)
imputedFeature <- imputedData$covariateData$covariates %>%
dplyr::filter(
.data$covariateId == 666,
!.data$rowId %in% !!(originalFeature %>%
dplyr::pull(.data$rowId))
) %>%
dplyr::pull(.data$covariateValue)
originalFeature <- originalFeature %>%
dplyr::pull(.data$covariateValue)
expect_true(length(newFeature) > length(originalFeature))
expect_equal(length(newFeature), nrow(imputedData$labels))
expect_equal(median(originalFeature), unique(imputedFeature))
imputer <- createSimpleImputer(method = "mean", missingThreshold = 0.1)
imputedData <- simpleImpute(missingData, imputer, done = FALSE)
newFeature <- imputedData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 666) %>%
dplyr::pull(.data$covariateValue)
expect_true(length(newFeature) == 0)
})
test_that("IterativeImputer works", {
skip_if_offline()
skip_if_not_installed("glmnet")
missingData <- createMissingData(tinyTrainData, 0.2)
imputer <- createIterativeImputer(
method = "pmm", missingThreshold = 0.3,
methodSettings = list(
pmm = list(
k = 1,
iterations = 1
))
)
imputedData <- iterativeImpute(missingData, imputer, done = FALSE)
newFeature <- imputedData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 666) %>%
dplyr::pull(.data$covariateValue)
originalFeature <- missingData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 666)
imputedFeature <- imputedData$covariateData$covariates %>%
dplyr::filter(
.data$covariateId == 666,
!.data$rowId %in% !!(originalFeature %>%
dplyr::pull(.data$rowId))
) %>%
dplyr::pull(.data$covariateValue)
originalFeature <- originalFeature %>%
dplyr::pull(.data$covariateValue)
expect_true(length(newFeature) > length(originalFeature))
expect_equal(length(newFeature), nrow(imputedData$labels))
missingTestData <- createMissingData(testData, 0.4)
# extract featureEngineeringSettings from imputedData
metaData <- attr(imputedData$covariateData, "metaData")
testSettings <- metaData$featureEngineering$iterativeImputer$settings$featureEngineeringSettings
imputedTestData <- iterativeImpute(missingTestData, testSettings, done = TRUE)
newFeatureTest <- imputedTestData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 666) %>%
dplyr::pull(.data$covariateValue)
originalFeatureTest <- missingTestData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 666)
imputedFeatureTest <- imputedTestData$covariateData$covariates %>%
dplyr::filter(
.data$covariateId == 666,
!.data$rowId %in% !!(originalFeatureTest %>%
dplyr::pull(.data$rowId))
) %>%
dplyr::pull(.data$covariateValue)
originalFeatureTest <- originalFeatureTest %>%
dplyr::pull(.data$covariateValue)
expect_true(length(newFeatureTest) > length(originalFeatureTest))
expect_equal(length(newFeatureTest), nrow(imputedTestData$labels))
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.