# Copyright 2025 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
test_that("createNormalizer works", {
normalizer <- createNormalizer(type = "minmax")
expect_equal(normalizer$type, "minmax")
expect_equal(attr(normalizer, "fun"), "minMaxNormalize")
expect_s3_class(normalizer, "featureEngineeringSettings")
normalizer <- createNormalizer(type = "robust", settings = list(clip = TRUE))
expect_equal(normalizer$type, "robust")
expect_error(createNormalizer(type = "mean"))
expect_error(createNormalizer(type = "median"))
expect_error(createNormalizer(type = "zscore"))
expect_error(createNormalizer(type = "none"))
expect_error(createNormalizer(
type = "robust",
settings = list(clip = 3)
))
})
test_that("normalization works", {
skip_if_not_installed("RSQLite") # until Andromeda 1.0.0 is released
addFeature <- function(data, covariateId, minValue,
maxValue, outliers = FALSE) {
data$covariateData <- Andromeda::copyAndromeda(data$covariateData)
nSubjects <- nrow(data$labels)
Andromeda::appendToTable(
data$covariateData$covariates,
data.frame(
rowId = data$labels$rowId,
covariateId = rep(covariateId, nSubjects),
covariateValue = runif(nSubjects, minValue, maxValue)
)
)
if (outliers) {
nOutliers <- floor(nSubjects / 10)
outlierRows <- sample(data$labels$rowId, nOutliers)
outlierValues <- runif(nOutliers, minValue * 10, maxValue * 10)
outlierData <- data.frame(
rowId = outlierRows,
covariateId = covariateId,
covariateValue = outlierValues
)
data$covariateData$covariates <- data$covariateData$covariates %>%
dplyr::filter(!(.data$rowId %in% outlierRows &
.data$covariateId == covariateId))
Andromeda::appendToTable(
data$covariateData$covariates,
outlierData
)
}
Andromeda::appendToTable(
data$covariateData$covariateRef,
data.frame(
covariateId = covariateId,
covariateName = "testCovariate",
analysisId = 999,
conceptId = 1
)
)
Andromeda::appendToTable(
data$covariateData$analysisRef,
data.frame(
analysisId = 999,
analysisName = "testAnalysis",
domainId = "testDomain",
startDay = 0,
endDay = 0,
isBinary = "N",
missingMeansZero = "N"
)
)
return(data)
}
testNormalizer <- function(data, testData, normalizer, outliers = FALSE) {
newData <- addFeature(data, 12101, -10, 10, outliers = outliers)
newTestData <- addFeature(testData, 12101, -10, 10, outliers = outliers)
if (normalizer$type == "minmax") {
normalizeFn <- minMaxNormalize
manualFn <- function(data, testData) {
testFeature <- testData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 12101) %>%
dplyr::pull(.data$covariateValue)
trainMin <- min(data$covariateData$covariates %>% dplyr::filter(.data$covariateId == 12101) %>% dplyr::pull(.data$covariateValue))
trainMax <- max(data$covariateData$covariates %>% dplyr::filter(.data$covariateId == 12101) %>% dplyr::pull(.data$covariateValue))
testNormFeature <- (testFeature - trainMin) / (trainMax - trainMin)
return(testNormFeature)
}
minLimit <- 0
maxLimit <- 1
} else if (normalizer$type == "robust") {
normalizeFn <- robustNormalize
manualFn <- function(data, testData) {
testFeature <- testData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 12101) %>%
dplyr::pull(.data$covariateValue)
trainMedian <- median(data$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 12101) %>%
dplyr::pull(.data$covariateValue))
if (inherits(data$covariateData, "SQLiteConnection")) {
trainIQR <- IQR(data$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 12101) %>%
dplyr::pull(.data$covariateValue), type = 1)
} else {
trainIQR <- IQR(data$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 12101) %>%
dplyr::pull(.data$covariateValue))
}
testNormFeature <- (testFeature - trainMedian) / trainIQR
if (clip) {
testNormFeature <- testNormFeature / sqrt(1 + (testNormFeature / 3)^2)
}
return(testNormFeature)
}
if (clip) {
minLimit <- -3
maxLimit <- 3
} else {
minLimit <- -100
maxLimit <- 100
}
} else {
stop("Unknown normalizer type")
}
trainNormalizedData <- normalizeFn(newData, normalizer)
metaData <- attr(trainNormalizedData$covariateData, "metaData")
if (normalizer$type == "minmax") {
testSettings <- metaData$featureEngineering$minMaxNormalize$settings$featureEngineeringSettings
} else if (normalizer$type == "robust") {
testSettings <- metaData$featureEngineering$robustNormalize$settings$featureEngineeringSettings
}
testNormalizedData <- normalizeFn(newTestData, testSettings, done = TRUE)
trainFeature <- trainNormalizedData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 12101) %>%
dplyr::pull(.data$covariateValue)
expect_true(all(trainFeature >= minLimit) && all(trainFeature <= maxLimit))
newTestFeature <- testNormalizedData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 12101) %>%
dplyr::pull(.data$covariateValue)
testNormFeature <- manualFn(newData, newTestData)
expect_equal(newTestFeature, testNormFeature)
}
normalizer <- createNormalizer(type = "minmax")
testNormalizer(tinyTrainData, testData, normalizer)
normalizer <- createNormalizer(type = "robust", settings = list(clip = TRUE))
clip <- TRUE
testNormalizer(tinyTrainData, testData, normalizer, outliers = TRUE)
normalizer <- createNormalizer(type = "robust", settings = list(clip = FALSE))
clip <- FALSE
testNormalizer(tinyTrainData, testData, normalizer, outliers = TRUE)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.