tests/testthat/test-normalizers.R

# Copyright 2025 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

test_that("createNormalizer works", {
  normalizer <- createNormalizer(type = "minmax")
  expect_equal(normalizer$type, "minmax")
  expect_equal(attr(normalizer, "fun"), "minMaxNormalize")
  expect_s3_class(normalizer, "featureEngineeringSettings")

  normalizer <- createNormalizer(type = "robust", settings = list(clip = TRUE))
  expect_equal(normalizer$type, "robust")

  expect_error(createNormalizer(type = "mean"))
  expect_error(createNormalizer(type = "median"))
  expect_error(createNormalizer(type = "zscore"))
  expect_error(createNormalizer(type = "none"))
  expect_error(createNormalizer(
    type = "robust",
    settings = list(clip = 3)
  ))
})

test_that("normalization works", {
  skip_if_not_installed("RSQLite") # until Andromeda 1.0.0 is released
  addFeature <- function(data, covariateId, minValue,
                         maxValue, outliers = FALSE) {
    data$covariateData <- Andromeda::copyAndromeda(data$covariateData)
    nSubjects <- nrow(data$labels)
    Andromeda::appendToTable(
      data$covariateData$covariates,
      data.frame(
        rowId = data$labels$rowId,
        covariateId = rep(covariateId, nSubjects),
        covariateValue = runif(nSubjects, minValue, maxValue)
      )
    )
    if (outliers) {
      nOutliers <- floor(nSubjects / 10)
      outlierRows <- sample(data$labels$rowId, nOutliers)
      outlierValues <- runif(nOutliers, minValue * 10, maxValue * 10)
      outlierData <- data.frame(
        rowId = outlierRows,
        covariateId = covariateId,
        covariateValue = outlierValues
      )
      data$covariateData$covariates <- data$covariateData$covariates %>%
        dplyr::filter(!(.data$rowId %in% outlierRows &
          .data$covariateId == covariateId))
      Andromeda::appendToTable(
        data$covariateData$covariates,
        outlierData
      )
    }

    Andromeda::appendToTable(
      data$covariateData$covariateRef,
      data.frame(
        covariateId = covariateId,
        covariateName = "testCovariate",
        analysisId = 999,
        conceptId = 1
      )
    )
    Andromeda::appendToTable(
      data$covariateData$analysisRef,
      data.frame(
        analysisId = 999,
        analysisName = "testAnalysis",
        domainId = "testDomain",
        startDay = 0,
        endDay = 0,
        isBinary = "N",
        missingMeansZero = "N"
      )
    )
    return(data)
  }

  testNormalizer <- function(data, testData, normalizer, outliers = FALSE) {
    newData <- addFeature(data, 12101, -10, 10, outliers = outliers)
    newTestData <- addFeature(testData, 12101, -10, 10, outliers = outliers)
    if (normalizer$type == "minmax") {
      normalizeFn <- minMaxNormalize
      manualFn <- function(data, testData) {
        testFeature <- testData$covariateData$covariates %>%
          dplyr::filter(.data$covariateId == 12101) %>%
          dplyr::pull(.data$covariateValue)
        trainMin <- min(data$covariateData$covariates %>% dplyr::filter(.data$covariateId == 12101) %>% dplyr::pull(.data$covariateValue))
        trainMax <- max(data$covariateData$covariates %>% dplyr::filter(.data$covariateId == 12101) %>% dplyr::pull(.data$covariateValue))
        testNormFeature <- (testFeature - trainMin) / (trainMax - trainMin)
        return(testNormFeature)
      }
      minLimit <- 0
      maxLimit <- 1
    } else if (normalizer$type == "robust") {
      normalizeFn <- robustNormalize
      manualFn <- function(data, testData) {
        testFeature <- testData$covariateData$covariates %>%
          dplyr::filter(.data$covariateId == 12101) %>%
          dplyr::pull(.data$covariateValue)
        trainMedian <- median(data$covariateData$covariates %>%
          dplyr::filter(.data$covariateId == 12101) %>%
          dplyr::pull(.data$covariateValue))
        if (inherits(data$covariateData, "SQLiteConnection")) {
          trainIQR <- IQR(data$covariateData$covariates %>%
            dplyr::filter(.data$covariateId == 12101) %>%
            dplyr::pull(.data$covariateValue), type = 1)
        } else {
          trainIQR <- IQR(data$covariateData$covariates %>%
            dplyr::filter(.data$covariateId == 12101) %>%
            dplyr::pull(.data$covariateValue))
        }
        testNormFeature <- (testFeature - trainMedian) / trainIQR
        if (clip) {
          testNormFeature <- testNormFeature / sqrt(1 + (testNormFeature / 3)^2)
        }
        return(testNormFeature)
      }
      if (clip) {
        minLimit <- -3
        maxLimit <- 3
      } else {
        minLimit <- -100
        maxLimit <- 100
      }
    } else {
      stop("Unknown normalizer type")
    }
    trainNormalizedData <- normalizeFn(newData, normalizer)
    metaData <- attr(trainNormalizedData$covariateData, "metaData")
    if (normalizer$type == "minmax") {
      testSettings <- metaData$featureEngineering$minMaxNormalize$settings$featureEngineeringSettings
    } else if (normalizer$type == "robust") {
      testSettings <- metaData$featureEngineering$robustNormalize$settings$featureEngineeringSettings
    }
    testNormalizedData <- normalizeFn(newTestData, testSettings, done = TRUE)

    trainFeature <- trainNormalizedData$covariateData$covariates %>%
      dplyr::filter(.data$covariateId == 12101) %>%
      dplyr::pull(.data$covariateValue)
    expect_true(all(trainFeature >= minLimit) && all(trainFeature <= maxLimit))
    newTestFeature <- testNormalizedData$covariateData$covariates %>%
      dplyr::filter(.data$covariateId == 12101) %>%
      dplyr::pull(.data$covariateValue)
    testNormFeature <- manualFn(newData, newTestData)
    expect_equal(newTestFeature, testNormFeature)
  }
  normalizer <- createNormalizer(type = "minmax")
  testNormalizer(tinyTrainData, testData, normalizer)

  normalizer <- createNormalizer(type = "robust", settings = list(clip = TRUE))
  clip <- TRUE
  testNormalizer(tinyTrainData, testData, normalizer, outliers = TRUE)

  normalizer <- createNormalizer(type = "robust", settings = list(clip = FALSE))
  clip <- FALSE
  testNormalizer(tinyTrainData, testData, normalizer, outliers = TRUE)
})
OHDSI/PatientLevelPrediction documentation built on Feb. 14, 2025, 9:44 a.m.