# @file test_DataSplitting.R
# Copyright 2025 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# make sure the population covers all people in plpData
if (internet) {
  populationT <- plpData$cohorts
  populationT$outcomeCount <- sample(c(0, 1), nrow(populationT), replace = TRUE)
  attr(populationT, "metaData")$outcomeId <- outcomeId
  attr(populationT, "metaData")$populationSettings <- list(madeup = TRUE)
  attr(populationT, "metaData")$restrictPlpDataSettings <- list(madeup = TRUE)
  attr(populationT, "metaData")$attrition <- c(1, 2, 3)
}
# check correct inputs
testFraction1 <- sample(9, 1) / 10
trainFraction1 <- 1 - testFraction1
splitSeed1 <- sample(100000, 1)
nfold1 <- 1 + sample(10, 1)
type1 <- sample(c("stratified", "time", "subject"), 1)

defaultSetting <- function(
    testFraction = testFraction1,
    trainFraction = trainFraction1,
    splitSeed = splitSeed1,
    nfold = nfold1,
    type = type1) {
  result <- createDefaultSplitSetting(
    testFraction = testFraction,
    trainFraction = trainFraction,
    splitSeed = splitSeed,
    nfold = nfold,
    type = type
  )
  return(result)
}
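
# A minimal usage sketch of the wrapper above (kept in an `if (FALSE)` block so
# it never runs); the argument values are arbitrary illustrations rather than
# the randomised ones used by the tests. The wrapper simply forwards its
# arguments to createDefaultSplitSetting().
if (FALSE) {
  exampleSettings <- defaultSetting(
    testFraction = 0.25,
    trainFraction = 0.75,
    splitSeed = 42,
    nfold = 3,
    type = "stratified"
  )
}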


test_that("createDefaultSplitSetting", {
  splitSettings <- defaultSetting()

  expect_s3_class(splitSettings, "splitSettings")

  expectFun <- "randomSplitter"
  if (type1 == "time") {
    expectFun <- "timeSplitter"
  }
  if (type1 == "subject") {
    expectFun <- "subjectSplitter"
  }

  expect_equal(attr(splitSettings, "fun"), expectFun)

  expect_equal(splitSettings$test, testFraction1)
  expect_equal(splitSettings$train, trainFraction1)
  expect_equal(splitSettings$seed, splitSeed1)
  expect_equal(splitSettings$nfold, nfold1)

  # check input errors for testFraction
  expect_error(
    defaultSetting(testFraction = "character")
  )

  expect_error(
    defaultSetting(testFraction = -0.1)
  )

  expect_error(
    defaultSetting(testFraction = 1.001)
  )

  # check input error for trainFraction
  expect_error(
    defaultSetting(trainFraction = "trainFraction")
  )

  expect_error(
    defaultSetting(trainFraction = 1.2)
  )

  expect_error(
    defaultSetting(trainFraction = -0.2)
  )

  # check error for splitSeed

  expect_error(
    defaultSetting(splitSeed = NULL)
  )

  expect_error(
    defaultSetting(splitSeed = "NULL")
  )

  # check error for nfold
  expect_error(
    defaultSetting(nfold = NULL)
  )
  expect_error(
    defaultSetting(nfold = "NULL")
  )

  # incorrect type
  expect_error(
    defaultSetting(type = "madeup")
  )
  expect_error(
    defaultSetting(type = NULL)
  )
  expect_error(
    defaultSetting(type = 1)
  )
})
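
# For reference (sketch, not run): based on the expectations above, the settings
# object is a plain list with elements test, train, seed and nfold, plus a
# "fun" attribute naming the splitter function and class "splitSettings".
if (FALSE) {
  stratifiedSettings <- defaultSetting(type = "stratified")
  stratifiedSettings$test # the test fraction
  stratifiedSettings$nfold # number of cross-validation folds
  attr(stratifiedSettings, "fun") # "randomSplitter" for the stratified type
}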



test_that("Main split function: splitData", {
  skip_if_offline()
  # check default settings with test/train
  splitSettings <- defaultSetting()

  splitData <- splitData(
    plpData = plpData,
    population = populationT,
    splitSettings = splitSettings
  )

  # check class
  expect_s3_class(splitData, "splitData")

  # should have test/train
  expect_equal(names(splitData), c("Train", "Test"))

  # train and test are CovariateData
  expect_s4_class(splitData$Train$covariateData, "CovariateData")
  expect_s4_class(splitData$Test$covariateData, "CovariateData")

  # Train has labels/folds/covariateData
  expect_equal(names(splitData$Train), c("labels", "folds", "covariateData"))

  # Test has labels/covariateData
  expect_equal(names(splitData$Test), c("labels", "covariateData"))

  # check attributes for Train
  expect_equal(attr(splitData$Train, "metaData")$outcomeId, attr(populationT, "metaData")$outcomeId)
  expect_equal(attr(splitData$Train, "metaData")$targetId, plpData$metaData$databaseDetails$targetId)
  expect_equal(
    attr(splitData$Train, "metaData")$cdmDatabaseSchema,
    plpData$metaData$databaseDetails$cdmDatabaseSchema
  )

  expect_type(attr(splitData$Train, "metaData")$restrictPlpDataSettings, "list")
  expect_equal(
    attr(splitData$Train, "metaData")$covariateSettings,
    plpData$metaData$covariateSettings
  )
  expect_equal(
    attr(splitData$Train, "metaData")$populationSettings,
    attr(populationT, "metaData")$populationSettings
  )
  expect_equal(
    attr(splitData$Train, "metaData")$attrition,
    attr(populationT, "metaData")$attrition
  )

  expect_equal(
    attr(splitData$Train, "metaData")$splitSettings,
    splitSettings
  )

  # train + test should cover the full data since trainFraction + testFraction = 1
  expect_equal(
    nrow(splitData$Train$labels) + nrow(splitData$Test$labels),
    nrow(populationT)
  )
  expect_equal(
    splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() +
      splitData$Test$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
    plpData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
  )

  # make sure the fold indices run from 1 to nfold
  expect_equal(
    min(splitData$Train$folds$index),
    1
  )

  expect_equal(
    max(splitData$Train$folds$index),
    splitSettings$nfold
  )


  # check when test is 0
  splitSettings <- defaultSetting(
    testFraction = 0,
    trainFraction = 1
  )

  splitData <- splitData(
    plpData = plpData,
    population = populationT,
    splitSettings = splitSettings
  )

  # should just have train
  expect_equal(names(splitData), c("Train"))

  # train labels should be the same size as the population
  expect_equal(
    nrow(splitData$Train$labels),
    nrow(populationT)
  )
  expect_equal(
    splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
    plpData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
  )
})
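
# Sketch (not run) of how the Train partition returned by splitData() is
# typically consumed downstream; the element names match the expectations
# tested in the block above, and the dplyr access pattern mirrors the tally
# checks used there.
if (FALSE) {
  trainSplit <- splitData(
    plpData = plpData,
    population = populationT,
    splitSettings = defaultSetting()
  )$Train
  trainLabels <- trainSplit$labels # one row per subject, including outcomeCount
  foldAssignments <- trainSplit$folds # rowId plus a fold index in 1:nfold
  trainCovariates <- trainSplit$covariateData$covariates # lazy covariate table
}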

test_that("dataSummary works", {
  skip_if_offline()
  splitSettings <- defaultSetting(
    testFraction = 0,
    trainFraction = 1
  )

  splitData <- splitData(
    plpData = plpData,
    population = populationT,
    splitSettings = splitSettings
  )

  summaryPrint <- dataSummary(splitData)
  expect_equal(summaryPrint, TRUE)
})


test_that("Data stratified splitting", {
  splitSettings <- defaultSetting(
    testFraction = 0.3,
    nfold = 3
  )

  # error due to insufficient outcomes
  dsPopulation1 <- data.frame(rowId = 1:20, outcomeCount = c(1, 1, 1, 1, rep(0, 16)))
  expect_error(randomSplitter(population = dsPopulation1, splitSettings = splitSettings))

  dsPopulation2 <- data.frame(rowId = 1:200, outcomeCount = c(rep(1, 42), rep(0, 158)))
  splitSettings <- defaultSetting(
    trainFraction = 0.7,
    testFraction = 0.3,
    nfold = 4
  )
  # fold creation check 1 (fixed)
  test <- randomSplitter(population = dsPopulation2, splitSettings = splitSettings)
  test <- merge(dsPopulation2, test)
  test <- table(test$outcomeCount, test$index)
  testReturned <- paste(test, collapse = "-")
  testExpected <- paste(matrix(c(47, 28, 28, 28, 27, 12, 8, 8, 7, 7), ncol = 5, byrow = TRUE), collapse = "-")
  expect_identical(testReturned, testExpected)

  # fold creation check 2 (sum)
  size <- 500
  dsPopulation3 <- data.frame(rowId = 1:size, outcomeCount = c(rep(1, floor(size / 3)), rep(0, size - floor(size / 3))))
  splitSettings <- defaultSetting(
    trainFraction = 0.8,
    testFraction = 0.2,
    nfold = 4
  )
  test <- randomSplitter(population = dsPopulation3, splitSettings = splitSettings)
  test <- merge(dsPopulation3, test)
  test <- table(test$outcomeCount, test$index)
  expect_equal(sum(test), size)

  # test the training fraction parameter for learning curves
  size <- 500
  dsPopulation4 <- data.frame(
    rowId = 1:size,
    outcomeCount = c(
      rep(1, floor(size / 3)),
      rep(0, size - floor(size / 3))
    )
  )
  splitSettings <- defaultSetting(
    trainFraction = 0.4,
    testFraction = 0.2,
    nfold = 4
  )
  test <- randomSplitter(population = dsPopulation4, splitSettings = splitSettings)

  tolerance <- 5
  excludedPatients <- 200
  # test that the number of patients in each fold is roughly the same
  expect_equal(length(test$index[test$index == 1]),
    length(test$index[test$index == 3]),
    tolerance = tolerance
  )
  expect_equal(length(test$index[test$index == 2]),
    length(test$index[test$index == 4]),
    tolerance = tolerance
  )
  expect_equal(length(test$index[test$index == 1]),
    length(test$index[test$index == 4]),
    tolerance = tolerance
  )
  # test that patients were excluded according to the training fraction
  expect_equal(
    length(test$index[test$index == 0]),
    excludedPatients
  )
})
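
# Decoding the index column returned by the splitters (sketch, not run). From
# the checks above and the existing-split test later in this file, the working
# assumption is: -1 marks the test set, 0 marks people excluded when
# trainFraction + testFraction < 1 (the learning-curve case), and 1:nfold are
# the cross-validation folds.
if (FALSE) {
  sketchPopulation <- data.frame(
    rowId = 1:200,
    outcomeCount = c(rep(1, 40), rep(0, 160))
  )
  sketchSettings <- defaultSetting(
    trainFraction = 0.4,
    testFraction = 0.2,
    nfold = 4
  )
  sketchSplit <- randomSplitter(population = sketchPopulation, splitSettings = sketchSettings)
  table(sketchSplit$index) # expected levels: -1 (test), 0 (excluded), 1-4 (folds)
}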

test_that("Data splitting by time", {
  # fold creation check (sum)
  size <- 500
  set.seed(1)
  dsPopulation2 <- data.frame(
    rowId = 1:size,
    outcomeCount = sample(0:1, size, replace = TRUE),
    cohortStartDate = as.Date("2010-01-01") + c(1:size)
  )
  splitSettings <- defaultSetting(
    trainFraction = 0.8,
    testFraction = 0.2,
    nfold = 4
  )

  test <- timeSplitter(population = dsPopulation2, splitSettings = splitSettings)
  test <- merge(dsPopulation2, test)
  test <- table(test$outcomeCount, test$index)
  expect_equal(sum(test), size)

  # test the training fraction parameter for learning curves
  size <- 500
  set.seed(1)
  dsPopulation3 <- data.frame(
    rowId = 1:size,
    outcomeCount = sample(0:1, size, replace = TRUE),
    cohortStartDate = as.Date("2010-01-01") + c(1:size)
  )
  splitSettings <- defaultSetting(
    trainFraction = 0.4,
    testFraction = 0.2,
    nfold = 4
  )
  test <- timeSplitter(population = dsPopulation3, splitSettings = splitSettings)

  tolerance <- 5
  excludedPatients <- 196
  # test that the number of patients in each fold is roughly the same
  expect_equal(length(test$index[test$index == 1]),
    length(test$index[test$index == 3]),
    tolerance = tolerance
  )
  expect_equal(length(test$index[test$index == 2]),
    length(test$index[test$index == 4]),
    tolerance = tolerance
  )
  expect_equal(length(test$index[test$index == 1]),
    length(test$index[test$index == 4]),
    tolerance = tolerance
  )
  # test that patients were excluded according to the training fraction
  expect_equal(
    length(test$index[test$index == 0]),
    excludedPatients
  )
})
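
# Sketch (not run) of a property the block above does not assert directly: the
# assumption for the time-based split is that the test set holds the
# chronologically latest cohortStartDate values while the train folds hold
# earlier ones.
if (FALSE) {
  timePopulation <- data.frame(
    rowId = 1:100,
    outcomeCount = rep(c(0, 1), 50),
    cohortStartDate = as.Date("2010-01-01") + 1:100
  )
  timeSplit <- timeSplitter(
    population = timePopulation,
    splitSettings = defaultSetting(trainFraction = 0.8, testFraction = 0.2, nfold = 2)
  )
  merged <- merge(timePopulation, timeSplit)
  # assumed ordering: every test-set date is later than every train-fold date
  min(merged$cohortStartDate[merged$index == -1]) >
    max(merged$cohortStartDate[merged$index > 0])
}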



test_that("Data splitting by subject", {
  # error message checks
  dsPopulation1 <- data.frame(rowId = 1:20, subjectId = 1:20, outcomeCount = c(1, 1, 1, 1, rep(0, 16)))
  splitSettings <- defaultSetting(
    trainFraction = 0.7,
    testFraction = 0.3,
    nfold = 3
  )
  expect_error(subjectSplitter(population = dsPopulation1, splitSettings = splitSettings))

  dsPopulation2 <- data.frame(rowId = 1:200, subjectId = 1:200, outcomeCount = c(rep(1, 42), rep(0, 158)))
  splitSettings <- defaultSetting(
    trainFraction = 0.8,
    testFraction = 0.2,
    nfold = 4
  )
  test <- subjectSplitter(population = dsPopulation2, splitSettings = splitSettings)
  test <- merge(dsPopulation2, test)
  test <- table(test$outcomeCount, test$index)
  testReturned <- paste(test, collapse = "-")
  testExpected <- paste(matrix(c(32, 32, 32, 31, 31, 8, 9, 9, 8, 8), ncol = 5, byrow = TRUE), collapse = "-")
  expect_identical(testReturned, testExpected)

  # test that people are not in multiple folds
  dsPopulation3 <- data.frame(rowId = 1:200, subjectId = rep(1:50, 4), outcomeCount = c(rep(1, 42), rep(0, 158)))
  splitSettings <- defaultSetting(
    trainFraction = 0.75,
    testFraction = 0.25,
    nfold = 3
  )
  test <- subjectSplitter(population = dsPopulation3, splitSettings = splitSettings)
  test <- merge(dsPopulation3, test)

  expect_equal(unique(table(test$subjectId[test$index == -1])), 4)
  expect_equal(unique(table(test$subjectId[test$index == 2])), 4)
  expect_equal(unique(table(test$subjectId[test$index == 3])), 4)
  expect_equal(unique(table(test$subjectId[test$index == 1])), 4)

  # test that every subject is assigned to the test set or a fold (no index 0)
  expect_equal(sum(test$index == 0), 0)
})
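
# Equivalent per-subject view of the check above (sketch, not run): when
# splitting by subject, every subjectId should map to exactly one index value,
# so repeated rows for the same person never straddle folds or the test set.
if (FALSE) {
  subjectPopulation <- data.frame(
    rowId = 1:200,
    subjectId = rep(1:50, 4),
    outcomeCount = c(rep(1, 42), rep(0, 158))
  )
  subjectSplit <- subjectSplitter(
    population = subjectPopulation,
    splitSettings = defaultSetting(trainFraction = 0.75, testFraction = 0.25, nfold = 3)
  )
  merged <- merge(subjectPopulation, subjectSplit)
  indexPerSubject <- tapply(merged$index, merged$subjectId, function(x) length(unique(x)))
  all(indexPerSubject == 1) # every person ends up in exactly one partition
}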

test_that("Existing data splitter works", {
  skip_if_offline()
  # split by age
  age <- population$ageYear
  # create an empty index the same length as age
  index <- rep(0, length(age))
  index[age > 43] <- -1 # test set
  index[age <= 35] <- 1 # train fold 1
  index[age > 35 & age <= 43] <- 2 # train fold 2
  splitIds <- data.frame(rowId = population$rowId, index = index)
  splitSettings <- createExistingSplitSettings(splitIds)
  ageSplit <- splitData(
    plpData = plpData,
    population = population,
    splitSettings = splitSettings
  )

  # test set size should equal the number of subjects older than 43
  expect_equal(
    length(ageSplit$Test$labels$rowId),
    sum(age > 43)
  )
  # train size should equal the number of subjects aged 43 or younger
  expect_equal(
    length(ageSplit$Train$labels$rowId),
    sum(age <= 43)
  )
  # no overlap
  expect_equal(
    length(intersect(ageSplit$Test$labels$rowId, ageSplit$Train$labels$rowId)),
    0
  )
})

test_that("Outcome options works", {
  testPop <- data.frame(outcomeCount = sample(c(0, 1), 9, replace = TRUE))
  # regular plp outcome check
  expect_error(checkOutcomes(testPop, train = 0.75, nfold = 3))

  withr::with_options(list("plp.outcomes" = 100), {
    testPop <- data.frame(outcomeCount = sample(c(0, 1), 90, replace = TRUE))
    expect_error(checkOutcomes(testPop, train = 0.75, nfold = 3))
  })
})
