# @file test_DataSplitting.R
# Copyright 2025 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# make sure the population covers all people in plpData
if (internet) {
  populationT <- plpData$cohorts
  populationT$outcomeCount <- sample(c(0, 1), nrow(populationT), replace = TRUE)
  attr(populationT, "metaData")$outcomeId <- outcomeId
  attr(populationT, "metaData")$populationSettings <- list(madeup = TRUE)
  attr(populationT, "metaData")$restrictPlpDataSettings <- list(madeup = TRUE)
  attr(populationT, "metaData")$attrition <- c(1, 2, 3)
}
# check correct inputs
testFraction1 <- sample(9, 1) / 10
trainFraction1 <- 1 - testFraction1
splitSeed1 <- sample(100000, 1)
nfold1 <- 1 + sample(10, 1)
type1 <- sample(c("stratified", "time", "subject"), 1)

defaultSetting <- function(
    testFraction = testFraction1,
    trainFraction = trainFraction1,
    splitSeed = splitSeed1,
    nfold = nfold1,
    type = type1) {
  result <- createDefaultSplitSetting(
    testFraction = testFraction,
    trainFraction = trainFraction,
    splitSeed = splitSeed,
    nfold = nfold,
    type = type
  )
  return(result)
}
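
# A minimal usage sketch of the wrapper above (kept in an `if (FALSE)` block so
# it never runs); the argument values are arbitrary illustrations rather than
# the randomised ones used by the tests. The wrapper simply forwards its
# arguments to createDefaultSplitSetting().
if (FALSE) {
  exampleSettings <- defaultSetting(
    testFraction = 0.25,
    trainFraction = 0.75,
    splitSeed = 42,
    nfold = 3,
    type = "stratified"
  )
}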


test_that("createDefaultSplitSetting", {
  splitSettings <- defaultSetting()

  expect_s3_class(splitSettings, "splitSettings")

  expectFun <- "randomSplitter"
  if (type1 == "time") {
    expectFun <- "timeSplitter"
  }
  if (type1 == "subject") {
    expectFun <- "subjectSplitter"
  }

  expect_equal(attr(splitSettings, "fun"), expectFun)

  expect_equal(splitSettings$test, testFraction1)
  expect_equal(splitSettings$train, trainFraction1)
  expect_equal(splitSettings$seed, splitSeed1)
  expect_equal(splitSettings$nfold, nfold1)

  # check input errors for testFraction
  expect_error(
    defaultSetting(testFraction = "character")
  )

  expect_error(
    defaultSetting(testFraction = -0.1)
  )

  expect_error(
    defaultSetting(testFraction = 1.001)
  )

  # check input error for trainFraction
  expect_error(
    defaultSetting(trainFraction = "trainFraction")
  )

  expect_error(
    defaultSetting(trainFraction = 1.2)
  )

  expect_error(
    defaultSetting(trainFraction = -0.2)
  )

  # check error for splitSeed

  expect_error(
    defaultSetting(splitSeed = NULL)
  )

  expect_error(
    defaultSetting(splitSeed = "NULL")
  )

  # check error for nfold
  expect_error(
    defaultSetting(nfold = NULL)
  )
  expect_error(
    defaultSetting(nfold = "NULL")
  )

  # incorrect type
  expect_error(
    defaultSetting(type = "madeup")
  )
  expect_error(
    defaultSetting(type = NULL)
  )
  expect_error(
    defaultSetting(type = 1)
  )
})
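
# For reference (sketch, not run): based on the expectations above, the settings
# object is a plain list with elements test, train, seed and nfold, plus a
# "fun" attribute naming the splitter function and class "splitSettings".
if (FALSE) {
  stratifiedSettings <- defaultSetting(type = "stratified")
  stratifiedSettings$test # the test fraction
  stratifiedSettings$nfold # number of cross-validation folds
  attr(stratifiedSettings, "fun") # "randomSplitter" for the stratified type
}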



test_that("Main split function: splitData", {
  skip_if_offline()
  # check default settings with test/train
  splitSettings <- defaultSetting()

  splitData <- splitData(
    plpData = plpData,
    population = populationT,
    splitSettings = splitSettings
  )

  # check class
  expect_s3_class(splitData, "splitData")

  # should have test/train
  expect_equal(names(splitData), c("Train", "Test"))

  # train and test are CovariateData
  expect_s4_class(splitData$Train$covariateData, "CovariateData")
  expect_s4_class(splitData$Test$covariateData, "CovariateData")

  # Train has labels/folds/covariateData
  expect_equal(names(splitData$Train), c("labels", "folds", "covariateData"))

  # Test has labels/covariateData
  expect_equal(names(splitData$Test), c("labels", "covariateData"))

  # check attributes for Train
  expect_equal(attr(splitData$Train, "metaData")$outcomeId, attr(populationT, "metaData")$outcomeId)
  expect_equal(attr(splitData$Train, "metaData")$targetId, plpData$metaData$databaseDetails$targetId)
  expect_equal(
    attr(splitData$Train, "metaData")$cdmDatabaseSchema,
    plpData$metaData$databaseDetails$cdmDatabaseSchema
  )

  expect_type(attr(splitData$Train, "metaData")$restrictPlpDataSettings, "list")
  expect_equal(
    attr(splitData$Train, "metaData")$covariateSettings,
    plpData$metaData$covariateSettings
  )
  expect_equal(
    attr(splitData$Train, "metaData")$populationSettings,
    attr(populationT, "metaData")$populationSettings
  )
  expect_equal(
    attr(splitData$Train, "metaData")$attrition,
    attr(populationT, "metaData")$attrition
  )

  expect_equal(
    attr(splitData$Train, "metaData")$splitSettings,
    splitSettings
  )

  # train + test should cover the full data since trainFraction + testFraction = 1
  expect_equal(
    nrow(splitData$Train$labels) + nrow(splitData$Test$labels),
    nrow(populationT)
  )
  expect_equal(
    splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() +
      splitData$Test$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
    plpData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
  )

  # make sure the fold indices run from 1 to nfold
  expect_equal(
    min(splitData$Train$folds$index),
    1
  )

  expect_equal(
    max(splitData$Train$folds$index),
    splitSettings$nfold
  )


  # check when test is 0
  splitSettings <- defaultSetting(
    testFraction = 0,
    trainFraction = 1
  )

  splitData <- splitData(
    plpData = plpData,
    population = populationT,
    splitSettings = splitSettings
  )

  # should just have train
  expect_equal(names(splitData), c("Train"))

  # train labels should be the same size as the population
  expect_equal(
    nrow(splitData$Train$labels),
    nrow(populationT)
  )
  expect_equal(
    splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
    plpData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
  )
})
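
# Sketch (not run) of how the Train partition returned by splitData() is
# typically consumed downstream; the element names match the expectations
# tested in the block above, and the dplyr access pattern mirrors the tally
# checks used there.
if (FALSE) {
  trainSplit <- splitData(
    plpData = plpData,
    population = populationT,
    splitSettings = defaultSetting()
  )$Train
  trainLabels <- trainSplit$labels # one row per subject, including outcomeCount
  foldAssignments <- trainSplit$folds # rowId plus a fold index in 1:nfold
  trainCovariates <- trainSplit$covariateData$covariates # lazy covariate table
}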

test_that("dataSummary works", {
  skip_if_offline()
  splitSettings <- defaultSetting(
    testFraction = 0,
    trainFraction = 1
  )

  splitData <- splitData(
    plpData = plpData,
    population = populationT,
    splitSettings = splitSettings
  )

  summaryPrint <- dataSummary(splitData)
  expect_equal(summaryPrint, TRUE)
})


test_that("Data stratified splitting", {
  splitSettings <- defaultSetting(
    testFraction = 0.3,
    nfold = 3
  )

  # error due to insufficient outcomes
  dsPopulation1 <- data.frame(rowId = 1:20, outcomeCount = c(1, 1, 1, 1, rep(0, 16)))
  expect_error(randomSplitter(population = dsPopulation1, splitSettings = splitSettings))

  dsPopulation2 <- data.frame(rowId = 1:200, outcomeCount = c(rep(1, 42), rep(0, 158)))
  splitSettings <- defaultSetting(
    trainFraction = 0.7,
    testFraction = 0.3,
    nfold = 4
  )
  # fold creation check 1 (fixed)
  test <- randomSplitter(population = dsPopulation2, splitSettings = splitSettings)
  test <- merge(dsPopulation2, test)
  test <- table(test$outcomeCount, test$index)
  testReturned <- paste(test, collapse = "-")
  testExpected <- paste(matrix(c(47, 28, 28, 28, 27, 12, 8, 8, 7, 7), ncol = 5, byrow = TRUE), collapse = "-")
  expect_identical(testReturned, testExpected)

  # fold creation check 2 (sum)
  size <- 500
  dsPopulation3 <- data.frame(rowId = 1:size, outcomeCount = c(rep(1, floor(size / 3)), rep(0, size - floor(size / 3))))
  splitSettings <- defaultSetting(
    trainFraction = 0.8,
    testFraction = 0.2,
    nfold = 4
  )
  test <- randomSplitter(population = dsPopulation3, splitSettings = splitSettings)
  test <- merge(dsPopulation3, test)
  test <- table(test$outcomeCount, test$index)
  expect_equal(sum(test), size)

  # test the training fraction parameter for learning curves
  size <- 500
  dsPopulation4 <- data.frame(
    rowId = 1:size,
    outcomeCount = c(
      rep(1, floor(size / 3)),
      rep(0, size - floor(size / 3))
    )
  )
  splitSettings <- defaultSetting(
    trainFraction = 0.4,
    testFraction = 0.2,
    nfold = 4
  )
  test <- randomSplitter(population = dsPopulation4, splitSettings = splitSettings)

  tolerance <- 5
  excludedPatients <- 200
  # test that the number of patients in each fold is roughly the same
  expect_equal(length(test$index[test$index == 1]),
    length(test$index[test$index == 3]),
    tolerance = tolerance
  )
  expect_equal(length(test$index[test$index == 2]),
    length(test$index[test$index == 4]),
    tolerance = tolerance
  )
  expect_equal(length(test$index[test$index == 1]),
    length(test$index[test$index == 4]),
    tolerance = tolerance
  )
  # test that patients were excluded according to the training fraction
  expect_equal(
    length(test$index[test$index == 0]),
    excludedPatients
  )
})
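
# Decoding the index column returned by the splitters (sketch, not run). From
# the checks above and the existing-split test later in this file, the working
# assumption is: -1 marks the test set, 0 marks people excluded when
# trainFraction + testFraction < 1 (the learning-curve case), and 1:nfold are
# the cross-validation folds.
if (FALSE) {
  sketchPopulation <- data.frame(
    rowId = 1:200,
    outcomeCount = c(rep(1, 40), rep(0, 160))
  )
  sketchSettings <- defaultSetting(
    trainFraction = 0.4,
    testFraction = 0.2,
    nfold = 4
  )
  sketchSplit <- randomSplitter(population = sketchPopulation, splitSettings = sketchSettings)
  table(sketchSplit$index) # expected levels: -1 (test), 0 (excluded), 1-4 (folds)
}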

test_that("Data splitting by time", {
  # fold creation check (sum)
  size <- 500
  set.seed(1)
  dsPopulation2 <- data.frame(
    rowId = 1:size,
    outcomeCount = sample(0:1, size, replace = TRUE),
    cohortStartDate = as.Date("2010-01-01") + c(1:size)
  )
  splitSettings <- defaultSetting(
    trainFraction = 0.8,
    testFraction = 0.2,
    nfold = 4
  )

  test <- timeSplitter(population = dsPopulation2, splitSettings = splitSettings)
  test <- merge(dsPopulation2, test)
  test <- table(test$outcomeCount, test$index)
  expect_equal(sum(test), size)

  # test the training fraction parameter for learning curves
  size <- 500
  set.seed(1)
  dsPopulation3 <- data.frame(
    rowId = 1:size,
    outcomeCount = sample(0:1, size, replace = TRUE),
    cohortStartDate = as.Date("2010-01-01") + c(1:size)
  )
  splitSettings <- defaultSetting(
    trainFraction = 0.4,
    testFraction = 0.2,
    nfold = 4
  )
  test <- timeSplitter(population = dsPopulation3, splitSettings = splitSettings)

  tolerance <- 5
  excludedPatients <- 196
  # test that the number of patients in each fold is roughly the same
  expect_equal(length(test$index[test$index == 1]),
    length(test$index[test$index == 3]),
    tolerance = tolerance
  )
  expect_equal(length(test$index[test$index == 2]),
    length(test$index[test$index == 4]),
    tolerance = tolerance
  )
  expect_equal(length(test$index[test$index == 1]),
    length(test$index[test$index == 4]),
    tolerance = tolerance
  )
  # test that patients were excluded according to the training fraction
  expect_equal(
    length(test$index[test$index == 0]),
    excludedPatients
  )
})
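
# Sketch (not run) of a property the block above does not assert directly: the
# assumption for the time-based split is that the test set holds the
# chronologically latest cohortStartDate values while the train folds hold
# earlier ones.
if (FALSE) {
  timePopulation <- data.frame(
    rowId = 1:100,
    outcomeCount = rep(c(0, 1), 50),
    cohortStartDate = as.Date("2010-01-01") + 1:100
  )
  timeSplit <- timeSplitter(
    population = timePopulation,
    splitSettings = defaultSetting(trainFraction = 0.8, testFraction = 0.2, nfold = 2)
  )
  merged <- merge(timePopulation, timeSplit)
  # assumed ordering: every test-set date is later than every train-fold date
  min(merged$cohortStartDate[merged$index == -1]) >
    max(merged$cohortStartDate[merged$index > 0])
}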



test_that("Data splitting by subject", {
  # error message checks
  dsPopulation1 <- data.frame(rowId = 1:20, subjectId = 1:20, outcomeCount = c(1, 1, 1, 1, rep(0, 16)))
  splitSettings <- defaultSetting(
    trainFraction = 0.7,
    testFraction = 0.3,
    nfold = 3
  )
  expect_error(subjectSplitter(population = dsPopulation1, splitSettings = splitSettings))

  dsPopulation2 <- data.frame(rowId = 1:200, subjectId = 1:200, outcomeCount = c(rep(1, 42), rep(0, 158)))
  splitSettings <- defaultSetting(
    trainFraction = 0.8,
    testFraction = 0.2,
    nfold = 4
  )
  test <- subjectSplitter(population = dsPopulation2, splitSettings = splitSettings)
  test <- merge(dsPopulation2, test)
  test <- table(test$outcomeCount, test$index)
  testReturned <- paste(test, collapse = "-")
  testExpected <- paste(matrix(c(32, 32, 32, 31, 31, 8, 9, 9, 8, 8), ncol = 5, byrow = TRUE), collapse = "-")
  expect_identical(testReturned, testExpected)

  # test that people are not in multiple folds
  dsPopulation3 <- data.frame(rowId = 1:200, subjectId = rep(1:50, 4), outcomeCount = c(rep(1, 42), rep(0, 158)))
  splitSettings <- defaultSetting(
    trainFraction = 0.75,
    testFraction = 0.25,
    nfold = 3
  )
  test <- subjectSplitter(population = dsPopulation3, splitSettings = splitSettings)
  test <- merge(dsPopulation3, test)

  expect_equal(unique(table(test$subjectId[test$index == -1])), 4)
  expect_equal(unique(table(test$subjectId[test$index == 2])), 4)
  expect_equal(unique(table(test$subjectId[test$index == 3])), 4)
  expect_equal(unique(table(test$subjectId[test$index == 1])), 4)

  # test that every subject is assigned to the test set or a fold (no index 0)
  expect_equal(sum(test$index == 0), 0)
})
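
# Equivalent per-subject view of the check above (sketch, not run): when
# splitting by subject, every subjectId should map to exactly one index value,
# so repeated rows for the same person never straddle folds or the test set.
if (FALSE) {
  subjectPopulation <- data.frame(
    rowId = 1:200,
    subjectId = rep(1:50, 4),
    outcomeCount = c(rep(1, 42), rep(0, 158))
  )
  subjectSplit <- subjectSplitter(
    population = subjectPopulation,
    splitSettings = defaultSetting(trainFraction = 0.75, testFraction = 0.25, nfold = 3)
  )
  merged <- merge(subjectPopulation, subjectSplit)
  indexPerSubject <- tapply(merged$index, merged$subjectId, function(x) length(unique(x)))
  all(indexPerSubject == 1) # every person ends up in exactly one partition
}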

test_that("Existing data splitter works", {
  skip_if_offline()
  # split by age
  age <- population$ageYear
  # create an empty index the same length as age
  index <- rep(0, length(age))
  index[age > 43] <- -1 # test set
  index[age <= 35] <- 1 # train fold 1
  index[age > 35 & age <= 43] <- 2 # train fold 2
  splitIds <- data.frame(rowId = population$rowId, index = index)
  splitSettings <- createExistingSplitSettings(splitIds)
  ageSplit <- splitData(
    plpData = plpData,
    population = population,
    splitSettings = splitSettings
  )

  # test set size should equal the number of subjects older than 43
  expect_equal(
    length(ageSplit$Test$labels$rowId),
    sum(age > 43)
  )
  # train size should equal the number of subjects aged 43 or younger
  expect_equal(
    length(ageSplit$Train$labels$rowId),
    sum(age <= 43)
  )
  # no overlap
  expect_equal(
    length(intersect(ageSplit$Test$labels$rowId, ageSplit$Train$labels$rowId)),
    0
  )
})

test_that("Outcome options works", {
  testPop <- data.frame(outcomeCount = sample(c(0, 1), 9, replace = TRUE))
  # regular plp outcome check
  expect_error(checkOutcomes(testPop, train = 0.75, nfold = 3))

  withr::with_options(list("plp.outcomes" = 100), {
    testPop <- data.frame(outcomeCount = sample(c(0, 1), 90, replace = TRUE))
    expect_error(checkOutcomes(testPop, train = 0.75, nfold = 3))
  })
})
