R/Sampling.R

Defines functions overSampleData underSampleData sameData sampleData createSampleSettings

Documented in createSampleSettings

# @file Sampling.R
# Copyright 2025 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' Create the settings for defining how the trainData from \code{splitData} are sampled using
#' default sample functions.
#'
#' @details
#' Returns an object of class \code{sampleSettings} that specifies the sampling function that will be called and the settings
#'
#' @param type              (character) Choice of:  \itemize{
#'                                         \item 'none' No sampling is applied - this is the default
#'                                         \item 'underSample' Undersample the non-outcome class to make the data more balanced
#'                                         \item 'overSample' Oversample the outcome class by adding in each outcome multiple times
#'                                         }
#' @param numberOutcomestoNonOutcomes   (numeric) A numeric specifying the required number of outcomes per non-outcomes
#' @param sampleSeed         (numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)
#'
#' @return
#' An object of class \code{sampleSettings}
#' @examplesIf rlang::is_installed("Eunomia")
#' \donttest{
#' # sample even rate of outcomes to non-outcomes
#' sampleSetting <- createSampleSettings(type = "underSample", 
#'                                       numberOutcomestoNonOutcomes = 1, 
#'                                       sampleSeed = 42)
#' }
#' @export
createSampleSettings <- function(type = "none",
                                 numberOutcomestoNonOutcomes = 1,
                                 sampleSeed = sample(10000, 1)) {
  checkIsClass(numberOutcomestoNonOutcomes, c("numeric", "integer"))
  checkHigher(numberOutcomestoNonOutcomes, 0)
  checkIsClass(sampleSeed, c("numeric", "integer"))
  checkIsClass(type, c("character"))
  if (!type %in% c("none", "underSample", "overSample")) {
    stop("Incorrect type.  Pick: none/underSample/overSample")
  }

  if (type %in% c("underSample", "overSample")) {
    ParallelLogger::logWarn("The previous documentation for `numberOutcomestoNonOutcomes` used to not reflect the functionality and has now been changed. The user needs to make sure the code is not relying on what was in the docs previously.")
  }

  sampleSettings <- list(
    numberOutcomestoNonOutcomes = numberOutcomestoNonOutcomes,
    sampleSeed = ifelse(type == "none", 1, sampleSeed) # to make it the same for none
  )

  if (type == "none") {
    attr(sampleSettings, "fun") <- "sameData"
  }
  if (type == "underSample") {
    attr(sampleSettings, "fun") <- "underSampleData"
  }
  if (type == "overSample") {
    attr(sampleSettings, "fun") <- "overSampleData" # TODO
  }
  class(sampleSettings) <- "sampleSettings"
  return(sampleSettings)
}

# code to run the sampling - add desc
sampleData <- function(trainData, sampleSettings) {
  start <- Sys.time()
  metaData <- attr(trainData, "metaData")

  ParallelLogger::logInfo("Starting data sampling")

  # if a single setting, make it a list
  if (inherits(sampleSettings, "sampleSettings")) {
    sampleSettings <- list(sampleSettings)
  }

  for (sampleSetting in sampleSettings) {
    fun <- attr(sampleSetting, "fun")
    args <- list(
      trainData = trainData,
      sampleSettings = sampleSetting
    )
    ParallelLogger::logInfo(paste0("Applying ", fun))
    trainData <- do.call(eval(parse(text = fun)), args)
  }

  ParallelLogger::logInfo("Finished data sampling")

  metaData$sampleSettings <- sampleSettings

  attr(trainData, "metaData") <- metaData
  delta <- Sys.time() - start
  ParallelLogger::logInfo("Sampling took ", 
    signif(delta, 3), " ", attr(delta, "units"))
  return(trainData)
}



sameData <- function(trainData, ...) {
  ParallelLogger::logInfo("No sampling - returning same data")

  # add attribute for FE
  featureEngeering <- list(
    funct = "sameData",
    settings = list(
      none = TRUE
    )
  )
  attr(trainData, "metaData")$featureEngineering <- listAppend(
    attr(trainData, "metaData")$featureEngineering,
    featureEngeering
  )

  return(trainData)
}

underSampleData <- function(trainData, sampleSettings) {
  checkIsClass(sampleSettings$sampleSeed, c("numeric", "integer"))
  checkIsClass(sampleSettings$numberOutcomestoNonOutcomes, c("numeric", "integer"))
  checkHigherEqual(sampleSettings$numberOutcomestoNonOutcomes, 0)

  ParallelLogger::logInfo(paste0("sampleSeed: ", sampleSettings$sampleSeed))
  ParallelLogger::logInfo(paste0("numberOutcomestoNonOutcomes: ", sampleSettings$numberOutcomestoNonOutcomes))

  set.seed(sampleSettings$sampleSeed)
  ParallelLogger::logInfo(paste0("Starting undersampling with seed ", sampleSettings$sampleSeed))

  population <- trainData$labels %>% dplyr::collect()
  folds <- trainData$folds %>% dplyr::collect()

  population <- merge(population, folds, by = "rowId")

  ParallelLogger::logInfo(paste0(
    "Initial train data has ", sum(population$outcomeCount > 0), " outcomes to ",
    sum(population$outcomeCount == 0), " non-outcomes"
  ))

  pplOfInterest <- c()
  for (i in unique(folds$index)) {
    outcomeIds <- population$rowId[population$outcomeCount > 0 & population$index == i]
    nonoutcomeIds <- population$rowId[population$outcomeCount == 0 & population$index == i]

    sampleSize <- length(outcomeIds) / sampleSettings$numberOutcomestoNonOutcomes

    if (sampleSize > length(nonoutcomeIds)) {
      ParallelLogger::logWarn("Non-outcome count less that require sample size")
      sampleSize <- length(nonoutcomeIds)
    }

    # randomly pick non-outcome people
    sampleNonoutcomeIds <- sample(nonoutcomeIds, sampleSize)

    pplOfInterest <- c(pplOfInterest, outcomeIds, sampleNonoutcomeIds)
  }

  # filter to these patients
  sampleTrainData <- list()
  class(sampleTrainData) <- "plpData"
  sampleTrainData$labels <- trainData$labels %>% dplyr::filter(.data$rowId %in% pplOfInterest)
  sampleTrainData$folds <- trainData$folds %>% dplyr::filter(.data$rowId %in% pplOfInterest)



  sampleTrainData$covariateData <- Andromeda::andromeda()
  sampleTrainData$covariateData$covariateRef <- trainData$covariateData$covariateRef
  sampleTrainData$covariateData$covariates <- trainData$covariateData$covariates %>%
    dplyr::filter(.data$rowId %in% pplOfInterest)

  # update metaData$populationSize = nrow(trainData$labels)
  metaData <- attr(trainData$covariateData, "metaData")
  metaData$populationSize <- nrow(sampleTrainData$labels)
  attr(sampleTrainData$covariateData, "metaData") <- metaData

  class(sampleTrainData$covariateData) <- "CovariateData"

  return(sampleTrainData)
}

overSampleData <- function(trainData, sampleSettings) {
  checkIsClass(sampleSettings$sampleSeed, c("numeric", "integer"))
  checkIsClass(sampleSettings$numberOutcomestoNonOutcomes, c("numeric", "integer"))
  checkHigherEqual(sampleSettings$numberOutcomestoNonOutcomes, 0)

  ParallelLogger::logInfo(paste0("sampleSeed: ", sampleSettings$sampleSeed))
  ParallelLogger::logInfo(paste0("numberOutcomestoNonOutcomes: ", sampleSettings$numberOutcomestoNonOutcomes))

  set.seed(sampleSettings$sampleSeed)
  ParallelLogger::logInfo(paste0("Starting oversampling with seed ", sampleSettings$sampleSeed))

  population <- trainData$labels %>% dplyr::collect()
  folds <- trainData$folds %>% dplyr::collect()

  population <- merge(population, folds, by = "rowId")

  ParallelLogger::logInfo(paste0(
    "Initial train data has ", sum(population$outcomeCount > 0), " outcomes to ",
    sum(population$outcomeCount == 0), " non-outcomes"
  ))

  sampleTrainData <- list()
  class(sampleTrainData) <- "plpData"
  sampleTrainData$labels <- trainData$labels %>% dplyr::collect()
  sampleTrainData$folds <- trainData$folds %>% dplyr::collect()

  sampleTrainData$covariateData <- Andromeda::andromeda()
  sampleTrainData$covariateData$covariateRef <- trainData$covariateData$covariateRef
  sampleTrainData$covariateData$covariates <- trainData$covariateData$covariates

  for (i in unique(folds$index)) {
    outcomeIds <- population$rowId[population$outcomeCount > 0 & population$index == i]
    nonoutcomeIds <- population$rowId[population$outcomeCount == 0 & population$index == i]

    sampleSize <- floor(length(nonoutcomeIds) * sampleSettings$numberOutcomestoNonOutcomes)

    if (sampleSize > length(nonoutcomeIds)) {
      ParallelLogger::logWarn("Non-outcome count less than required sample size")
      sampleSize <- length(nonoutcomeIds) - length(outcomeIds)
    } else {
      sampleSize <- sampleSize - length(outcomeIds)
    }

    while (sampleSize > 0) {
      tempSampleSize <- min(length(outcomeIds), sampleSize)
      # randomly oversample outcome people
      sampleOutcomeIds <- sample(outcomeIds, tempSampleSize, replace = TRUE)
      pplOfInterest <- unique(sampleOutcomeIds) # to enable oversampling with replacement
      sampleSize <- sampleSize - length(pplOfInterest)

      addTrainData <- list()
      addTrainData$labels <- trainData$labels %>% dplyr::filter(.data$rowId %in% pplOfInterest)
      addTrainData$labels <- addTrainData$labels %>% dplyr::mutate(newRowId = 1:nrow(addTrainData$labels))
      addTrainData$labels <- addTrainData$labels %>% dplyr::mutate(newRowId = .data$newRowId + max(sampleTrainData$labels$rowId))

      addTrainData$folds <- trainData$folds %>% dplyr::filter(.data$rowId %in% pplOfInterest)
      addTrainData$folds <- dplyr::inner_join(addTrainData$folds, addTrainData$labels[, c("rowId", "newRowId")], copy = TRUE, by = "rowId")
      addTrainData$folds <- addTrainData$folds %>% dplyr::mutate(rowId = .data$newRowId)
      addTrainData$folds <- dplyr::select(addTrainData$folds, -dplyr::starts_with("newRowId"))

      addTrainData$covariateData$covariates <- trainData$covariateData$covariates %>% dplyr::filter(.data$rowId %in% pplOfInterest)
      addTrainData$covariateData$covariates <- dplyr::inner_join(addTrainData$covariateData$covariates, addTrainData$labels[, c("rowId", "newRowId")], copy = TRUE, by = "rowId")
      addTrainData$covariateData$covariates <- addTrainData$covariateData$covariates %>% dplyr::mutate(rowId = .data$newRowId)
      addTrainData$covariateData$covariates <- dplyr::select(addTrainData$covariateData$covariates, -dplyr::starts_with("newRowId"))

      addTrainData$labels <- addTrainData$labels %>% dplyr::mutate(rowId = .data$newRowId)
      addTrainData$labels <- dplyr::select(addTrainData$labels, -dplyr::starts_with("newRowId"))

      sampleTrainData$labels <- dplyr::bind_rows(sampleTrainData$labels, addTrainData$labels)
      sampleTrainData$folds <- dplyr::bind_rows(sampleTrainData$folds, addTrainData$folds)
      Andromeda::appendToTable(sampleTrainData$covariateData$covariates, addTrainData$covariateData$covariates)
    }
  }

  # update metaData$populationSize = nrow(trainData$labels)
  metaData <- attr(trainData$covariateData, "metaData")
  metaData$populationSize <- nrow(sampleTrainData$labels)
  attr(sampleTrainData$covariateData, "metaData") <- metaData

  class(sampleTrainData$covariateData) <- "CovariateData"

  return(sampleTrainData)
}

Try the PatientLevelPrediction package in your browser

Any scripts or data that you put into this service are public.

PatientLevelPrediction documentation built on April 3, 2025, 9:58 p.m.