# @file LearningCurve.R
#
# Copyright 2025 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#' @title createLearningCurve
#'
#' @description Creates a learning curve object, which can be plotted using the
#' \code{plotLearningCurve()} function.
#'
#' @param plpData An object of type \code{plpData} - the patient level prediction
#' data extracted from the CDM.
#' @param outcomeId (integer) The ID of the outcome.
#' @param analysisId (character) Identifier for the analysis. It is used to create, e.g., the result folder. Defaults to \code{"learningCurve"}; if set to \code{NULL}, a timestamp is used instead.
#' @param populationSettings An object of type \code{populationSettings} created using \code{createStudyPopulationSettings} that
#' specifies how the data class labels are defined and any additional exclusions to apply to the
#' plpData cohort
#' @param splitSettings An object of type \code{splitSettings} that specifies how to split the data into train/validation/test.
#' The default settings can be created using \code{createDefaultSplitSetting}.
#' @param sampleSettings An object of type \code{sampleSettings} that specifies any under/over sampling to be done.
#' The default is none.
#' @param trainFractions A numeric vector of training set fractions to create models for.
#' Note that providing \code{trainEvents} overrides any input to
#' \code{trainFractions}.
#' @param trainEvents The number of events in the training set has been shown to be a
#' determinant of model performance. Therefore, it is recommended to provide
#' \code{trainEvents} rather than \code{trainFractions}. Note that providing
#' \code{trainEvents} overrides any input to \code{trainFractions}. The format should be as follows:
#' \itemize{
#' \item \code{c(500, 1000, 1500)} - a vector of training event counts
#' }
#' @param featureEngineeringSettings An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data)
#' @param preprocessSettings An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
#' the target population that must have a covariate for it to be included in model training,
#' and whether to normalise the covariates before training
#' @param modelSettings An object of class \code{modelSettings} created using one of the following functions:
#' \itemize{
#' \item \code{setLassoLogisticRegression()} A lasso logistic regression model
#' \item \code{setGradientBoostingMachine()} A gradient boosting machine
#' \item \code{setAdaBoost()} An AdaBoost model
#' \item \code{setRandomForest()} A random forest model
#' \item \code{setDecisionTree()} A decision tree model
#' \item \code{setKNN()} A KNN model
#' }
#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings}
#' specifying how the logging is done
#' @param executeSettings An object of \code{executeSettings} specifying which parts of the analysis to run
#'
#'
#' @param saveDirectory The path to the directory where the results will be saved. Must be specified; the function stops if \code{NULL}.
#' @param cores The number of computer cores to use if running in parallel
#' @param parallel Whether to run the code in parallel
#'
#' @return A learning curve object containing the various performance measures
#' obtained by the model for each training set fraction. It can be plotted
#' using \code{plotLearningCurve}.
#' @examplesIf rlang::is_installed("parallel")
#' \donttest{ \dontshow{ # takes too long }
#' data("simulationProfile")
#' plpData <- simulatePlpData(simulationProfile, n = 1000)
#' outcomeId <- 3
#' modelSettings <- setLassoLogisticRegression(seed = 42)
#' learningCurve <- createLearningCurve(plpData, outcomeId, modelSettings = modelSettings,
#' saveDirectory = file.path(tempdir(), "learningCurve"), cores = 2)
#' # clean up
#' unlink(file.path(tempdir(), "learningCurve"), recursive = TRUE)
#' }
#' @export
createLearningCurve <- function(
plpData,
outcomeId,
parallel = TRUE,
cores = 4,
modelSettings,
saveDirectory = NULL,
analysisId = "learningCurve",
populationSettings = createStudyPopulationSettings(),
splitSettings = createDefaultSplitSetting(),
trainFractions = c(0.25, 0.50, 0.75),
trainEvents = NULL,
sampleSettings = createSampleSettings(),
featureEngineeringSettings = createFeatureEngineeringSettings(),
preprocessSettings = createPreprocessSettings(
minFraction = 0.001,
normalize = TRUE
),
logSettings = createLogSettings(),
executeSettings = createExecuteSettings(
runSplitData = TRUE,
runSampleData = FALSE,
runFeatureEngineering = FALSE,
runPreprocessData = TRUE,
runModelDevelopment = TRUE,
runCovariateSummary = FALSE
)) {
if (is.null(saveDirectory)) {
stop("saveDirectory must be specified")
}
if (is.null(analysisId)) {
analysisId <- gsub(":", "", gsub("-", "", gsub(" ", "", Sys.time())))
}
# if trainEvents is provided override trainFractions input
if (!is.null(trainEvents)) {
trainFractions <- getTrainFractions(
trainEvents,
plpData,
outcomeId,
populationSettings,
splitSettings
)
}
# record global start time
ExecutionDateTime <- Sys.time()
if (parallel) {
rlang::check_installed("parallel")
if (is.null(cores)) {
ParallelLogger::logInfo(paste0("Number of cores not specified"))
cores <- parallel::detectCores()
ParallelLogger::logInfo(paste0("Using all ", cores))
ParallelLogger::logInfo(paste0("Set cores input to use fewer..."))
}
# save data
savePlpData(plpData, file.path(saveDirectory, "data"))
# code to run in parallel
getLcSettings <- function(i) {
result <- list(
plpData = file.path(saveDirectory, "data"),
outcomeId = outcomeId,
analysisId = paste0(analysisId, i),
populationSettings = populationSettings,
splitSettings = splitSettings,
sampleSettings = sampleSettings,
featureEngineeringSettings = featureEngineeringSettings,
preprocessSettings = preprocessSettings,
modelSettings = modelSettings,
logSettings = logSettings,
executeSettings = executeSettings,
saveDirectory = saveDirectory
)
result$splitSettings$train <- trainFractions[i]
return(result)
}
lcSettings <- lapply(seq_along(trainFractions), getLcSettings)
cluster <- ParallelLogger::makeCluster(numberOfThreads = cores)
ParallelLogger::clusterRequire(cluster, c("PatientLevelPrediction", "Andromeda", "FeatureExtraction"))
learningCurve <- ParallelLogger::clusterApply(
cluster = cluster,
x = lcSettings,
fun = lcWrapper,
stopOnError = FALSE,
progressBar = TRUE
)
ParallelLogger::stopCluster(cluster)
} else {
# code to run not in parallel
# number of training set fractions
nRuns <- length(trainFractions)
settings <- list(
plpData = quote(plpData),
outcomeId = outcomeId,
analysisId = analysisId,
populationSettings = populationSettings,
splitSettings = splitSettings,
sampleSettings = sampleSettings,
featureEngineeringSettings = featureEngineeringSettings,
preprocessSettings = preprocessSettings,
modelSettings = modelSettings,
logSettings = logSettings,
executeSettings = executeSettings,
saveDirectory = saveDirectory
)
learningCurve <- lapply(seq_len(nRuns), function(i) {
settings$splitSettings$train <- trainFractions[i]
settings$analysisId <- paste0(settings$analysisId, "_", i)
result <- do.call(runPlp, settings)
result <- learningCurveHelper(
result = result,
trainFractions = trainFractions[i]
)
return(result)
})
}
learningCurve <- do.call(rbind, learningCurve)
learningCurve <- tidyr::pivot_wider(
data = learningCurve,
names_from = "name",
values_from = "value"
)
endTime <- Sys.time()
TotalExecutionElapsedTime <-
as.numeric(difftime(endTime, ExecutionDateTime,
units = "secs"
))
ParallelLogger::logInfo("Finished in ", round(TotalExecutionElapsedTime), " secs.")
return(learningCurve)
}
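
# The returned learning curve is a wide data.frame with one row per training
# set fraction. A minimal sketch of its shape (illustrative values; the metric
# columns depend on what runPlp reports in evaluationStatistics):
#
#   trainFraction executionTime nPredictors Train_AUROC Test_AUROC ...
#              25           312          45        0.81       0.72 ...
#              50           598          61        0.80       0.74 ...
#              75           871          70        0.79       0.75 ...
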
lcWrapper <- function(settings) {
plpData <- PatientLevelPrediction::loadPlpData(settings$plpData)
settings$plpData <- quote(plpData)
result <- tryCatch(
{
do.call(runPlp, settings)
},
warning = function(war) {
ParallelLogger::logInfo(paste0("Warning from runPlp: ", war))
return(NULL)
},
error = function(err) {
ParallelLogger::logError(paste0("Error from runPlp: ", err))
return(NULL)
}
)
if (!is.null(result)) {
ParallelLogger::logInfo("Extracting performance for learning curve...")
final <- learningCurveHelper(result, settings$splitSettings$train)
return(final)
} else {
return(c())
}
}
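
# lcWrapper is called once per element of lcSettings by
# ParallelLogger::clusterApply. Each settings list (see getLcSettings above)
# carries the *path* to the saved plpData rather than the object itself, so
# every worker loads its own copy, and splitSettings$train is overridden with
# that run's training fraction. A minimal sketch of a single call:
#
# lcWrapper(getLcSettings(1))
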
getTrainFractions <- function(
trainEvents,
plpData,
outcomeId,
populationSettings,
splitSettings) {
population <- do.call(
createStudyPopulation,
list(
plpData = plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
)
# compute training set fractions from training events using the observed event rate
outcomeRate <- sum(population$outcomeCount) / nrow(population)
samplesRequired <- trainEvents / outcomeRate
trainFractionsTemp <- samplesRequired / nrow(population)
# filter out no. of events that would exceed the available training set size
binaryMask <- trainFractionsTemp <= (1.0 - splitSettings$test)
# override any input to trainFractions with event-based training fractions
trainFractions <- trainFractionsTemp[binaryMask]
# Check if any train fractions could be associated with the provided events
if (!length(trainFractions)) {
# If not, fall back on default train fractions
trainFractions <- c(0.25, 0.50, 0.75)
}
return(trainFractions)
}
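
# Worked example of the event-to-fraction conversion above (illustrative
# numbers): for a population of 10,000 subjects with 500 outcome events the
# event rate is 500 / 10000 = 0.05. Requesting trainEvents = c(100, 250, 400)
# gives samplesRequired = c(100, 250, 400) / 0.05 = c(2000, 5000, 8000) and
# hence trainFractionsTemp = c(0.20, 0.50, 0.80). With splitSettings$test =
# 0.25, only fractions <= 0.75 survive the filter, so the function returns
# c(0.20, 0.50).
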
learningCurveHelper <- function(result, trainFractions) {
executeTime <- result$executionSummary$TotalExecutionElapsedTime
nPredictors <- result$model$covariateImportance %>%
dplyr::filter(.data$covariateValue != 0) %>%
dplyr::tally() %>%
dplyr::pull()
# evaluationStatistics is a data.frame with columns 'evaluation','metric','value'
result <- result$performanceEvaluation$evaluationStatistics
result$name <- paste(result$evaluation, result$metric, sep = "_")
result <- result %>% dplyr::select("name", "value")
result <- rbind(
c("executionTime", executeTime),
result,
c("nPredictors", nPredictors)
)
result$trainFraction <- trainFractions * 100
return(result)
}
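
# learningCurveHelper returns a long data.frame with one row per metric, e.g.
# (a sketch with illustrative values, for a single run with trainFractions =
# 0.25):
#
#   name            value  trainFraction
#   executionTime   312    25
#   Train_AUROC     0.81   25
#   Test_AUROC      0.72   25
#   nPredictors     45     25
#
# createLearningCurve rbinds these per-fraction results and pivots them wide,
# giving one row per training set fraction.
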
#' @title plotLearningCurve
#'
#' @description Create a plot of the learning curve using the object returned
#' from \code{createLearningCurve}.
#'
#' @param learningCurve An object returned by \code{\link{createLearningCurve}}
#' function.
#' @param metric Specifies the metric to be plotted:
#' \itemize{
#' \item{\code{'AUROC'} - use the area under the Receiver Operating
#' Characteristic curve}
#' \item{\code{'AUPRC'} - use the area under the Precision-Recall curve}
#' \item{\code{'sBrier'} - use the scaled Brier score}
#' }
#' @param abscissa Specify the abscissa metric to be plotted:
#' \itemize{
#' \item{\code{'events'} - use number of events}
#' \item{\code{'observations'} - use number of observations}
#' }
#' @param plotTitle Title of the learning curve plot.
#' @param plotSubtitle Subtitle of the learning curve plot.
#' @param fileName Filename of plot to be saved, for example \code{'plot.png'}.
#' See the function \code{ggsave} in the ggplot2 package for supported file
#' formats.
#'
#' @return
#' A ggplot object. Use the \code{\link[ggplot2]{ggsave}} function to save to
#' file in a different format.
#' @examplesIf rlang::is_installed("parallel")
#' \donttest{ \dontshow{ # takes too long to run }
#' data("simulationProfile")
#' plpData <- simulatePlpData(simulationProfile, n = 1000)
#' outcomeId <- 3
#' modelSettings <- setLassoLogisticRegression(seed = 42)
#' learningCurve <- createLearningCurve(plpData, outcomeId, modelSettings = modelSettings,
#' saveDirectory = file.path(tempdir(), "learningCurve"), cores = 2)
#' plotLearningCurve(learningCurve)
#' }
#' @export
plotLearningCurve <- function(
learningCurve,
metric = "AUROC",
abscissa = "events",
plotTitle = "Learning Curve",
plotSubtitle = NULL,
fileName = NULL) {
rlang::check_installed("ggplot2",
reason = "plotLearningCurve requires the ggplot2 package to be installed.")
tidyLearningCurve <- NULL
yAxisRange <- NULL
y <- NULL
learningCurve <- as.data.frame(learningCurve)
# check for performance metric to plot
if (metric == "AUROC") {
# create a data.frame with evaluation type and AUROC
tidyLearningCurve <- learningCurve %>%
dplyr::rename(
Occurrences = "Train_outcomeCount",
Observations = "Train_populationSize"
) %>%
dplyr::select("trainFraction", "Occurrences", "Observations", "Test_AUROC", "Train_AUROC")
for (i in seq_len(ncol(tidyLearningCurve))) {
tidyLearningCurve[, i] <- as.double(as.character(tidyLearningCurve[, i]))
}
tidyLearningCurve <- tidyr::pivot_longer(
data = tidyLearningCurve,
cols = !tidyr::all_of(c("trainFraction", "Occurrences", "Observations")),
values_to = "value",
names_to = "variable"
)
tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x) strsplit(as.character(x), "_")[[1]][1])
# define plot properties
yAxisRange <- c(0.5, 1.0)
} else if (metric == "AUPRC") {
# tidy up dataframe
tidyLearningCurve <- learningCurve %>%
dplyr::rename(
Occurrences = "Train_outcomeCount",
Observations = "Train_populationSize"
) %>%
dplyr::select("trainFraction", "Occurrences", "Observations", "Test_AUPRC", "Train_AUPRC")
for (i in seq_len(ncol(tidyLearningCurve))) {
tidyLearningCurve[, i] <- as.double(as.character(tidyLearningCurve[, i]))
}
tidyLearningCurve <- tidyr::pivot_longer(
data = tidyLearningCurve,
cols = !tidyr::all_of(c("trainFraction", "Occurrences", "Observations")),
values_to = "value",
names_to = "variable"
)
tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x) strsplit(as.character(x), "_")[[1]][1])
# define plot properties
yAxisRange <- c(0.0, 1.0)
} else if (metric == "sBrier") {
# tidy up dataframe
tidyLearningCurve <- learningCurve %>%
dplyr::rename(
Occurrences = "Train_outcomeCount",
Observations = "Train_populationSize"
) %>%
dplyr::select("trainFraction", "Occurrences", "Observations", "Test_brier score scaled", "Train_brier score scaled")
for (i in seq_len(ncol(tidyLearningCurve))) {
tidyLearningCurve[, i] <- as.double(as.character(tidyLearningCurve[, i]))
}
tidyLearningCurve <- tidyr::pivot_longer(
data = tidyLearningCurve,
cols = !tidyr::all_of(c("trainFraction", "Occurrences", "Observations")),
values_to = "value",
names_to = "variable"
)
tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x) strsplit(as.character(x), "_")[[1]][1])
# define plot properties
yAxisRange <- c(0.0, 1.0)
} else {
stop("An incorrect metric has been specified.")
}
if (abscissa == "observations") {
abscissa <- "Observations"
abscissaLabel <- "No. of observations"
} else if (abscissa == "events") {
abscissa <- "Occurrences"
abscissaLabel <- "No. of events"
} else {
stop("An incorrect abscissa has been specified.")
}
# create plot object
plot <- tidyLearningCurve %>%
ggplot2::ggplot(ggplot2::aes(
x = .data[[abscissa]], y = .data[["value"]],
col = .data[["Dataset"]]
)) +
ggplot2::geom_line() +
ggplot2::coord_cartesian(ylim = yAxisRange, expand = FALSE) +
ggplot2::labs(
title = plotTitle, subtitle = plotSubtitle,
x = abscissaLabel, y = metric
) +
ggplot2::theme_light()
# save plot, if the function call provides a file name
if ((!is.null(fileName)) && (is.character(fileName))) {
ggplot2::ggsave(fileName, plot, width = 5, height = 4.5, dpi = 400)
}
return(plot)
}
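
# Example usage (a minimal sketch; assumes `learningCurve` was produced by
# createLearningCurve as in the examples above):
#
# plotLearningCurve(
#   learningCurve,
#   metric = "AUPRC",
#   abscissa = "observations",
#   plotTitle = "Learning Curve",
#   plotSubtitle = "AUPRC by training set size",
#   fileName = file.path(tempdir(), "learningCurve.png")
# )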