# @file LearningCurve.R
#
# Copyright 2021 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#' @title createLearningCurve
#'
#' @description Creates a learning curve object, which can be plotted using the
#' \code{plotLearningCurve()} function.
#'
#' @param plpData An object of type \code{plpData} - the patient level prediction
#' data extracted from the CDM.
#' @param outcomeId (integer) The ID of the outcome.
#' @param analysisId (integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.
#' @param populationSettings An object of type \code{populationSettings} created using \code{createStudyPopulationSettings} that
#' specifies how the data class labels are defined and addition any exclusions to apply to the
#' plpData cohort
#' @param splitSettings An object of type \code{splitSettings} that specifies how to split the data into train/validation/test.
#' The default settings can be created using \code{createDefaultSplitSetting}.
#' @param sampleSettings An object of type \code{sampleSettings} that specifies any under/over sampling to be done.
#' The default is none.
#' @param trainFractions A list of training fractions to create models for.
#' Note, providing \code{trainEvents} will override your input to
#' \code{trainFractions}.
#' @param trainEvents Events have shown to be determinant of model performance.
#' Therefore, it is recommended to provide \code{trainEvents} rather than
#' \code{trainFractions}. Note, providing \code{trainEvents} will override
#' your input to \code{trainFractions}. The format should be as follows:
#' \itemize{
#' \item \code{c(500, 1000, 1500) } - a list of training events
#' }
#' @param featureEngineeringSettings An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data)
#' @param preprocessSettings An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
#' target population who must have a covariate for it to be included in the model training
#' and whether to normalise the covariates before training
#' @param modelSettings An object of class \code{modelSettings} created using one of the function:
#' \itemize{
#' \item \code{setLassoLogisticRegression()} A lasso logistic regression model
#' \item \code{setGradientBoostingMachine()} A gradient boosting machine
#' \item \code{setAdaBoost()} An ada boost model
#' \item \code{setRandomForest()} A random forest model
#' \item \code{setDecisionTree()} A decision tree model
#' \item \code{setKNN()} A KNN model
#' }
#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings}
#' specifying how the logging is done
#' @param executeSettings An object of \code{executeSettings} specifying which parts of the analysis to run
#'
#'
#' @param saveDirectory The path to the directory where the results will be saved (if NULL uses working directory)
#' @param cores The number of computer cores to use if running in parallel
#' @param parallel Whether to run the code in parallel
#'
#' @return A learning curve object containing the various performance measures
#' obtained by the model for each training set fraction. It can be plotted
#' using \code{plotLearningCurve}.
#'
#' @examples
#' \dontrun{
#' # define model
#' modelSettings = PatientLevelPrediction::setLassoLogisticRegression()
#'
#' # create learning curve
#' learningCurve <- PatientLevelPrediction::createLearningCurve(population,
#' plpData,
#' modelSettings)
#' # plot learning curve
#' PatientLevelPrediction::plotLearningCurve(learningCurve)
#' }
#'
#' @export
createLearningCurve <- function(
plpData,
outcomeId,
parallel = T,
cores = 4,
modelSettings,
saveDirectory = getwd(),
analysisId = 'learningCurve',
populationSettings = createStudyPopulationSettings(),
splitSettings = createDefaultSplitSetting(),
trainFractions = c(0.25, 0.50, 0.75),
trainEvents = NULL,
sampleSettings = createSampleSettings(),
featureEngineeringSettings = createFeatureEngineeringSettings(),
preprocessSettings = createPreprocessSettings(
minFraction = 0.001,
normalize = T
),
logSettings = createLogSettings(),
executeSettings = createExecuteSettings(
runSplitData = T,
runSampleData = F,
runfeatureEngineering = F,
runPreprocessData = T,
runModelDevelopment = T,
runCovariateSummary = F
)
){
if (is.null(analysisId)) {
analysisId <- gsub(':', '', gsub('-', '', gsub(' ', '', Sys.time())))
}
# if trainEvents is provided override trainFractions input
if (!is.null(trainEvents)) {
trainFractions <- getTrainFractions(
trainEvents,
plpData,
outcomeId,
populationSettings,
splitSettings
)
}
# record global start time
ExecutionDateTime <- Sys.time()
if(parallel){
ensure_installed('parallel')
if(is.null(cores)){
ParallelLogger::logInfo(paste0('Number of cores not specified'))
cores <- parallel::detectCores()
ParallelLogger::logInfo(paste0('Using all ', cores))
ParallelLogger::logInfo(paste0('Set cores input to use fewer...'))
}
# save data
savePlpData(plpData, file.path(saveDirectory,'data'))
# code to run in parallel
getLcSettings <- function(i){
result <- list(
plpData = file.path(saveDirectory,'data'),
outcomeId = outcomeId,
analysisId = paste0(analysisId,i),
populationSettings = populationSettings,
splitSettings = splitSettings,
sampleSettings = sampleSettings,
featureEngineeringSettings = featureEngineeringSettings,
preprocessSettings = preprocessSettings,
modelSettings = modelSettings,
logSettings = logSettings,
executeSettings = executeSettings,
saveDirectory = saveDirectory
)
result$splitSettings$train <- trainFractions[i]
return(result)
}
lcSettings <- lapply(1:length(trainFractions), getLcSettings)
cluster <- ParallelLogger::makeCluster(numberOfThreads = cores)
ParallelLogger::clusterRequire(cluster, c("PatientLevelPrediction", "Andromeda", "FeatureExtraction"))
learningCurve <- ParallelLogger::clusterApply(cluster = cluster,
x = lcSettings,
fun = lcWrapper,
stopOnError = FALSE,
progressBar = TRUE)
ParallelLogger::stopCluster(cluster)
} else{
# code to run not in parallel
# number of training set fractions
nRuns <- length(trainFractions)
settings = list(
plpData = quote(plpData),
outcomeId = outcomeId,
analysisId = analysisId,
populationSettings = populationSettings,
splitSettings = splitSettings,
sampleSettings = sampleSettings,
featureEngineeringSettings = featureEngineeringSettings,
preprocessSettings = preprocessSettings,
modelSettings = modelSettings,
logSettings = logSettings,
executeSettings = executeSettings,
saveDirectory = saveDirectory
)
learningCurve <- lapply(1:nRuns, function(i){
settings$splitSettings$train = trainFractions[i]
settings$analysisId = paste0(settings$analysisId, '_', i)
result <- do.call(runPlp, settings)
result <- learningCurveHelper(
result = result,
trainFractions = trainFractions[i]
)
return(result)
})
}
learningCurve <- do.call(rbind,learningCurve)
learningCurve <- tidyr::pivot_wider(
data = learningCurve,
names_from = 'name',
values_from = 'value'
)
#learningCurve <- reshape2::dcast(data = learningCurve, trainFraction ~ name)
endTime <- Sys.time()
TotalExecutionElapsedTime <-
as.numeric(difftime(endTime, ExecutionDateTime,
units = "secs"))
ParallelLogger::logInfo('Finished in ', round(TotalExecutionElapsedTime), ' secs.')
return(learningCurve)
}
lcWrapper <- function(settings){
plpData <- PatientLevelPrediction::loadPlpData(settings$plpData)
settings$plpData <- quote(plpData)
result <- tryCatch({do.call(runPlp, settings)},
warning = function(war) {
ParallelLogger::logInfo(paste0('a warning: ', war))
},
error = function(err) {
ParallelLogger::logError(paste0('an error: ', err))
return(NULL)
}
)
if(!is.null(result)){
ParallelLogger::logInfo('Extracting performance for learning curve...')
final <- learningCurveHelper(result, settings$splitSettings$train)
return(final)
} else{
return(c())
}
}
getTrainFractions <- function(
trainEvents,
plpData,
outcomeId,
populationSettings,
splitSettings
){
population <- do.call(
createStudyPopulation,
list(
plpData = plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
)
# compute training set fractions from training events
samplesRequired <- trainEvents/(sum(population$outcomeCount/nrow(population)))
trainFractionsTemp <- samplesRequired/nrow(population)
# filter out no. of events that would exceed the available training set size
binaryMask <- trainFractionsTemp <= (1.0 - splitSettings$test)
# override any input to trainFractions with event-based training fractions
trainFractions <- trainFractionsTemp[binaryMask]
# Check if any train fractions could be associated with the provided events
if(!length(trainFractions)) {
# If not, fall back on default train fractions
trainFractions <- c(0.25, 0.50, 0.75)
}
return(trainFractions)
}
learningCurveHelper <- function(result, trainFractions){
executeTime <- result$executionSummary$TotalExecutionElapsedTime
nPredictors <- result$model$covariateImportance %>% dplyr::filter(.data$covariateValue != 0) %>% dplyr::tally() %>% dplyr::pull()
# evaluationStatistics is a data.frame with columns 'evaluation','metric','value'
result <- result$performanceEvaluation$evaluationStatistics
result$name <- paste(result$evaluation, result$metric, sep='_')
result <- result %>% dplyr::select("name", "value")
result <- rbind(
c('executionTime', executeTime),
result,
c('nPredictors', nPredictors)
)
result$trainFraction <- trainFractions * 100
return(result)
}
#' @title plotLearningCurve
#'
#' @description Create a plot of the learning curve using the object returned
#' from \code{createLearningCurve}.
#'
#' @param learningCurve An object returned by \code{\link{createLearningCurve}}
#' function.
#' @param metric Specifies the metric to be plotted:
#' \itemize{
#' \item{\code{'AUROC'} - use the area under the Receiver Operating
#' Characteristic curve}
#' \item{\code{'AUPRC'} - use the area under the Precision-Recall curve}
#' \item{\code{'sBrier'} - use the scaled Brier score}
#' }
#' @param abscissa Specify the abscissa metric to be plotted:
#' \itemize{
#' \item{\code{'events'} - use number of events}
#' \item{\code{'observations'} - use number of observations}
#' }
#' @param plotTitle Title of the learning curve plot.
#' @param plotSubtitle Subtitle of the learning curve plot.
#' @param fileName Filename of plot to be saved, for example \code{'plot.png'}.
#' See the function \code{ggsave} in the ggplot2 package for supported file
#' formats.
#'
#' @return
#' A ggplot object. Use the \code{\link[ggplot2]{ggsave}} function to save to
#' file in a different format.
#'
#' @examples
#' \dontrun{
#' # create learning curve object
#' learningCurve <- createLearningCurve(population,
#' plpData,
#' modelSettings)
#' # plot the learning curve
#' plotLearningCurve(learningCurve)
#' }
#'
#' @export
plotLearningCurve <- function(learningCurve,
metric = "AUROC",
abscissa = "events",
plotTitle = "Learning Curve",
plotSubtitle = NULL,
fileName = NULL){
tidyLearningCurve <- NULL
yAxisRange <- NULL
y <- NULL
learningCurve <- as.data.frame(learningCurve)
# check for performance metric to plot
if(metric == "AUROC") {
# create a data.frame with evalautionType, AUROC
tidyLearningCurve <- learningCurve %>%
dplyr::rename(
Occurrences = "Train_outcomeCount",
Observations = "Train_populationSize" ) %>%
dplyr::select("trainFraction", "Occurrences", "Observations", "Test_AUROC", "Train_AUROC")
for(i in 1:ncol(tidyLearningCurve)){
tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i]))
}
tidyLearningCurve <- tidyr::pivot_longer(
data = as.data.frame(tidyLearningCurve),
cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c('trainFraction', 'Occurrences', 'Observations')],
values_to = "value",
names_to = 'variable'
)
#tidyLearningCurve <- reshape2::melt(as.data.frame(tidyLearningCurve), id.vars = c('trainFraction', 'Occurrences', 'Observations'))
tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x)strsplit(as.character(x), '_')[[1]][1])
# define plot properties
yAxisRange <- c(0.5, 1.0)
} else if (metric == "AUPRC") {
# tidy up dataframe
tidyLearningCurve <- learningCurve %>%
dplyr::rename(
Occurrences = "Train_outcomeCount",
Observations = "Train_populationSize" ) %>%
dplyr::select("trainFraction", "Occurrences", "Observations", "Test_AUPRC", "Train_AUPRC")
for(i in 1:ncol(tidyLearningCurve)){
tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i]))
}
tidyLearningCurve <- tidyr::pivot_longer(
data = as.data.frame(tidyLearningCurve),
cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c('trainFraction', 'Occurrences', 'Observations')],
values_to = "value",
names_to = 'variable'
)
#tidyLearningCurve <- reshape2::melt(as.data.frame(tidyLearningCurve), id.vars = c('trainFraction', 'Occurrences', 'Observations'))
tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x)strsplit(as.character(x), '_')[[1]][1])
# define plot properties
yAxisRange <- c(0.0, 1.0)
} else if (metric == "sBrier") {
# tidy up dataframe
tidyLearningCurve <- learningCurve %>%
dplyr::rename(
Occurrences = "Train_outcomeCount",
Observations = "Train_populationSize" ) %>%
dplyr::select("trainFraction", "Occurrences", "Observations", "Test_brier score scaled", "Train_brier score scaled")
for(i in 1:ncol(tidyLearningCurve)){
tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i]))
}
tidyLearningCurve <- tidyr::pivot_longer(
data = as.data.frame(tidyLearningCurve),
cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c('trainFraction', 'Occurrences', 'Observations')],
values_to = "value",
names_to = 'variable'
)
#tidyLearningCurve <- reshape2::melt(as.data.frame(tidyLearningCurve), id.vars = c('trainFraction', 'Occurrences', 'Observations'))
tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x)strsplit(as.character(x), '_')[[1]][1])
# define plot properties
yAxisRange <- c(0.0, 1.0)
} else {
stop("An incorrect metric has been specified.")
}
if (abscissa == "observations") {
abscissa <- "Observations"
abscissaLabel <- "No. of observations"
} else if (abscissa == "events") {
abscissa <- "Occurrences"
abscissaLabel <- "No. of events"
} else {
stop("An incorrect abscissa has been specified.")
}
# create plot object
plot <- tidyLearningCurve %>%
ggplot2::ggplot(ggplot2::aes(x = .data[[abscissa]], y = .data[['value']],
col = .data[["Dataset"]])) +
ggplot2::geom_line() +
ggplot2::coord_cartesian(ylim = yAxisRange, expand = FALSE) +
ggplot2::labs(title = plotTitle, subtitle = plotSubtitle,
x = abscissaLabel, y = metric) +
ggplot2::theme_light()
# save plot, if fucntion call provides a file name
if ((!is.null(fileName)) & (is.character(fileName))) {
ggplot2::ggsave(fileName, plot, width = 5, height = 4.5, dpi = 400)
}
return(plot)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.