Nothing
# @file Diagnostics.R
#
# Copyright 2025 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#' Run a list of predictions diagnoses
#'
#' @details
#' This function will run all specified prediction design diagnoses.
#'
#' @param databaseDetails The database settings created using \code{createDatabaseDetails()}
#' @param modelDesignList A list of model designs created using \code{createModelDesign()}
#' @param cohortDefinitions A list of cohort definitions for the target and outcome cohorts
#' @param logSettings The setting spexcifying the logging for the analyses created using \code{createLogSettings()}
#' @param saveDirectory Name of the folder where all the outputs will written to.
#'
#' @return
#' A data frame with the following columns: \tabular{ll}{ \verb{analysisId} \tab The unique identifier
#' for a set of analysis choices.\cr \verb{targetId} \tab The ID of the target cohort populations.\cr
#' \verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved
#' \cr \verb{the settings ids} \tab The ids for all other settings used for model development.\cr }
#' @export
diagnoseMultiplePlp <- function(
databaseDetails = createDatabaseDetails(),
modelDesignList = list(
createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
),
cohortDefinitions = NULL,
logSettings = createLogSettings(
verbosity = "DEBUG",
timeStamp = TRUE,
logName = "diagnosePlp Log"
),
saveDirectory = NULL) {
# input checks
if (is.null(saveDirectory)) {
stop("saveDirectory must be specified")
}
checkIsClass(databaseDetails, c("databaseDetails"))
checkIsClass(modelDesignList, c("list", "modelDesign"))
checkIsClass(logSettings, "logSettings")
checkIsClass(saveDirectory, "character")
if (!dir.exists(saveDirectory)) {
dir.create(saveDirectory, recursive = TRUE)
}
if (is.null(cohortDefinitions)) {
cohortIds <- unlist(
lapply(
X = 1:length(modelDesignList),
FUN = function(i) {
c(
modelDesignList[[i]]$targetId,
modelDesignList[[i]]$outcomeId
)
}
)
)
cohortDefinitions <- lapply(
X = cohortIds,
FUN = function(x) {
list(
id = x,
name = paste0("Cohort: ", x)
)
}
)
}
settingstable <- convertToJson(modelDesignList, cohortDefinitions) # from runMultiplePlp.R
if (nrow(settingstable) != length(modelDesignList)) {
stop("Error in settingstable")
}
# save the settings: TODO fix
utils::write.csv(
x = settingstable %>% dplyr::select(
"analysisId",
"targetId",
"targetName",
"outcomeId",
"outcomeName",
"dataLocation"
),
file.path(saveDirectory, "settings.csv"),
row.names = FALSE
)
# group the outcomeIds per combination of data extraction settings
dataSettings <- settingstable %>%
dplyr::group_by(
.data$targetId,
.data$covariateSettings,
.data$restrictPlpDataSettings,
.data$dataLocation
) %>%
dplyr::summarise(
outcomeIds = paste(unique(.data$outcomeId), collapse = ",")
)
# extract data
for (i in 1:nrow(as.data.frame(dataSettings))) {
dataExists <- length(dir(file.path(saveDirectory, dataSettings$dataLocation[i]))) > 0
if (!dataExists) {
ParallelLogger::logInfo(paste("Extracting data for cohort", dataSettings$targetId[i], "to", file.path(saveDirectory, dataSettings$dataLocation[i])))
databaseDetails$targetId <- dataSettings$targetId[i]
databaseDetails$outcomeIds <- strsplit(dataSettings$outcomeIds[i], ",")[[1]]
plpDataSettings <- list(
databaseDetails = databaseDetails,
covariateSettings = ParallelLogger::convertJsonToSettings(dataSettings$covariateSettings[i]),
restrictPlpDataSettings = ParallelLogger::convertJsonToSettings(dataSettings$restrictPlpDataSettings[i])
)
plpData <- tryCatch(
{
do.call(getPlpData, plpDataSettings)
},
error = function(e) {
ParallelLogger::logInfo(e)
return(NULL)
}
)
if (!is.null(plpData)) {
savePlpData(plpData, file.path(saveDirectory, dataSettings$dataLocation[i]))
}
} else {
ParallelLogger::logInfo(paste("Data for cohort", dataSettings$targetId[i], "exists at", file.path(saveDirectory, dataSettings$dataLocation[i])))
}
}
# diagnosePlp
for (i in 1:nrow(as.data.frame(settingstable))) {
modelDesign <- modelDesignList[[i]]
settings <- settingstable[i, ] # just the data locations?
dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation))) > 0
if (dataExists) {
plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation))
diagnoseExists <- file.exists(file.path(saveDirectory, settings$analysisId, "diagnosePlp.rds"))
if (!diagnoseExists) {
diagnosePlpSettings <- list(
plpData = plpData,
outcomeId = modelDesign$outcomeId,
analysisId = settings$analysisId,
populationSettings = modelDesign$populationSettings,
splitSettings = modelDesign$splitSettings,
sampleSettings = modelDesign$sampleSettings,
featureEngineeringSettings = modelDesign$featureEngineeringSettings,
preprocessSettings = modelDesign$preprocessSettings,
modelSettings = modelDesign$modelSettings,
logSettings = logSettings,
saveDirectory = saveDirectory
)
result <- tryCatch(
{
do.call(diagnosePlp, diagnosePlpSettings)
},
error = function(e) {
ParallelLogger::logInfo(e)
return(NULL)
}
)
} else {
ParallelLogger::logInfo(paste("Diagnosis ", settings$analysisId, "exists at", file.path(saveDirectory, settings$analysisId)))
}
} # end run per setting
}
return(invisible(settingstable))
}
#' diagnostic - Investigates the prediction problem settings - use before training a model
#'
#' @description
#' This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine
#' whether the prediction problem is worth executing.
#'
#' @details
#' Users can define set of Ts, Os, databases and population settings. A list of data.frames containing details such as
#' follow-up time distribution, time-to-event information, characteriszation details, time from last prior event,
#' observation time distribution.
#'
#' @param plpData An object of type \code{plpData} - the patient level prediction
#' data extracted from the CDM. Can also include an initial population as
#' plpData$popualtion.
#' @param outcomeId (integer) The ID of the outcome.
#' @param analysisId (integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.
#' @param populationSettings An object of type \code{populationSettings} created using \code{createStudyPopulationSettings} that
#' specifies how the data class labels are defined and addition any exclusions to apply to the
#' plpData cohort
#' @param splitSettings An object of type \code{splitSettings} that specifies how to split the data into train/validation/test.
#' The default settings can be created using \code{createDefaultSplitSetting}.
#' @param sampleSettings An object of type \code{sampleSettings} that specifies any under/over sampling to be done.
#' The default is none.
#' @param featureEngineeringSettings An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data)
#' @param preprocessSettings An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
#' target population who must have a covariate for it to be included in the model training
#' and whether to normalise the covariates before training
#' @param modelSettings An object of class \code{modelSettings} created using one of the function:
#' \itemize{
#' \item setLassoLogisticRegression() A lasso logistic regression model
#' \item setGradientBoostingMachine() A gradient boosting machine
#' \item setAdaBoost() An ada boost model
#' \item setRandomForest() A random forest model
#' \item setDecisionTree() A decision tree model
#' }
#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings}
#' specifying how the logging is done
#' @param saveDirectory The path to the directory where the results will be saved (if NULL uses working directory)
#'
#' @return
#' An object containing the model or location where the model is saved, the data selection settings, the preprocessing
#' and training settings as well as various performance measures obtained by the model.
#' \itemize{
#' \item{`distribution`: List for each O of a data.frame containing: i) Time to observation end distribution, ii) Time from observation start distribution, iii) Time to event distribution and iv) Time from last prior event to index distribution (only for patients in T who have O before index) }
#' \item{`incident`: List for each O of incidence of O in T during TAR}
#' \item{`characterization`: List for each O of Characterization of T, TnO, Tn~O}
#' }
#' @examplesIf rlang::is_installed("Eunomia") && rlang::is_installed("curl") && curl::has_internet()
#' \donttest{ \dontshow{ # takes too long }
#'
#' # load the data
#' plpData <- getEunomiaPlpData()
#' populationSettings <- createStudyPopulationSettings(minTimeAtRisk = 1)
#' saveDirectory <- file.path(tempdir(), "diagnosePlp")
#' diagnosis <- diagnosePlp(plpData = plpData, outcomeId = 3, analysisId = 1,
#' populationSettings = populationSettings, saveDirectory = saveDirectory)
#' # clean up
#' unlink(saveDirectory, recursive = TRUE)
#' }
#' @export
diagnosePlp <- function(
plpData = NULL,
outcomeId,
analysisId,
populationSettings,
splitSettings = createDefaultSplitSetting(),
sampleSettings = createSampleSettings(), # default none
saveDirectory = NULL,
featureEngineeringSettings = createFeatureEngineeringSettings(), # default none
modelSettings = setLassoLogisticRegression(), # default to logistic regression
logSettings = createLogSettings(
verbosity = "DEBUG",
timeStamp = TRUE,
logName = "diagnosePlp Log"
),
preprocessSettings = createPreprocessSettings()) {
# start log
analysisPath <- file.path(saveDirectory, analysisId)
logSettings$saveDirectory <- analysisPath
logSettings$logFileName <- "plpLog"
logger <- do.call(createLog, logSettings)
ParallelLogger::registerLogger(logger)
on.exit(closeLog(logger))
participantsDiag <- probastParticipants(
plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
predictorDiag <- probastPredictors(
plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
outcomeDiag <- probastOutcome(
plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
designDiag <- probastDesign(
plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
# Question: what about
# splitSettings, sampleSettings,
# FeatureEngineeringSettings, modelSettings
result <- list(
summary = rbind(
participantsDiag$diagnosticParticipantsAggregate,
predictorDiag$diagnosticPredictorsAggregate,
outcomeDiag$diagnosticOutcomeAggregate,
designDiag$diagnosticDesignAggregate
),
participants = participantsDiag$diagnosticParticipantsFull,
predictors = predictorDiag$diagnosticPredictorsFull,
outcomes = outcomeDiag$diagnosticOutcomeFull,
designs = designDiag$diagnosticDesignFull,
modelDesign = PatientLevelPrediction::createModelDesign(
targetId = attr(plpData$cohorts, "metaData")$targetId,
outcomeId = outcomeId,
restrictPlpDataSettings = plpData$metaData$restrictPlpDataSettings,
covariateSettings = plpData$metaData$covariateSettings,
populationSettings = populationSettings,
featureEngineeringSettings = featureEngineeringSettings,
preprocessSettings = preprocessSettings,
modelSettings = modelSettings,
splitSettings = splitSettings,
sampleSettings = sampleSettings
),
databaseSchema = plpData$metaData$databaseDetails$cdmDatabaseSchema,
databaseId = plpData$metaData$databaseDetails$cdmDatabaseId
)
class(result) <- "diagnosePlp"
if (!is.null(saveDirectory)) {
if (!dir.exists(file.path(saveDirectory, analysisId))) {
dir.create(file.path(saveDirectory, analysisId), recursive = TRUE)
}
saveLocation <- file.path(saveDirectory, analysisId, "diagnosePlp.rds")
ParallelLogger::logInfo(paste0("Saving diagnosePlp to ", saveLocation))
saveRDS(result, saveLocation)
}
return(result)
}
probastDesign <- function(
plpData,
outcomeId,
populationSettings) {
diagnosticAggregate <- c()
diagnosticFull <- c()
population <- PatientLevelPrediction::createStudyPopulation(
plpData = plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
probastId <- "4.1"
if (min(sum(population$outcomeCount > 0), nrow(population) - sum(population$outcomeCount > 0)) >= 1000) {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Pass")
)
} else if ((min(sum(population$outcomeCount > 0), nrow(population) - sum(population$outcomeCount > 0)) >= 100)) {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Unknown")
)
} else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Fail")
)
}
if (!is.null(dim(diagnosticAggregate))) {
diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
dplyr::mutate_if(is.factor, as.character)
colnames(diagnosticAggregate) <- c("probastId", "resultValue")
}
return(
list
(
diagnosticDesignFull = diagnosticFull,
diagnosticDesignAggregate = diagnosticAggregate
)
)
}
probastParticipants <- function(
plpData,
outcomeId,
populationSettings) {
diagnosticAggregate <- c()
diagnosticFull <- c()
# true due to plp
probastId <- "1.1"
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Pass")
)
# appropriate inclusions
## 1.2.1 min prior observation
if (populationSettings$washoutPeriod != 0) {
var <- "washoutPeriod"
probastId <- "1.2.1"
result <- getDiagnostic(
probastId,
plpData,
outcomeId,
populationSettings,
var,
0
)
diagnosticAggregate <- rbind(
diagnosticAggregate,
result$diagnosticAggregate
)
diagnosticFull <- rbind(
diagnosticFull,
result$diagnosticFull
)
}
## 1.2.2 min time-at-risk
if (populationSettings$requireTimeAtRisk && populationSettings$minTimeAtRisk > 0) {
probastId <- "1.2.2"
var <- "minTimeAtRisk"
result <- getDiagnostic(
probastId,
plpData,
outcomeId,
populationSettings,
var,
0
)
diagnosticAggregate <- rbind(
diagnosticAggregate,
result$diagnosticAggregate
)
diagnosticFull <- rbind(
diagnosticFull,
result$diagnosticFull
)
}
## 1.2.3 first exposure only
if (populationSettings$firstExposureOnly) {
probastId <- "1.2.3"
var <- "firstExposureOnly"
default <- FALSE
result <- getDiagnostic(
probastId,
plpData,
outcomeId,
populationSettings,
var,
default
)
diagnosticAggregate <- rbind(
diagnosticAggregate,
result$diagnosticAggregate
)
diagnosticFull <- rbind(
diagnosticFull,
result$diagnosticFull
)
}
## 1.2.4 prior observation
if (populationSettings$removeSubjectsWithPriorOutcome && populationSettings$priorOutcomeLookback > 0) {
probastId <- "1.2.4"
var <- "priorOutcomeLookback"
default <- 0
result <- getDiagnostic(
probastId,
plpData,
outcomeId,
populationSettings,
var,
default
)
diagnosticAggregate <- rbind(
diagnosticAggregate,
result$diagnosticAggregate
)
diagnosticFull <- rbind(
diagnosticFull,
result$diagnosticFull
)
}
if (!is.null(dim(diagnosticAggregate))) {
diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
dplyr::mutate_if(is.factor, as.character)
colnames(diagnosticAggregate) <- c("probastId", "resultValue")
}
return(
list
(
diagnosticParticipantsFull = diagnosticFull,
diagnosticParticipantsAggregate = diagnosticAggregate
)
)
}
getMaxEndDaysFromCovariates <- function(covariateSettings) {
if (inherits(covariateSettings, "covariateSettings")) {
covariateSettings <- list(covariateSettings)
}
vals <- unlist(lapply(covariateSettings, function(x) {
x$endDays
}))
if (length(vals) == 0) {
return(0)
} else {
return(max(vals))
}
}
probastPredictors <- function(
plpData,
outcomeId,
populationSettings) {
diagnosticAggregate <- c()
diagnosticFull <- c()
# true due to plp
probastId <- "2.1"
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Pass")
)
# 2.2.1
# cov end date < tar_start
# covariate + outcome correlation; km of outcome (close to index or not)?
probastId <- "2.2"
if (populationSettings$startAnchor == "cohort start") {
if (populationSettings$riskWindowStart > getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings)) {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Pass")
)
} else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Fail")
)
}
} else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Unknown")
)
}
# KM of outcome
populationSettingsFull <- populationSettings
populationSettingsFull$riskWindowEnd <- 10 * 365
population <- PatientLevelPrediction::createStudyPopulation(
plpData = plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
populationFull <- PatientLevelPrediction::createStudyPopulation(
plpData = plpData,
outcomeId = outcomeId,
populationSettings = populationSettingsFull
)
# dayOfEvent, outcomeAtTime, observedAtStartOfDay
kmObservation <- population %>%
dplyr::group_by(.data$daysToEvent) %>%
dplyr::summarise(
outcomeAtTime = sum(!is.na(.data$daysToEvent))
)
kmObservation$observedAtStartOfDay <- unlist(
lapply(
kmObservation$daysToEvent,
function(x) {
population %>%
dplyr::filter(.data$survivalTime >= x) %>%
dplyr::tally() %>%
dplyr::select("n")
}
)
)
kmObservation$probastId <- probastId
kmObservation$inputType <- "populationSettings"
kmObservationFull <- populationFull %>%
dplyr::group_by(.data$daysToEvent) %>%
dplyr::summarise(
outcomeAtTime = sum(!is.na(.data$daysToEvent))
)
kmObservationFull$observedAtStartOfDay <- unlist(
lapply(
kmObservationFull$daysToEvent,
function(x) {
populationFull %>%
dplyr::filter(.data$survivalTime >= x) %>%
dplyr::tally() %>%
dplyr::select("n")
}
)
)
kmObservationFull$probastId <- probastId
kmObservationFull$inputType <- "10-year"
diagnosticFull <- rbind(kmObservation, kmObservationFull)
# 2.3.1
# cov end_date <=0
probastId <- "2.3"
if (getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings) <= 0) {
if (getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings) < 0) {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Pass")
)
} else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Unknown")
)
}
} else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Fail")
)
}
if (!is.null(dim(diagnosticAggregate))) {
diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
dplyr::mutate_if(is.factor, as.character)
colnames(diagnosticAggregate) <- c("probastId", "resultValue")
}
return(
list
(
diagnosticPredictorsFull = diagnosticFull,
diagnosticPredictorsAggregate = diagnosticAggregate
)
)
}
probastOutcome <- function(
plpData,
outcomeId,
populationSettings) {
diagnosticAggregate <- c()
diagnosticFull <- c()
# true due to plp
probastId <- "3.4"
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Pass")
)
# 3.5 - check the outcome definition doesn't use things before index?
# 3.6 - check tar after covariate end_days
probastId <- "3.6"
if (populationSettings$startAnchor == "cohort start") {
if (populationSettings$riskWindowStart > getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings)) {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Pass")
)
} else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Fail")
)
}
} else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
c(probastId, "Unknown")
)
}
# 3.1.1 - check the outcome rate per gender/age/index year
probastId <- "3.1.1"
# all cohort vs pop
pop <- PatientLevelPrediction::createStudyPopulation(
plpData = plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
popSum <- getOutcomeSummary(
type = "population",
population = pop
)
cohort <- PatientLevelPrediction::createStudyPopulation(
plpData = plpData,
outcomeId = outcomeId,
populationSettings = PatientLevelPrediction::createStudyPopulationSettings(
includeAllOutcomes = FALSE,
firstExposureOnly = FALSE,
washoutPeriod = 0,
removeSubjectsWithPriorOutcome = FALSE,
priorOutcomeLookback = 0,
requireTimeAtRisk = FALSE,
minTimeAtRisk = 0,
riskWindowStart = populationSettings$riskWindowStart,
startAnchor = populationSettings$startAnchor,
riskWindowEnd = populationSettings$riskWindowEnd,
endAnchor = populationSettings$endAnchor
)
)
cohortSum <- getOutcomeSummary(
type = "cohort",
population = cohort
)
# dayOfEvent, outcomeAtTime, observedAtStartOfDay
diagnosticFull <- rbind(
do.call(rbind, popSum),
do.call(rbind, cohortSum)
)
if (!is.null(dim(diagnosticAggregate))) {
diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
dplyr::mutate_if(is.factor, as.character)
colnames(diagnosticAggregate) <- c("probastId", "resultValue")
}
return(
list
(
diagnosticOutcomeFull = diagnosticFull,
diagnosticOutcomeAggregate = diagnosticAggregate
)
)
}
getOutcomeSummary <- function(
type = "population",
population) {
res <- list()
length(res) <- 4
probastId <- "3"
res[[1]] <- population %>%
dplyr::group_by(.data$ageYear) %>%
dplyr::summarise(outcomePercent = sum(.data$outcomeCount > 0) / length(.data$outcomeCount)) %>%
dplyr::mutate(
probastId = probastId,
aggregation = "age",
inputType = type
) %>%
dplyr::rename(xvalue = "ageYear")
res[[2]] <- population %>%
dplyr::group_by(.data$gender) %>%
dplyr::summarise(outcomePercent = sum(.data$outcomeCount > 0) / length(.data$outcomeCount)) %>%
dplyr::mutate(
probastId = probastId,
aggregation = "gender",
inputType = type
) %>%
dplyr::rename(xvalue = "gender")
res[[3]] <- population %>%
dplyr::mutate(
year = substring(.data$cohortStartDate, 1, 4)
) %>%
dplyr::group_by(.data$year) %>%
dplyr::summarise(outcomePercent = sum(.data$outcomeCount > 0) / length(.data$outcomeCount)) %>%
dplyr::mutate(
probastId = probastId,
aggregation = "year",
inputType = type
) %>%
dplyr::rename(xvalue = "year")
res[[4]] <- population %>%
dplyr::mutate(
year = substring(.data$cohortStartDate, 6, 7)
) %>%
dplyr::group_by(.data$year) %>%
dplyr::summarise(outcomePercent = sum(.data$outcomeCount > 0) / length(.data$outcomeCount)) %>%
dplyr::mutate(
probastId = probastId,
aggregation = "month",
inputType = type
) %>%
dplyr::rename(xvalue = "year")
return(res)
}
cos_sim <- function(a, b) {
return(sum(a * b) / sqrt(sum(a^2) * sum(b^2)))
}
getDiagnostic <- function(
probastId,
plpData,
outcomeId,
populationSettings,
var,
defaultValue = 0) {
ParallelLogger::logInfo(paste0("Diagnosing impact of ", var, " in populationSettings"))
populationSettingsCheck <- populationSettings
populationSettingsCheck[var] <- defaultValue
pop <- PatientLevelPrediction::createStudyPopulation(
plpData = plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
popCheck <- PatientLevelPrediction::createStudyPopulation(
plpData = plpData,
outcomeId = outcomeId,
populationSettings = populationSettingsCheck
)
# compare the populations:
diag <- rbind(
data.frame(
probastId = probastId,
design = paste0(var, ": ", defaultValue),
metric = c(
"N", "outcomePercent", "minAge",
"meanAge", "medianAge", "maxAge",
"malePercent"
),
value = c(
nrow(popCheck), sum(popCheck$outcomeCount > 0) / nrow(popCheck) * 100, min(popCheck$ageYear),
mean(popCheck$ageYear), stats::median(popCheck$ageYear), max(popCheck$ageYear),
sum(popCheck$gender == 8507) / nrow(popCheck) * 100
)
),
data.frame(
probastId = probastId,
design = paste0(var, ": ", populationSettings[var]),
metric = c(
"N", "outcomePercent", "minAge",
"meanAge", "medianAge", "maxAge",
"malePercent"
),
value = c(
nrow(pop), sum(pop$outcomeCount > 0) / nrow(pop) * 100, min(pop$ageYear),
mean(pop$ageYear), stats::median(pop$ageYear), max(pop$ageYear),
sum(pop$gender == 8507) / nrow(pop) * 100
)
)
)
diagSim <- cos_sim(
diag %>%
dplyr::filter(.data$design == unique(diag$design)[1]) %>%
dplyr::filter(.data$metric != "N") %>%
dplyr::arrange(.data$metric) %>%
dplyr::select("value"),
diag %>%
dplyr::filter(.data$design == unique(diag$design)[2]) %>%
dplyr::filter(.data$metric != "N") %>%
dplyr::arrange(.data$metric) %>%
dplyr::select("value")
)
return(
list(
diagnosticAggregate = c(
probastId,
diagSim
),
diagnosticFull = diag
)
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.