Nothing
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#' Create an empty cohort definition set
#'
#' @description
#' This function creates an empty cohort set data.frame for use
#' with \code{generateCohortSet}.
#'
#' @param verbose When TRUE, descriptions of each field in the data.frame are
#' returned
#'
#' @return
#' Invisibly returns an empty cohort set data.frame
#'
#' @export
createEmptyCohortDefinitionSet <- function(verbose = FALSE) {
checkmate::assert_logical(verbose)
df <- data.frame(
cohortId = numeric(),
cohortName = character(),
sql = character(),
json = character()
)
if (verbose) {
print(df)
}
invisible(df)
}
.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) {
checkmate::assert_data_frame(x)
df <- createEmptyCohortDefinitionSet(verbose = FALSE)
# Compare the column names from the input x to an empty cohort
# definition set to ensure the required columns are present
cohortDefinitionSetColumns <- colnames(df)
matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns)
columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns)
if (!columnNamesMatch && emitWarning) {
columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x))
warningMessage <- paste0(
"The following columns were missing in your cohortDefinitionSet: ",
paste(columnsMissing, collapse = ","),
". A cohortDefinitionSet requires the following columns: ",
paste(cohortDefinitionSetColumns, collapse = ",")
)
warning(warningMessage)
}
invisible(columnNamesMatch)
}
#' Is the data.frame a cohort definition set?
#'
#' @description
#' This function checks a data.frame to verify it holds the expected format
#' for a cohortDefinitionSet.
#'
#' @param x The data.frame to check
#'
#' @return
#' Returns TRUE if the input is a cohortDefinitionSet or returns FALSE
#' with warnings on any violations
#'
#' @export
isCohortDefinitionSet <- function(x) {
columnNamesMatch <- .cohortDefinitionSetHasRequiredColumns(x = x, emitWarning = TRUE)
dataTypesMatch <- FALSE
if (columnNamesMatch) {
dataTypesMatch <- checkAndFixCohortDefinitionSetDataTypes(
x = x,
fixDataTypes = FALSE,
emitWarning = TRUE
)$dataTypesMatch
}
return(columnNamesMatch && dataTypesMatch)
}
#' Check if a cohort definition set is using the proper data types
#'
#' @description
#' This function checks a data.frame to verify it holds the expected format
#' for a cohortDefinitionSet's data types and can optionally fix data types
#' that do not match the specification.
#'
#' @param x The cohortDefinitionSet data.frame to check
#'
#' @param fixDataTypes When TRUE, this function will attempt to fix the data types
#' to match the specification. @seealso [createEmptyCohortDefinitionSet()].
#'
#' @param emitWarning When TRUE, this function will emit warning messages when problems are
#' encountered.
#'
#' @return
#' Returns a list() of the following form:
#'
#' list(
#' dataTypesMatch = TRUE/FALSE,
#' x = data.frame()
#' )
#'
#' dataTypesMatch == TRUE when the supplied data.frame x matches the cohortDefinitionSet
#' specification's data types.
#'
#' If fixDataTypes == TRUE, x will hold the original data from x with the
#' data types corrected. Otherwise x will hold the original value passed to this
#' function.
#'
#' @export
checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emitWarning = FALSE) {
checkmate::assert_data_frame(x)
df <- createEmptyCohortDefinitionSet(verbose = FALSE)
cohortDefinitionSetColumns <- colnames(df)
columnNamesMatch <- .cohortDefinitionSetHasRequiredColumns(x = x, emitWarning = emitWarning)
if (!columnNamesMatch) {
stop("Cannot check and fix cohortDefinitionSet since it is missing required columns.")
}
# Compare the data types from the input x to an empty cohort
# definition set to ensure the same data types (or close enough)
# are present
dataTypesMatch <- FALSE
# Subset x to the required columns
xSubset <- x[, cohortDefinitionSetColumns]
# Get the data types
xDataTypes <- sapply(xSubset, typeof)
# Get the reference data types
cohortDefinitionSetDataTypes <- sapply(df, typeof)
# Check if the data types match
# NOTE: createEmptyCohortDefinitionSet() is the reference for the data
# types. cohortId is declared as a numeric but an integer is also fine
dataTypesMatch <- (xDataTypes[1] %in% c("integer", "double") && all(xDataTypes[2:4] == "character"))
# Create the cohortDefinitionSetSpec from the names/data types for reference
cohortDefinitionSetSpec <- data.frame(
columnName = names(xDataTypes),
dataType = xDataTypes
)
if (!dataTypesMatch && emitWarning) {
dataTypesMismatch <- setdiff(x = cohortDefinitionSetDataTypes, y = xDataTypes)
# Create a column for the warning message
cohortDefinitionSetSpec$columnNameWithDataType <- paste(cohortDefinitionSetSpec$columnName, cohortDefinitionSetSpec$dataType, sep = " == ")
userSuppliedCohortDefinitionSetDataTypes <- paste(names(x[1 == 0, ]), "==", sapply(x[1 == 0, ], class), collapse = "\n")
warningMessage <- paste0("Your cohortDefinitionSet had a mismatch in data types. Please check your cohortDefinitionSet to ensure it conforms to the following expected data types:")
warningMessage <- paste0(warningMessage, "Expected column == data type\n--------------------------\n", paste(cohortDefinitionSetSpec$columnNameWithDataType, collapse = "\n"))
warningMessage <- paste0(warningMessage, "\n--------------------------\n")
warningMessage <- paste0(warningMessage, "Your cohortDefinitionSet \n--------------------------\n", userSuppliedCohortDefinitionSetDataTypes)
warning(warningMessage)
}
# If fixDataTypes, change the data types of the data.frame to
# match the specification
if (!dataTypesMatch && fixDataTypes) {
for (i in 1:nrow(cohortDefinitionSetSpec)) {
colName <- cohortDefinitionSetSpec$columnName[i]
dataType <- paste0("as.", cohortDefinitionSetSpec$dataType[i])
x[[colName]] <- do.call(what = dataType, args = as.list(x[[colName]]))
}
}
return(list(
dataTypesMatch = dataTypesMatch,
x = x
))
}
#' Get a cohort definition set
#'
#' @description
#' This function supports the legacy way of retrieving a cohort definition set
#' from the file system or in a package. This function supports the legacy way of
#' storing a cohort definition set in a package with a CSV file, JSON files,
#' and SQL files in the `inst` folder.
#'
#' @param settingsFileName The name of the CSV file that will hold the cohort information
#' including the cohortId and cohortName
#'
#' @param jsonFolder The name of the folder that will hold the JSON representation
#' of the cohort if it is available in the cohortDefinitionSet
#'
#' @param sqlFolder The name of the folder that will hold the SQL representation
#' of the cohort.
#'
#' @param cohortFileNameFormat Defines the format string for naming the cohort
#' JSON and SQL files. The format string follows the
#' standard defined in the base sprintf function.
#'
#' @param cohortFileNameValue Defines the columns in the cohortDefinitionSet to use
#' in conjunction with the cohortFileNameFormat parameter.
#'
#' @param subsetJsonFolder Defines the folder to store the subset JSON
#'
#' @param packageName The name of the package containing the cohort definitions.
#'
#' @param warnOnMissingJson Provide a warning if a .JSON file is not found for a
#' cohort in the settings file
#'
#' @param verbose When TRUE, extra logging messages are emitted
#'
#' @return
#' Returns a cohort set data.frame
#'
#' @export
getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv",
jsonFolder = "cohorts",
sqlFolder = "sql/sql_server",
cohortFileNameFormat = "%s",
cohortFileNameValue = c("cohortId"),
subsetJsonFolder = "inst/cohort_subset_definitions/",
packageName = NULL,
warnOnMissingJson = TRUE,
verbose = FALSE) {
checkmate::assert_vector(cohortFileNameValue)
checkmate::assert_true(length(cohortFileNameValue) > 0)
getPath <- function(fileName) {
path <- fileName
if (!is.null(packageName)) {
path <- system.file(fileName, package = packageName)
}
if (verbose) {
rlang::inform(paste0(" -- Loading ", basename(fileName), " from ", path))
}
if (!file.exists(path)) {
if (grepl(".json$", tolower(basename(fileName))) && warnOnMissingJson) {
errorMsg <- ifelse(is.null(packageName),
paste0("File not found: ", path),
paste0("File, ", fileName, " not found in package: ", packageName)
)
warning(errorMsg)
}
}
return(path)
}
# Read the settings file which holds the cohortDefinitionSet
rlang::inform("Loading cohortDefinitionSet")
settings <- readCsv(file = getPath(fileName = settingsFileName), warnOnCaseMismatch = FALSE)
assertSettingsColumns(names(settings), getPath(fileName = settingsFileName))
checkmate::assert_true(all(cohortFileNameValue %in% names(settings)))
checkmate::assert_true((!all(.getFileDataColumns() %in% names(settings))))
readFile <- function(fileName) {
if (file.exists(fileName)) {
return(SqlRender::readSql(fileName))
} else {
if (grepl(".json$", tolower(basename(fileName))) && warnOnMissingJson) {
warning(paste0(" --- ", fileName, " not found"))
return(NA)
} else {
stop(paste0("File not found: ", fileName))
}
}
}
loadSubsets <- FALSE
subsetsToLoad <- data.frame()
# Do not attempt to load subset definition
if ("isSubset" %in% colnames(settings)) {
subsetsToLoad <- settings %>%
dplyr::filter(.data$isSubset)
settings <- settings %>%
dplyr::filter(!.data$isSubset)
loadSubsets <- TRUE
}
# Read the JSON/SQL files
fileData <- data.frame()
for (i in 1:nrow(settings)) {
cohortFileNameRoot <- .getFileNameFromCohortDefinitionSet(
cohortDefinitionSetRow = settings[i, ],
cohortFileNameValue = cohortFileNameValue,
cohortFileNameFormat = cohortFileNameFormat
)
cohortFileNameRoot <- .removeNonAsciiCharacters(cohortFileNameRoot)
json <- readFile(fileName = getPath(fileName = file.path(jsonFolder, paste0(cohortFileNameRoot, ".json"))))
sql <- readFile(fileName = getPath(fileName = file.path(sqlFolder, paste0(cohortFileNameRoot, ".sql"))))
fileData <- rbind(fileData, data.frame(
json = json,
sql = sql
))
}
cohortDefinitionSet <- cbind(settings, fileData)
# Loading cohort subset definitions with their associated targets
if (loadSubsets & nrow(subsetsToLoad) > 0) {
if (dir.exists(subsetJsonFolder)) {
rlang::inform("Loading Cohort Subset Definitions")
## Loading subsets that apply to the saved definition sets
for (i in unique(subsetsToLoad$subsetDefinitionId)) {
subsetFile <- file.path(subsetJsonFolder, paste0(i, ".json"))
rlang::inform(paste0("Loading Cohort Subset Defintion ", subsetFile))
subsetDef <- CohortSubsetDefinition$new(ParallelLogger::loadSettingsFromJson(subsetFile))
# Find target cohorts for this subset definition
subsetTargetIds <- unique(subsetsToLoad[subsetsToLoad$subsetDefinitionId == i, ]$subsetParent)
cohortDefinitionSet <- addCohortSubsetDefinition(cohortDefinitionSet,
subsetDef,
targetCohortIds = subsetTargetIds
)
}
} else {
stop("subset definitions defined in settings file but no corresponding subset definition file is associated")
}
}
invisible(cohortDefinitionSet)
}
#' Save the cohort definition set to the file system
#'
#' @description
#' This function saves a cohortDefinitionSet to the file system and provides
#' options for specifying where to write the individual elements: the settings
#' file will contain the cohort information as a CSV specified by the
#' settingsFileName, the cohort JSON is written to the jsonFolder and the SQL
#' is written to the sqlFolder. We also provide a way to specify the
#' json/sql file name format using the cohortFileNameFormat and
#' cohortFileNameValue parameters.
#'
#' @template CohortDefinitionSet
#'
#' @param settingsFileName The name of the CSV file that will hold the cohort information
#' including the cohortId and cohortName
#'
#' @param jsonFolder The name of the folder that will hold the JSON representation
#' of the cohort if it is available in the cohortDefinitionSet
#'
#' @param sqlFolder The name of the folder that will hold the SQL representation
#' of the cohort.
#'
#' @param cohortFileNameFormat Defines the format string for naming the cohort
#' JSON and SQL files. The format string follows the
#' standard defined in the base sprintf function.
#'
#' @param cohortFileNameValue Defines the columns in the cohortDefinitionSet to use
#' in conjunction with the cohortFileNameFormat parameter.
#'
#' @param subsetJsonFolder Defines the folder to store the subset JSON
#'
#' @param verbose When TRUE, logging messages are emitted to indicate export
#' progress.
#'
#' @export
saveCohortDefinitionSet <- function(cohortDefinitionSet,
settingsFileName = "inst/Cohorts.csv",
jsonFolder = "inst/cohorts",
sqlFolder = "inst/sql/sql_server",
cohortFileNameFormat = "%s",
cohortFileNameValue = c("cohortId"),
subsetJsonFolder = "inst/cohort_subset_definitions/",
verbose = FALSE) {
checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
checkmate::assert_vector(cohortFileNameValue)
checkmate::assert_true(length(cohortFileNameValue) > 0)
assertSettingsColumns(names(cohortDefinitionSet))
checkmate::assert_true(all(cohortFileNameValue %in% names(cohortDefinitionSet)))
settingsFolder <- dirname(settingsFileName)
if (!dir.exists(settingsFolder)) {
dir.create(settingsFolder, recursive = TRUE)
}
if (!file.exists(jsonFolder)) {
dir.create(jsonFolder, recursive = TRUE)
}
if (!file.exists(sqlFolder)) {
dir.create(sqlFolder, recursive = TRUE)
}
# Export the cohortDefinitionSet to the settings folder
if (verbose) {
rlang::inform(paste0("Exporting cohortDefinitionSet to ", settingsFileName))
}
# Write the settings file and ensure that the "sql" and "json" columns are
# not included
writeCsv(
x = cohortDefinitionSet[, -which(names(cohortDefinitionSet) %in% .getFileDataColumns())],
file = settingsFileName,
warnOnUploadRuleViolations = FALSE
)
hasSubsets <- hasSubsetDefinitions(cohortDefinitionSet)
# Export the SQL & JSON for each entry
for (i in 1:nrow(cohortDefinitionSet)) {
cohortId <- cohortDefinitionSet$cohortId[i]
cohortName <- .removeNonAsciiCharacters(cohortDefinitionSet$cohortName[i])
json <- ifelse("json" %in% names(cohortDefinitionSet), .removeNonAsciiCharacters(cohortDefinitionSet$json[i]), "{}")
sql <- cohortDefinitionSet$sql[i]
fileNameRoot <- .getFileNameFromCohortDefinitionSet(
cohortDefinitionSetRow = cohortDefinitionSet[i, ],
cohortFileNameValue = cohortFileNameValue,
cohortFileNameFormat = cohortFileNameFormat
)
if (hasSubsets && cohortDefinitionSet$isSubset[i]) {
next # Subsets are saved only as json
}
if (verbose) {
rlang::inform(paste0("Exporting (", i, "/", nrow(cohortDefinitionSet), "): ", cohortName))
}
if (!is.na(json) && nchar(json) > 0) {
SqlRender::writeSql(sql = json, targetFile = file.path(jsonFolder, paste0(fileNameRoot, ".json")))
}
SqlRender::writeSql(sql = sql, targetFile = file.path(sqlFolder, paste0(fileNameRoot, ".sql")))
}
if (hasSubsets) {
for (subsetDefinition in attr(cohortDefinitionSet, "cohortSubsetDefinitions")) {
saveCohortSubsetDefinition(subsetDefinition, subsetJsonFolder)
}
}
rlang::inform("Cohort definition saved")
}
.getSettingsFileRequiredColumns <- function() {
return(c("cohortId", "cohortName"))
}
.getFileDataColumns <- function() {
return(c("json", "sql"))
}
.getFileNameFromCohortDefinitionSet <- function(cohortDefinitionSetRow,
cohortFileNameValue,
cohortFileNameFormat) {
checkmate::assertDataFrame(cohortDefinitionSetRow, min.rows = 1, max.rows = 1, col.names = "named")
# Create the list of arguments to pass to stri_sprintf
# to create the file name
argList <- list(format = cohortFileNameFormat)
for (j in 1:length(cohortFileNameValue)) {
argList <- append(argList, cohortDefinitionSetRow[1, cohortFileNameValue[j]][[1]])
}
fileNameRoot <- do.call(stringi::stri_sprintf, argList)
return(fileNameRoot)
}
.removeNonAsciiCharacters <- function(expression) {
return(stringi::stri_trans_general(expression, "latin-ascii"))
}
#' Custom checkmate assertion for ensuring the settings columns are properly
#' specified
#'
#' @description
#' This function is used to provide a more informative message when ensuring
#' that the columns in the cohort definition set or the CSV file that
#' defines the cohort definition set is properly specified. This function
#' is then bootstrapped upon package initialization (code in CohortGenerator.R)
#' to allow for it to work with the other checkmate assertions as described in:
#' https://mllg.github.io/checkmate/articles/checkmate.html. The assertion function
#' is called assert_settings_columns.
#'
#' @param columnNames The name of the columns found in either the cohortDefinitionSet
#' data frame or from reading the contents of the settingsFile
#'
#' @param settingsFileName The file name of the CSV that defines the cohortDefinitionSet.
#' When NULL, this function assumes the column names are defined
#' in a data.frame representation of the cohortDefinitionSet
#' @return
#' Returns TRUE if all required columns are found otherwise it returns an error
#' @noRd
#' @keywords internal
checkSettingsColumns <- function(columnNames, settingsFileName = NULL) {
settingsColumns <- .getSettingsFileRequiredColumns()
res <- all(settingsColumns %in% columnNames)
if (!isTRUE(res)) {
sourceDescription <- "cohort definition set"
if (!is.null(settingsFileName)) {
sourceDescription <- "settings file"
}
errorMessage <- paste0("CohortGenerator requires the following columns in the ", sourceDescription, ": ", paste(shQuote(settingsColumns), collapse = ", "), ". The following columns were found: ", paste(shQuote(columnNames), collapse = ", "))
return(errorMessage)
} else {
return(TRUE)
}
}
#' Custom checkmate assertion for ensuring a vector contains only integer numbers,
#' including large ones
#'
#' @description
#' This function is used to provide a more informative message to inform
#' a user that their number must be an integer. Since the
#' cohort definition set allows for storing `numeric` data types, we need
#' to make sure that there are no digits in the mantissa of the cohort ID.
#' NOTE: This function is necessary since checkmate::assert_integerish
#' will still throw an error even in the case where you have a large
#' integer which was not desirable.
#'
#' @param x The vector containing integer/numeric values
#'
#' @param columnName The name of the column where this vector came from. This
#' is used when displaying the error message.
#' @return
#' Returns TRUE if all the values in x are integers
#' @noRd
#' @keywords internal
checkLargeInteger <- function(x, columnName = "cohortId") {
# NOTE: suppressWarnings used to mask
# warning from R which may happen for
# large values in X.
res <- all(suppressWarnings(x %% 1) == 0)
if (!isTRUE(res)) {
errorMessage <- paste0("The column ", columnName, " included non-integer values. Please update and re-try")
return(errorMessage)
} else {
return(TRUE)
}
}
.copySubsetDefinitions <- function(copyToCds, copyFromCds) {
# deep clone any subset definitions
if (hasSubsetDefinitions(copyFromCds)) {
subsetDefintiions <- list()
Map(function(subsetDefinition) {
subsetDefintiions[[length(subsetDefintiions) + 1]] <- subsetDefinition$clone(deep = TRUE)
}, attr(copyFromCds, "cohortSubsetDefinitions"))
attr(copyToCds, "cohortSubsetDefinitions") <- subsetDefintiions
attr(copyToCds, "hasSubsetDefinitions") <- TRUE
}
copyToCds
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.