R/CohortDefinitionSet.R

Defines functions .copySubsetDefinitions checkLargeInteger checkSettingsColumns .removeNonAsciiCharacters .getFileNameFromCohortDefinitionSet .getFileDataColumns .getSettingsFileRequiredColumns saveCohortDefinitionSet getCohortDefinitionSet checkAndFixCohortDefinitionSetDataTypes isCohortDefinitionSet .cohortDefinitionSetHasRequiredColumns createEmptyCohortDefinitionSet

Documented in checkAndFixCohortDefinitionSetDataTypes createEmptyCohortDefinitionSet getCohortDefinitionSet isCohortDefinitionSet saveCohortDefinitionSet

# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' Create an empty cohort definition set
#'
#' @description
#' This function creates an empty cohort set data.frame for use
#' with \code{generateCohortSet}.
#'
#' @param verbose When TRUE, descriptions of each field in the data.frame are
#'                returned
#'
#' @return
#' Invisibly returns an empty cohort set data.frame
#'
#' @export
createEmptyCohortDefinitionSet <- function(verbose = FALSE) {
  checkmate::assert_logical(verbose)
  df <- data.frame(
    cohortId = numeric(),
    cohortName = character(),
    sql = character(),
    json = character()
  )
  if (verbose) {
    print(df)
  }
  invisible(df)
}

.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) {
  checkmate::assert_data_frame(x)
  df <- createEmptyCohortDefinitionSet(verbose = FALSE)

  # Compare the column names from the input x to an empty cohort
  # definition set to ensure the required columns are present
  cohortDefinitionSetColumns <- colnames(df)
  matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns)
  columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns)

  if (!columnNamesMatch && emitWarning) {
    columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x))
    warningMessage <- paste0(
      "The following columns were missing in your cohortDefinitionSet: ",
      paste(columnsMissing, collapse = ","),
      ". A cohortDefinitionSet requires the following columns: ",
      paste(cohortDefinitionSetColumns, collapse = ",")
    )
    warning(warningMessage)
  }
  invisible(columnNamesMatch)
}

#' Is the data.frame a cohort definition set?
#'
#' @description
#' This function checks a data.frame to verify it holds the expected format
#' for a cohortDefinitionSet.
#'
#' @param x  The data.frame to check
#'
#' @return
#' Returns TRUE if the input is a cohortDefinitionSet or returns FALSE
#' with warnings on any violations
#'
#' @export
isCohortDefinitionSet <- function(x) {
  columnNamesMatch <- .cohortDefinitionSetHasRequiredColumns(x = x, emitWarning = TRUE)
  dataTypesMatch <- FALSE
  if (columnNamesMatch) {
    dataTypesMatch <- checkAndFixCohortDefinitionSetDataTypes(
      x = x,
      fixDataTypes = FALSE,
      emitWarning = TRUE
    )$dataTypesMatch
  }
  return(columnNamesMatch && dataTypesMatch)
}

#' Check if a cohort definition set is using the proper data types
#'
#' @description
#' This function checks a data.frame to verify it holds the expected format
#' for a cohortDefinitionSet's data types and can optionally fix data types
#' that do not match the specification.
#'
#' @param x  The cohortDefinitionSet data.frame to check
#'
#' @param fixDataTypes When TRUE, this function will attempt to fix the data types
#'                     to match the specification. @seealso [createEmptyCohortDefinitionSet()].
#'
#' @param emitWarning  When TRUE, this function will emit warning messages when problems are
#'                     encountered.
#'
#' @return
#' Returns a list() of the following form:
#'
#' list(
#'    dataTypesMatch = TRUE/FALSE,
#'    x = data.frame()
#' )
#'
#' dataTypesMatch == TRUE when the supplied data.frame x matches the cohortDefinitionSet
#' specification's data types.
#'
#' If fixDataTypes == TRUE, x will hold the original data from x with the
#' data types corrected. Otherwise x will hold the original value passed to this
#' function.
#'
#' @export
checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emitWarning = FALSE) {
  checkmate::assert_data_frame(x)
  df <- createEmptyCohortDefinitionSet(verbose = FALSE)
  cohortDefinitionSetColumns <- colnames(df)

  columnNamesMatch <- .cohortDefinitionSetHasRequiredColumns(x = x, emitWarning = emitWarning)
  if (!columnNamesMatch) {
    stop("Cannot check and fix cohortDefinitionSet since it is missing required columns.")
  }

  # Compare the data types from the input x to an empty cohort
  # definition set to ensure the same data types (or close enough)
  # are present
  dataTypesMatch <- FALSE
  # Subset x to the required columns
  xSubset <- x[, cohortDefinitionSetColumns]
  # Get the data types
  xDataTypes <- sapply(xSubset, typeof)
  # Get the reference data types
  cohortDefinitionSetDataTypes <- sapply(df, typeof)
  # Check if the data types match
  # NOTE: createEmptyCohortDefinitionSet() is the reference for the data
  # types. cohortId is declared as a numeric but an integer is also fine
  dataTypesMatch <- (xDataTypes[1] %in% c("integer", "double") && all(xDataTypes[2:4] == "character"))
  # Create the cohortDefinitionSetSpec from the names/data types for reference
  cohortDefinitionSetSpec <- data.frame(
    columnName = names(xDataTypes),
    dataType = xDataTypes
  )
  if (!dataTypesMatch && emitWarning) {
    dataTypesMismatch <- setdiff(x = cohortDefinitionSetDataTypes, y = xDataTypes)
    # Create a column for the warning message
    cohortDefinitionSetSpec$columnNameWithDataType <- paste(cohortDefinitionSetSpec$columnName, cohortDefinitionSetSpec$dataType, sep = " == ")
    userSuppliedCohortDefinitionSetDataTypes <- paste(names(x[1 == 0, ]), "==", sapply(x[1 == 0, ], class), collapse = "\n")
    warningMessage <- paste0("Your cohortDefinitionSet had a mismatch in data types. Please check your cohortDefinitionSet to ensure it conforms to the following expected data types:")
    warningMessage <- paste0(warningMessage, "Expected column == data type\n--------------------------\n", paste(cohortDefinitionSetSpec$columnNameWithDataType, collapse = "\n"))
    warningMessage <- paste0(warningMessage, "\n--------------------------\n")
    warningMessage <- paste0(warningMessage, "Your cohortDefinitionSet \n--------------------------\n", userSuppliedCohortDefinitionSetDataTypes)
    warning(warningMessage)
  }

  # If fixDataTypes, change the data types of the data.frame to
  # match the specification
  if (!dataTypesMatch && fixDataTypes) {
    for (i in 1:nrow(cohortDefinitionSetSpec)) {
      colName <- cohortDefinitionSetSpec$columnName[i]
      dataType <- paste0("as.", cohortDefinitionSetSpec$dataType[i])
      x[[colName]] <- do.call(what = dataType, args = as.list(x[[colName]]))
    }
  }

  return(list(
    dataTypesMatch = dataTypesMatch,
    x = x
  ))
}

#' Get a cohort definition set
#'
#' @description
#' This function supports the legacy way of retrieving a cohort definition set
#' from the file system or in a package. This function supports the legacy way of
#' storing a cohort definition set in a package with a CSV file, JSON files,
#' and SQL files in the `inst` folder.
#'
#' @param settingsFileName The name of the CSV file that will hold the cohort information
#'                         including the cohortId and cohortName
#'
#' @param jsonFolder       The name of the folder that will hold the JSON representation
#'                         of the cohort if it is available in the cohortDefinitionSet
#'
#' @param sqlFolder        The name of the folder that will hold the SQL representation
#'                         of the cohort.
#'
#' @param cohortFileNameFormat  Defines the format string  for naming the cohort
#'                              JSON and SQL files. The format string follows the
#'                              standard defined in the base sprintf function.
#'
#' @param cohortFileNameValue   Defines the columns in the cohortDefinitionSet to use
#'                              in conjunction with the cohortFileNameFormat parameter.
#'
#' @param subsetJsonFolder      Defines the folder to store the subset JSON
#'
#' @param packageName The name of the package containing the cohort definitions.
#'
#' @param warnOnMissingJson Provide a warning if a .JSON file is not found for a
#'                          cohort in the settings file
#'
#' @param verbose           When TRUE, extra logging messages are emitted
#'
#' @return
#' Returns a cohort set data.frame
#'
#' @export
getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv",
                                   jsonFolder = "cohorts",
                                   sqlFolder = "sql/sql_server",
                                   cohortFileNameFormat = "%s",
                                   cohortFileNameValue = c("cohortId"),
                                   subsetJsonFolder = "inst/cohort_subset_definitions/",
                                   packageName = NULL,
                                   warnOnMissingJson = TRUE,
                                   verbose = FALSE) {
  checkmate::assert_vector(cohortFileNameValue)
  checkmate::assert_true(length(cohortFileNameValue) > 0)

  getPath <- function(fileName) {
    path <- fileName
    if (!is.null(packageName)) {
      path <- system.file(fileName, package = packageName)
    }
    if (verbose) {
      rlang::inform(paste0(" -- Loading ", basename(fileName), " from ", path))
    }
    if (!file.exists(path)) {
      if (grepl(".json$", tolower(basename(fileName))) && warnOnMissingJson) {
        errorMsg <- ifelse(is.null(packageName),
          paste0("File not found: ", path),
          paste0("File, ", fileName, " not found in package: ", packageName)
        )
        warning(errorMsg)
      }
    }
    return(path)
  }

  # Read the settings file which holds the cohortDefinitionSet
  rlang::inform("Loading cohortDefinitionSet")
  settings <- readCsv(file = getPath(fileName = settingsFileName), warnOnCaseMismatch = FALSE)

  assertSettingsColumns(names(settings), getPath(fileName = settingsFileName))
  checkmate::assert_true(all(cohortFileNameValue %in% names(settings)))
  checkmate::assert_true((!all(.getFileDataColumns() %in% names(settings))))

  readFile <- function(fileName) {
    if (file.exists(fileName)) {
      return(SqlRender::readSql(fileName))
    } else {
      if (grepl(".json$", tolower(basename(fileName))) && warnOnMissingJson) {
        warning(paste0(" --- ", fileName, " not found"))
        return(NA)
      } else {
        stop(paste0("File not found: ", fileName))
      }
    }
  }

  loadSubsets <- FALSE
  subsetsToLoad <- data.frame()
  # Do not attempt to load subset definition
  if ("isSubset" %in% colnames(settings)) {
    subsetsToLoad <- settings %>%
      dplyr::filter(.data$isSubset)

    settings <- settings %>%
      dplyr::filter(!.data$isSubset)

    loadSubsets <- TRUE
  }

  # Read the JSON/SQL files
  fileData <- data.frame()
  for (i in 1:nrow(settings)) {
    cohortFileNameRoot <- .getFileNameFromCohortDefinitionSet(
      cohortDefinitionSetRow = settings[i, ],
      cohortFileNameValue = cohortFileNameValue,
      cohortFileNameFormat = cohortFileNameFormat
    )
    cohortFileNameRoot <- .removeNonAsciiCharacters(cohortFileNameRoot)
    json <- readFile(fileName = getPath(fileName = file.path(jsonFolder, paste0(cohortFileNameRoot, ".json"))))
    sql <- readFile(fileName = getPath(fileName = file.path(sqlFolder, paste0(cohortFileNameRoot, ".sql"))))
    fileData <- rbind(fileData, data.frame(
      json = json,
      sql = sql
    ))
  }

  cohortDefinitionSet <- cbind(settings, fileData)
  # Loading cohort subset definitions with their associated targets
  if (loadSubsets & nrow(subsetsToLoad) > 0) {
    if (dir.exists(subsetJsonFolder)) {
      rlang::inform("Loading Cohort Subset Definitions")

      ## Loading subsets that apply to the saved definition sets
      for (i in unique(subsetsToLoad$subsetDefinitionId)) {
        subsetFile <- file.path(subsetJsonFolder, paste0(i, ".json"))
        rlang::inform(paste0("Loading Cohort Subset Defintion ", subsetFile))
        subsetDef <- CohortSubsetDefinition$new(ParallelLogger::loadSettingsFromJson(subsetFile))
        # Find target cohorts for this subset definition
        subsetTargetIds <- unique(subsetsToLoad[subsetsToLoad$subsetDefinitionId == i, ]$subsetParent)

        cohortDefinitionSet <- addCohortSubsetDefinition(cohortDefinitionSet,
          subsetDef,
          targetCohortIds = subsetTargetIds
        )
      }
    } else {
      stop("subset definitions defined in settings file but no corresponding subset definition file is associated")
    }
  }

  invisible(cohortDefinitionSet)
}

#' Save the cohort definition set to the file system
#'
#' @description
#' This function saves a cohortDefinitionSet to the file system and provides
#' options for specifying where to write the individual elements: the settings
#' file will contain the cohort information as a CSV specified by the
#' settingsFileName, the cohort JSON is written to the jsonFolder and the SQL
#' is written to the sqlFolder. We also provide a way to specify the
#' json/sql file name format using the cohortFileNameFormat and
#' cohortFileNameValue parameters.
#'
#' @template CohortDefinitionSet
#'
#' @param settingsFileName The name of the CSV file that will hold the cohort information
#'                         including the cohortId and cohortName
#'
#' @param jsonFolder       The name of the folder that will hold the JSON representation
#'                         of the cohort if it is available in the cohortDefinitionSet
#'
#' @param sqlFolder        The name of the folder that will hold the SQL representation
#'                         of the cohort.
#'
#' @param cohortFileNameFormat  Defines the format string  for naming the cohort
#'                              JSON and SQL files. The format string follows the
#'                              standard defined in the base sprintf function.
#'
#' @param cohortFileNameValue   Defines the columns in the cohortDefinitionSet to use
#'                              in conjunction with the cohortFileNameFormat parameter.
#'
#' @param subsetJsonFolder      Defines the folder to store the subset JSON
#'
#' @param verbose           When TRUE, logging messages are emitted to indicate export
#'                          progress.
#'
#' @export
saveCohortDefinitionSet <- function(cohortDefinitionSet,
                                    settingsFileName = "inst/Cohorts.csv",
                                    jsonFolder = "inst/cohorts",
                                    sqlFolder = "inst/sql/sql_server",
                                    cohortFileNameFormat = "%s",
                                    cohortFileNameValue = c("cohortId"),
                                    subsetJsonFolder = "inst/cohort_subset_definitions/",
                                    verbose = FALSE) {
  checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
  checkmate::assert_vector(cohortFileNameValue)
  checkmate::assert_true(length(cohortFileNameValue) > 0)
  assertSettingsColumns(names(cohortDefinitionSet))
  checkmate::assert_true(all(cohortFileNameValue %in% names(cohortDefinitionSet)))
  settingsFolder <- dirname(settingsFileName)
  if (!dir.exists(settingsFolder)) {
    dir.create(settingsFolder, recursive = TRUE)
  }
  if (!file.exists(jsonFolder)) {
    dir.create(jsonFolder, recursive = TRUE)
  }
  if (!file.exists(sqlFolder)) {
    dir.create(sqlFolder, recursive = TRUE)
  }

  # Export the cohortDefinitionSet to the settings folder
  if (verbose) {
    rlang::inform(paste0("Exporting cohortDefinitionSet to ", settingsFileName))
  }
  # Write the settings file and ensure that the "sql" and "json" columns are
  # not included
  writeCsv(
    x = cohortDefinitionSet[, -which(names(cohortDefinitionSet) %in% .getFileDataColumns())],
    file = settingsFileName,
    warnOnUploadRuleViolations = FALSE
  )

  hasSubsets <- hasSubsetDefinitions(cohortDefinitionSet)
  # Export the SQL & JSON for each entry
  for (i in 1:nrow(cohortDefinitionSet)) {
    cohortId <- cohortDefinitionSet$cohortId[i]
    cohortName <- .removeNonAsciiCharacters(cohortDefinitionSet$cohortName[i])
    json <- ifelse("json" %in% names(cohortDefinitionSet), .removeNonAsciiCharacters(cohortDefinitionSet$json[i]), "{}")
    sql <- cohortDefinitionSet$sql[i]
    fileNameRoot <- .getFileNameFromCohortDefinitionSet(
      cohortDefinitionSetRow = cohortDefinitionSet[i, ],
      cohortFileNameValue = cohortFileNameValue,
      cohortFileNameFormat = cohortFileNameFormat
    )

    if (hasSubsets && cohortDefinitionSet$isSubset[i]) {
      next # Subsets are saved only as json
    }

    if (verbose) {
      rlang::inform(paste0("Exporting (", i, "/", nrow(cohortDefinitionSet), "): ", cohortName))
    }

    if (!is.na(json) && nchar(json) > 0) {
      SqlRender::writeSql(sql = json, targetFile = file.path(jsonFolder, paste0(fileNameRoot, ".json")))
    }

    SqlRender::writeSql(sql = sql, targetFile = file.path(sqlFolder, paste0(fileNameRoot, ".sql")))
  }

  if (hasSubsets) {
    for (subsetDefinition in attr(cohortDefinitionSet, "cohortSubsetDefinitions")) {
      saveCohortSubsetDefinition(subsetDefinition, subsetJsonFolder)
    }
  }

  rlang::inform("Cohort definition saved")
}

.getSettingsFileRequiredColumns <- function() {
  return(c("cohortId", "cohortName"))
}

.getFileDataColumns <- function() {
  return(c("json", "sql"))
}

.getFileNameFromCohortDefinitionSet <- function(cohortDefinitionSetRow,
                                                cohortFileNameValue,
                                                cohortFileNameFormat) {
  checkmate::assertDataFrame(cohortDefinitionSetRow, min.rows = 1, max.rows = 1, col.names = "named")
  # Create the list of arguments to pass to stri_sprintf
  # to create the file name
  argList <- list(format = cohortFileNameFormat)
  for (j in 1:length(cohortFileNameValue)) {
    argList <- append(argList, cohortDefinitionSetRow[1, cohortFileNameValue[j]][[1]])
  }
  fileNameRoot <- do.call(stringi::stri_sprintf, argList)
  return(fileNameRoot)
}

.removeNonAsciiCharacters <- function(expression) {
  return(stringi::stri_trans_general(expression, "latin-ascii"))
}

#' Custom checkmate assertion for ensuring the settings columns are properly
#' specified
#'
#' @description
#' This function is used to provide a more informative message when ensuring
#' that the columns in the cohort definition set or the CSV file that
#' defines the cohort definition set is properly specified. This function
#' is then bootstrapped upon package initialization (code in CohortGenerator.R)
#' to allow for it to work with the other checkmate assertions as described in:
#' https://mllg.github.io/checkmate/articles/checkmate.html. The assertion function
#' is called assert_settings_columns.
#'
#' @param columnNames The name of the columns found in either the cohortDefinitionSet
#'                    data frame or from reading the contents of the settingsFile
#'
#' @param settingsFileName The file name of the CSV that defines the cohortDefinitionSet.
#'                         When NULL, this function assumes the column names are defined
#'                         in a data.frame representation of the cohortDefinitionSet
#' @return
#' Returns TRUE if all required columns are found otherwise it returns an error
#' @noRd
#' @keywords internal
checkSettingsColumns <- function(columnNames, settingsFileName = NULL) {
  settingsColumns <- .getSettingsFileRequiredColumns()
  res <- all(settingsColumns %in% columnNames)
  if (!isTRUE(res)) {
    sourceDescription <- "cohort definition set"
    if (!is.null(settingsFileName)) {
      sourceDescription <- "settings file"
    }
    errorMessage <- paste0("CohortGenerator requires the following columns in the ", sourceDescription, ": ", paste(shQuote(settingsColumns), collapse = ", "), ". The following columns were found: ", paste(shQuote(columnNames), collapse = ", "))
    return(errorMessage)
  } else {
    return(TRUE)
  }
}

#' Custom checkmate assertion for ensuring a vector contains only integer numbers,
#' including large ones
#'
#' @description
#' This function is used to provide a more informative message to inform
#' a user that their number must be an integer. Since the
#' cohort definition set allows for storing `numeric` data types, we need
#' to make sure that there are no digits in the mantissa of the cohort ID.
#' NOTE: This function is necessary since checkmate::assert_integerish
#' will still throw an error even in the case where you have a large
#' integer which was not desirable.
#'
#' @param x The vector containing integer/numeric values
#'
#' @param columnName The name of the column where this vector came from. This
#'                   is used when displaying the error message.
#' @return
#' Returns TRUE if all the values in x are integers
#' @noRd
#' @keywords internal
checkLargeInteger <- function(x, columnName = "cohortId") {
  # NOTE: suppressWarnings used to mask
  # warning from R which may happen for
  # large values in X.
  res <- all(suppressWarnings(x %% 1) == 0)
  if (!isTRUE(res)) {
    errorMessage <- paste0("The column ", columnName, " included non-integer values. Please update and re-try")
    return(errorMessage)
  } else {
    return(TRUE)
  }
}

.copySubsetDefinitions <- function(copyToCds, copyFromCds) {
  # deep clone any subset definitions
  if (hasSubsetDefinitions(copyFromCds)) {
    subsetDefintiions <- list()
    Map(function(subsetDefinition) {
      subsetDefintiions[[length(subsetDefintiions) + 1]] <- subsetDefinition$clone(deep = TRUE)
    }, attr(copyFromCds, "cohortSubsetDefinitions"))
    attr(copyToCds, "cohortSubsetDefinitions") <- subsetDefintiions
    attr(copyToCds, "hasSubsetDefinitions") <- TRUE
  }

  copyToCds
}

Try the CohortGenerator package in your browser

Any scripts or data that you put into this service are public.

CohortGenerator documentation built on Oct. 1, 2024, 1:09 a.m.