# @file GetHdpsCovariates.R
#
# Copyright 2025 Observational Health Data Sciences and Informatics
#
# This file is part of FeatureExtraction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This has been moved to extras, because it needs to be converted from using ff to used Andromeda.
#' Get HDPS covariate information from the database
#'
#' @description
#' Constructs the set of covariates for one or more cohorts using data in the CDM schema. This
#' implements the covariates typically used in the HDPS algorithm.
#'
#' @param covariateSettings An object of type \code{covariateSettings} as created using the
#' \code{\link{createHdpsCovariateSettings}} function.
#'
#' @template GetCovarParams
#'
#' @export
getDbHdpsCovariateData <- function(connection,
oracleTempSchema = NULL,
cdmDatabaseSchema,
cohortTable = "cohort_person",
cohortId = -1,
cohortIds = c(-1),
cdmVersion = "5",
rowIdField = "subject_id",
covariateSettings,
aggregated = FALSE) {
if (!missing(cohortId)) {
warning("cohortId argument has been deprecated, please use cohortIds")
cohortIds <- cohortId
}
if (cohortIds != -1)
stop("Haven't implemented restricting to cohort ID yet.")
if (aggregated)
stop("Aggregation not implemented yet")
if (cdmVersion == "4")
stop("Common Data Model version 4 is not supported")
writeLines("Constructing HDPS covariates")
cdmVersion <- "5"
cdmDatabase <- strsplit(cdmDatabaseSchema, "\\.")[[1]][1]
if (cdmVersion == "4") {
cohortDefinitionId <- "cohort_concept_id"
conceptClassId <- "concept_class"
measurement <- "observation"
} else {
cohortDefinitionId <- "cohort_definition_id"
conceptClassId <- "concept_class_id"
measurement <- "measurement"
}
if (is.null(covariateSettings$excludedCovariateConceptIds) || length(covariateSettings$excludedCovariateConceptIds) ==
0) {
hasExcludedCovariateConceptIds <- FALSE
} else {
if (!is.numeric(covariateSettings$excludedCovariateConceptIds))
stop("excludedCovariateConceptIds must be a (vector of) numeric")
hasExcludedCovariateConceptIds <- TRUE
DatabaseConnector::insertTable(connection,
tableName = "#excluded_cov",
data = tibble(concept_id = as.integer(covariateSettings$excludedCovariateConceptIds)),
dropTableIfExists = TRUE,
createTable = TRUE,
tempTable = TRUE,
oracleTempSchema = oracleTempSchema)
}
if (is.null(covariateSettings$includedCovariateConceptIds) || length(covariateSettings$includedCovariateConceptIds) ==
0) {
hasIncludedCovariateConceptIds <- FALSE
} else {
if (!is.numeric(covariateSettings$includedCovariateConceptIds))
stop("includedCovariateConceptIds must be a (vector of) numeric")
hasIncludedCovariateConceptIds <- TRUE
DatabaseConnector::insertTable(connection,
tableName = "#included_cov",
data = tibble(concept_id = as.integer(covariateSettings$includedCovariateConceptIds)),
dropTableIfExists = TRUE,
createTable = TRUE,
tempTable = TRUE,
oracleTempSchema = oracleTempSchema)
}
renderedSql <- SqlRender::loadRenderTranslateSql("GetHdpsCovariates.sql",
packageName = "FeatureExtraction",
dbms = attr(connection, "dbms"),
oracleTempSchema = oracleTempSchema,
cdm_database = cdmDatabase,
cdm_version = cdmVersion,
cohort_temp_table = cohortTable,
row_id_field = rowIdField,
cohort_definition_id = cohortDefinitionId,
concept_class_id = conceptClassId,
measurement = measurement,
use_covariate_cohort_id_is_1 = covariateSettings$useCovariateCohortIdIs1,
use_covariate_demographics = covariateSettings$useCovariateDemographics,
use_covariate_demographics_gender = covariateSettings$useCovariateDemographicsGender,
use_covariate_demographics_race = covariateSettings$useCovariateDemographicsRace,
use_covariate_demographics_ethnicity = covariateSettings$useCovariateDemographicsEthnicity,
use_covariate_demographics_age = covariateSettings$useCovariateDemographicsAge,
use_covariate_demographics_year = covariateSettings$useCovariateDemographicsYear,
use_covariate_demographics_month = covariateSettings$useCovariateDemographicsMonth,
use_covariate_condition_occurrence = covariateSettings$useCovariateConditionOccurrence,
use_covariate_3_digit_icd_9_inpatient_180d = covariateSettings$useCovariate3DigitIcd9Inpatient180d,
use_covariate_3_digit_icd_9_inpatient_180d_med_f = covariateSettings$useCovariate3DigitIcd9Inpatient180dMedF,
use_covariate_3_digit_icd_9_inpatient_180d_75_f = covariateSettings$useCovariate3DigitIcd9Inpatient180d75F,
use_covariate_3_digit_icd_9_ambulatory_180d = covariateSettings$useCovariate3DigitIcd9Ambulatory180d,
use_covariate_3_digit_icd_9_ambulatory_180d_med_f = covariateSettings$useCovariate3DigitIcd9Ambulatory180dMedF,
use_covariate_3_digit_icd_9_ambulatory_180d_75_f = covariateSettings$useCovariate3DigitIcd9Ambulatory180d75F,
use_covariate_drug_exposure = covariateSettings$useCovariateDrugExposure,
use_covariate_ingredient_exposure_180d = covariateSettings$useCovariateIngredientExposure180d,
use_covariate_ingredient_exposure_180d_med_f = covariateSettings$useCovariateIngredientExposure180dMedF,
use_covariate_ingredient_exposure_180d_75_f = covariateSettings$useCovariateIngredientExposure180d75F,
use_covariate_procedure_occurrence = covariateSettings$useCovariateProcedureOccurrence,
use_covariate_inpatient_procedure_occurrence_180d = covariateSettings$useCovariateProcedureOccurrenceInpatient180d,
use_covariate_inpatient_procedure_occurrence_180d_med_f = covariateSettings$useCovariateProcedureOccurrenceInpatient180dMedF,
use_covariate_inpatient_procedure_occurrence_180d_75_f = covariateSettings$useCovariateProcedureOccurrenceInpatient180d75F,
use_covariate_ambulatory_procedure_occurrence_180d = covariateSettings$useCovariateProcedureOccurrenceAmbulatory180d,
use_covariate_ambulatory_procedure_occurrence_180d_med_f = covariateSettings$useCovariateProcedureOccurrenceAmbulatory180dMedF,
use_covariate_ambulatory_procedure_occurrence_180d_75_f = covariateSettings$useCovariateProcedureOccurrenceAmbulatory180d75F,
has_excluded_covariate_concept_ids = hasExcludedCovariateConceptIds,
has_included_covariate_concept_ids = hasIncludedCovariateConceptIds,
delete_covariates_small_count = covariateSettings$deleteCovariatesSmallCount)
DatabaseConnector::executeSql(connection, renderedSql)
writeLines("Done")
writeLines("Fetching data from server")
start <- Sys.time()
covariateSql <- "SELECT row_id, covariate_id, covariate_value FROM #cov ORDER BY covariate_id, row_id"
covariateSql <- SqlRender::translate(sql = covariateSql,
targetDialect = attr(connection, "dbms"),
oracleTempSchema = oracleTempSchema)
covariates <- DatabaseConnector::querySql.ffdf(connection, covariateSql)
covariateRefSql <- "SELECT covariate_id, covariate_name, analysis_id, concept_id FROM #cov_ref ORDER BY covariate_id"
covariateRefSql <- SqlRender::translate(sql = covariateRefSql,
targetDialect = attr(connection, "dbms"),
oracleTempSchema = oracleTempSchema)
covariateRef <- DatabaseConnector::querySql.ffdf(connection, covariateRefSql)
sql <- "SELECT COUNT_BIG(*) FROM @cohort_temp_table"
sql <- SqlRender::render(sql, cohort_temp_table = cohortTable)
sql <- SqlRender::translate(sql = sql,
targetDialect = attr(connection, "dbms"),
oracleTempSchema = oracleTempSchema)
populationSize <- DatabaseConnector::querySql(connection, sql)[1, 1]
delta <- Sys.time() - start
writeLines(paste("Fetching data took", signif(delta, 3), attr(delta, "units")))
renderedSql <- SqlRender::loadRenderTranslateSql("RemoveCovariateTempTables.sql",
packageName = "FeatureExtraction",
dbms = attr(connection, "dbms"),
oracleTempSchema = oracleTempSchema)
DatabaseConnector::executeSql(connection,
renderedSql,
progressBar = FALSE,
reportOverallTime = FALSE)
colnames(covariates) <- SqlRender::snakeCaseToCamelCase(colnames(covariates))
colnames(covariateRef) <- SqlRender::snakeCaseToCamelCase(colnames(covariateRef))
# Remove redundant covariates
writeLines("Removing redundant covariates")
start <- Sys.time()
deletedCovariateIds <- c()
if (nrow(covariates) != 0) {
# First delete all single covariates that appear in every row with the same value
valueCounts <- bySumFf(ff::ff(1, length = nrow(covariates)), covariates$covariateId)
nonSparseIds <- valueCounts$bins[valueCounts$sums == populationSize]
for (covariateId in nonSparseIds) {
selection <- covariates$covariateId == covariateId
idx <- ffbase::ffwhich(selection, selection == TRUE)
values <- ffbase::unique.ff(covariates$covariateValue[idx])
if (length(values) == 1) {
idx <- ffbase::ffwhich(selection, selection == FALSE)
covariates <- covariates[idx, ]
deletedCovariateIds <- c(deletedCovariateIds, covariateId)
}
}
# Next, from groups of covariates that together cover every row, remove the most prevalence one
problematicAnalysisIds <- c(2, 3, 4, 5, 6, 7) # Gender, race, ethnicity, age, year, month
for (analysisId in problematicAnalysisIds) {
t <- covariateRef$analysisId == analysisId
if (ffbase::sum.ff(t) != 0) {
covariateIds <- ff::as.ram(covariateRef$covariateId[ffbase::ffwhich(t, t == TRUE)])
freq <- sapply(covariateIds, function(x) {
ffbase::sum.ff(covariates$covariateId == x)
})
if (sum(freq) == populationSize) {
# Each row belongs to one of the categories, making one redunant. Remove most prevalent one
categoryToDelete <- covariateIds[which(freq == max(freq))[1]]
deletedCovariateIds <- c(deletedCovariateIds, categoryToDelete)
t <- covariates$covariateId == categoryToDelete
covariates <- covariates[ffbase::ffwhich(t, t == FALSE), ]
}
}
}
}
delta <- Sys.time() - start
writeLines(paste("Removing redundant covariates took", signif(delta, 3), attr(delta, "units")))
metaData <- list(sql = renderedSql,
call = match.call(),
deletedCovariateIds = deletedCovariateIds)
result <- list(covariates = covariates, covariateRef = covariateRef, metaData = metaData)
class(result) <- "covariateData"
return(result)
}
#' Create HDPS covariate settings
#'
#' @details
#' creates an object specifying how covariates should be constructed from data in the CDM model.
#'
#' @param useCovariateCohortIdIs1 A boolean value (TRUE/FALSE) to determine
#' if a covariate should be constructed for
#' whether the cohort ID is 1 (currently
#' primarily used in CohortMethod).
#' @param useCovariateDemographics A boolean value (TRUE/FALSE) to determine
#' if demographic covariates (age in 5-yr
#' increments, gender, race, ethnicity, year
#' of index date, month of index date) will
#' be created and included in future models.
#' @param useCovariateDemographicsGender A boolean value (TRUE/FALSE) to determine
#' if gender should be included in the
#' model.
#' @param useCovariateDemographicsRace A boolean value (TRUE/FALSE) to determine
#' if race should be included in the model.
#' @param useCovariateDemographicsEthnicity A boolean value (TRUE/FALSE) to determine
#' if ethnicity should be included in the
#' model.
#' @param useCovariateDemographicsAge A boolean value (TRUE/FALSE) to determine
#' if age (in 5 year increments) should be
#' included in the model.
#' @param useCovariateDemographicsYear A boolean value (TRUE/FALSE) to determine
#' if calendar year should be included in
#' the model.
#' @param useCovariateDemographicsMonth A boolean value (TRUE/FALSE) to determine
#' if calendar month should be included in
#' the model.
#' @param useCovariateConditionOccurrence A boolean value (TRUE/FALSE) to determine
#' if covariates derived from
#' CONDITION_OCCURRENCE table will be
#' created and included in future models.
#' @param useCovariate3DigitIcd9Inpatient180d A boolean value (TRUE/FALSE) to determine
#' if covariates will be created and used in
#' models that look for presence/absence of
#' condition within inpatient setting in
#' 180d window prior to or on cohort index
#' date. Conditions are aggregated at the
#' ICD-9 3-digit level. Only applicable if
#' useCovariateConditionOccurrence = TRUE.
#' @param useCovariate3DigitIcd9Inpatient180dMedF Similar to
#' \code{useCovariate3DigitIcd9Inpatient180d},
#' but now only if the frequency of the
#' ICD-9 code is higher than the median.
#' @param useCovariate3DigitIcd9Inpatient180d75F Similar to
#' \code{useCovariate3DigitIcd9Inpatient180d},
#' but now only if the frequency of the
#' ICD-9 code is higher than the 75th
#' percentile.
#' @param useCovariate3DigitIcd9Ambulatory180d A boolean value (TRUE/FALSE) to determine
#' if covariates will be created and used in
#' models that look for presence/absence of
#' condition within ambulatory setting in
#' 180d window prior to or on cohort index
#' date. Conditions are aggregated at the
#' ICD-9 3-digit level. Only applicable if
#' useCovariateConditionOccurrence = TRUE.
#' @param useCovariate3DigitIcd9Ambulatory180dMedF Similar to
#' \code{useCovariate3DigitIcd9Ambulatory180d},
#' but now only if the frequency of the
#' ICD-9 code is higher than the median.
#' @param useCovariate3DigitIcd9Ambulatory180d75F Similar to
#' \code{useCovariate3DigitIcd9Ambulatory180d},
#' but now only if the frequency of the
#' ICD-9 code is higher than the 75th
#' percentile.
#' @param useCovariateDrugExposure A boolean value (TRUE/FALSE) to determine
#' if covariates derived from DRUG_EXPOSURE
#' table will be created and included in
#' future models.
#' @param useCovariateIngredientExposure180d A boolean value (TRUE/FALSE) to determine
#' if covariates will be created and used in
#' models that look for presence/absence of
#' drug ingredients within inpatient setting
#' in 180d window prior to or on cohort
#' index date. Only applicable if
#' useCovariateDrugExposure = TRUE.
#' @param useCovariateIngredientExposure180dMedF Similar to
#' \code{useCovariateIngredientExposure180d},
#' but now only if the frequency of the
#' ingredient is higher than the median.
#' @param useCovariateIngredientExposure180d75F Similar to
#' \code{useCovariateIngredientExposure180d},
#' but now only if the frequency of the
#' ingredient is higher than the 75th
#' percentile.
#' @param useCovariateProcedureOccurrence A boolean value (TRUE/FALSE) to determine
#' if covariates derived from
#' PROCEDURE_OCCURRENCE table will be
#' created and included in future models.
#' @param useCovariateProcedureOccurrenceInpatient180d A boolean value (TRUE/FALSE) to determine
#' if covariates will be created and used in
#' models that look for presence/absence of
#' procedures within inpatient setting in
#' 180d window prior to or on cohort index
#' date. Only applicable if
#' useCovariateProcedureOccurrence = TRUE.
#' @param useCovariateProcedureOccurrenceInpatient180dMedF Similar to
#' \code{useCovariateProcedureOccurrenceInpatient180d},
#' but now only if the frequency of the
#' procedure code is higher than the median.
#' @param useCovariateProcedureOccurrenceInpatient180d75F Similar to
#' \code{useCovariateProcedureOccurrenceInpatient180d},
#' but now only if the frequency of the
#' procedure code is higher than the 75th
#' percentile.
#' @param useCovariateProcedureOccurrenceAmbulatory180d A boolean value (TRUE/FALSE) to determine
#' if covariates will be created and used in
#' models that look for presence/absence of
#' procedures within ambulatory setting in
#' 180d window prior to or on cohort index
#' date. Only applicable if
#' useCovariateProcedureOccurrence = TRUE.
#' @param useCovariateProcedureOccurrenceAmbulatory180dMedF Similar to
#' \code{useCovariateProcedureOccurrenceAmbulatory180d},
#' but now only if the frequency of the
#' procedure code is higher than the median.
#' @param useCovariateProcedureOccurrenceAmbulatory180d75F Similar to
#' \code{useCovariateProcedureOccurrenceAmbulatory180d},
#' but now only if the frequency of the
#' procedure code is higher than the 75th
#' percentile.
#' @param excludedCovariateConceptIds A list of concept IDs that should NOT be
#' used to construct covariates.
#' @param includedCovariateConceptIds A list of concept IDs that should be used
#' to construct covariates.
#' @param deleteCovariatesSmallCount A numeric value used to remove covariates
#' that occur in both cohorts fewer than
#' deleteCovariateSmallCounts time.
#'
#' @return
#' An object of type \code{hdpsCovariateSettings}, to be used in other functions.
#'
#' @export
createHdpsCovariateSettings <- function(useCovariateCohortIdIs1 = FALSE,
useCovariateDemographics = TRUE,
useCovariateDemographicsGender = TRUE,
useCovariateDemographicsRace = TRUE,
useCovariateDemographicsEthnicity = TRUE,
useCovariateDemographicsAge = TRUE,
useCovariateDemographicsYear = TRUE,
useCovariateDemographicsMonth = TRUE,
useCovariateConditionOccurrence = TRUE,
useCovariate3DigitIcd9Inpatient180d = FALSE,
useCovariate3DigitIcd9Inpatient180dMedF = FALSE,
useCovariate3DigitIcd9Inpatient180d75F = FALSE,
useCovariate3DigitIcd9Ambulatory180d = FALSE,
useCovariate3DigitIcd9Ambulatory180dMedF = FALSE,
useCovariate3DigitIcd9Ambulatory180d75F = FALSE,
useCovariateDrugExposure = FALSE,
useCovariateIngredientExposure180d = FALSE,
useCovariateIngredientExposure180dMedF = FALSE,
useCovariateIngredientExposure180d75F = FALSE,
useCovariateProcedureOccurrence = FALSE,
useCovariateProcedureOccurrenceInpatient180d = FALSE,
useCovariateProcedureOccurrenceInpatient180dMedF = FALSE,
useCovariateProcedureOccurrenceInpatient180d75F = FALSE,
useCovariateProcedureOccurrenceAmbulatory180d = FALSE,
useCovariateProcedureOccurrenceAmbulatory180dMedF = FALSE,
useCovariateProcedureOccurrenceAmbulatory180d75F = FALSE,
excludedCovariateConceptIds = c(),
includedCovariateConceptIds = c(),
deleteCovariatesSmallCount = 100) {
# First: get the default values:
covariateSettings <- list()
for (name in names(formals(createHdpsCovariateSettings))) {
covariateSettings[[name]] <- get(name)
}
# Next: overwrite defaults with actual values if specified:
values <- lapply(as.list(match.call())[-1], function(x) eval(x, envir = sys.frame(-3)))
for (name in names(values)) {
if (name %in% names(covariateSettings))
covariateSettings[[name]] <- values[[name]]
}
attr(covariateSettings, "fun") <- "getDbHdpsCovariateData"
class(covariateSettings) <- "covariateSettings"
return(covariateSettings)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.