In zhangly811/MvDeconfounder: Deconfounder

To begin, install the following libraries from OHDSI github. Packages only need to be installed once.

# devtools::install_github("ohdsi/SqlRender")
# devtools::install_github("ohdsi/DatabaseConnector")
# devtools::install_github("ohdsi/FeatureExtraction")
# devtools::install_github("ohdsi/PatientLevelPrediction")

Connect to your database using the DatabaseConnector package. For details about the createConnectionDetails function, run ?createConnectionDetails or help(createConnectionDetails) in the R console.

connectionDetails = DatabaseConnector::createConnectionDetails(dbms = "sql server",
                                             server = "omop.dbmi.columbia.edu")
connection = DatabaseConnector::connect(connectionDetails)

Specify the following database schemas. The targetCohortTable is the name of the cohort table. Change the targetCohortId when create a new cohort.

cdmDatabaseSchema = "ohdsi_cumc_deid_2020q2r2.dbo"
cohortDatabaseSchema = "ohdsi_cumc_deid_2020q2r2.results"
targetCohortTable = "MVDECONFOUNDER_COHORT"
targetCohortId = 1

Extract the ingredients that appear in more than 0.1% of the total population. In our case, the limit number is ~6000 patients. The minimumProportion can be changed, and the targetDrugTable can be either 'DRUG_ERA' or 'DRUG_EXPOSURE'.

ingredientList<-MvDeconfounder::listingIngredients(connection,
                                                   cdmDatabaseSchema,
                                                   vocabularyDatabaseSchema = cdmDatabaseSchema,
                                                   minimumProportion = 0.001,
                                                   targetDrugTable = 'DRUG_ERA')
# saveRDS(ingredientList, file="dat/ingredientList.rds")
ingredientConceptIds = ingredientList[[1]]$conceptId
#num ingredients
length(ingredientConceptIds)

Similary to find ingredients, we also find the measurements that appear in more than 0.1% of the total popoulation.

measurementList<-MvDeconfounder::listingMeasurements(connection,
                                                    cdmDatabaseSchema,
                                                    vocabularyDatabaseSchema = cdmDatabaseSchema,
                                                    minimumProportion = 0.001)
# saveRDS(measurementList, file="dat/measurementList.rds")
measurementConceptIds = measurementList[[1]]$conceptId
#num measurements
length(measurementConceptIds)

Create a cohort. Patients who take any medication that is in the ingredient list and has at least one pair of pre-treatment and post-treatment measurement values of the lab test are included in the cohort. A patient can contribute multiple records to the cohort if the patient has multiple records that satisfy the inclusion criteria.

WARNING: This step can take a long time to run!

MvDeconfounder::createCohorts(connection,
                              cdmDatabaseSchema,
                              oracleTempSchema = NULL,
                              vocabularyDatabaseSchema = cdmDatabaseSchema,
                              cohortDatabaseSchema,
                              targetCohortTable,
                              ingredientConceptIds,
                              measurementConceptIds,
                              labWindow = 35,
                              targetCohortId = targetCohortId)

Extract the cohort table from the sql database and store as an R data.frame. Because subject_id is not unique, we create a unique identifier rowId in the cohort table.

sql = "select * from @cohort_database_schema.@target_cohort_table where cohort_definition_id = @target_cohort_id;"
sql<-SqlRender::render(sql,
                       cohort_database_schema = cohortDatabaseSchema,
                       target_cohort_table = targetCohortTable,
                       target_cohort_id = targetCohortId
)
sql<-SqlRender::translate(sql, targetDialect = connectionDetails$dbms)
cohort<-DatabaseConnector::querySql(connection, sql)
# Create unique id: rowId
colnames(cohort)<-SqlRender::snakeCaseToCamelCase(colnames(cohort))
cohort$rowId <- seq(nrow(cohort))
#num records
nrow(cohort)
#num patients
length(unique(cohort$subjectId))

To extract features for this cohort, we first specify the settings for measurement and drug separately using functions in the FeatureExtraction package. For example, we extract any measurement from the list measurementConceptIds that appears either between -35 and -1 day(s) prior to treatments, or between 1 and 35 day(s) after treatments.

The drugs at the day of exposure will be extracted.

# get features
measCovariateSettings <- FeatureExtraction::createTemporalCovariateSettings(useMeasurementValue = TRUE,
                                                                            temporalStartDays = c(-35,1),
                                                                            temporalEndDays   = c(-1,35),
                                                                            includedCovariateConceptIds = measurementConceptIds
                                                                            )

drugCovariateSettings <- FeatureExtraction::createCovariateSettings(useDrugEraShortTerm =TRUE,
                                                                    shortTermStartDays = 0,
                                                                    endDays = 0,
                                                                    includedCovariateConceptIds = ingredientConceptIds
)

We execute the above settings using getPlpData from the PatientLevelPrediction package. Depends on the size of the cohort, this step can take a very long time so we suggest sample a fraction first to test the script. Here we sample 100,000 records. Make sure to set seed so that you sample the same records when extract measurements and drugs. To extract features for the entire cohort, set sampleSize= NULL.

options(fftempdir = tempdir())
# memory.limit(size=1024*12)
# memory.size(max=NA)
set.seed(1)
plpData.meas <- PatientLevelPrediction::getPlpData(connectionDetails = connectionDetails,
                                                      cdmDatabaseSchema = cdmDatabaseSchema,
                                                      cohortDatabaseSchema = cohortDatabaseSchema,
                                                      cohortTable = targetCohortTable,
                                                      cohortId = targetCohortId,
                                                      covariateSettings = measCovariateSettings,
                                                      outcomeDatabaseSchema = cohortDatabaseSchema,
                                                      outcomeTable = targetCohortTable,
                                                      outcomeIds = targetCohortId,
                                                      sampleSize = 1e+5#NULL
)

# saveRDS(plpData.meas, file='dat/plpData_meas_sampleSize1e5.rds')


plpData.drug <- PatientLevelPrediction::getPlpData(connectionDetails = connectionDetails,
                                                   cdmDatabaseSchema = cdmDatabaseSchema,
                                                   cohortDatabaseSchema = cohortDatabaseSchema,
                                                   cohortTable = targetCohortTable,
                                                   cohortId = targetCohortId,
                                                   covariateSettings = drugCovariateSettings,
                                                   outcomeDatabaseSchema = cohortDatabaseSchema,
                                                   outcomeTable = targetCohortTable,
                                                   outcomeIds = targetCohortId,
                                                   sampleSize = 1e+5#NULL
)
# saveRDS(plpData.drug, file='dat/plpData_drug_sampleSize1e5.rds')
length(unique(plpData.meas$cohorts$subjectId))
length(unique(plpData.drug$cohorts$subjectId))

Convert drug and measurement to two sparse matrices.

drugSparseMat <- toSparseM(plpData.drug, map=plpMap$map)
saveRDS(drugSparseMat, file="drugSparseMat.rds")
measSparseMat <- toSparseM(plpData.meas, map=plpMap$map)
saveRDS(measSparseMat, file="measSparseMat.rds")

Fit a deconfounder model, ie. a probabilistic PCA.

# specify the python to use, i.e. from a conda env
reticulate::use_condaenv("deconfounder_py3", required = TRUE)
# Parameters for the deconfounder
learning_rate <- 0.01
max_steps <- as.integer(5000)
latent_dim <- as.integer(1)
batch_size <- as.integer(1024)
num_samples <- as.integer(1)
holdout_portion <- 0.2
print_steps <- as.integer(50)
tolerance <- as.integer(3)
num_confounder_samples <- as.integer(100)
CV <- as.integer(5)
outcome_type <- "linear"
project_dir <- "C:/Users/lz2629/git/zhangly811/MvDeconfounder"

MvDeconfounder::fitDeconfounder(learning_rate,
                                max_steps,
                                latent_dim,
                                batch_size,
                                num_samples,
                                holdout_portion,
                                print_steps,
                                tolerance,
                                num_confounder_samples,
                                CV,
                                outcome_type,
                                project_dir)