To begin, install the following libraries from OHDSI github. Packages only need to be installed once.
# devtools::install_github("ohdsi/SqlRender") # devtools::install_github("ohdsi/DatabaseConnector") # devtools::install_github("ohdsi/FeatureExtraction") # devtools::install_github("ohdsi/PatientLevelPrediction")
Connect to your database using the DatabaseConnector package. For details about the createConnectionDetails function, run ?createConnectionDetails or help(createConnectionDetails) in the R console.
connectionDetails = DatabaseConnector::createConnectionDetails(dbms = "sql server", server = "omop.dbmi.columbia.edu") connection = DatabaseConnector::connect(connectionDetails)
Specify the following database schemas. The targetCohortTable is the name of the cohort table. Change the targetCohortId when create a new cohort.
cdmDatabaseSchema = "ohdsi_cumc_deid_2020q2r2.dbo" cohortDatabaseSchema = "ohdsi_cumc_deid_2020q2r2.results" targetCohortTable = "MVDECONFOUNDER_COHORT" targetCohortId = 1
Extract the ingredients that appear in more than 0.1% of the total population. In our case, the limit number is ~6000 patients. The minimumProportion can be changed, and the targetDrugTable can be either 'DRUG_ERA' or 'DRUG_EXPOSURE'.
ingredientList<-MvDeconfounder::listingIngredients(connection, cdmDatabaseSchema, vocabularyDatabaseSchema = cdmDatabaseSchema, minimumProportion = 0.001, targetDrugTable = 'DRUG_ERA') # saveRDS(ingredientList, file="dat/ingredientList.rds") ingredientConceptIds = ingredientList[[1]]$conceptId #num ingredients length(ingredientConceptIds)
Similary to find ingredients, we also find the measurements that appear in more than 0.1% of the total popoulation.
measurementList<-MvDeconfounder::listingMeasurements(connection, cdmDatabaseSchema, vocabularyDatabaseSchema = cdmDatabaseSchema, minimumProportion = 0.001) # saveRDS(measurementList, file="dat/measurementList.rds") measurementConceptIds = measurementList[[1]]$conceptId #num measurements length(measurementConceptIds)
Create a cohort. Patients who take any medication that is in the ingredient list and has at least one pair of pre-treatment and post-treatment measurement values of the lab test are included in the cohort. A patient can contribute multiple records to the cohort if the patient has multiple records that satisfy the inclusion criteria.
WARNING: This step can take a long time to run!
MvDeconfounder::createCohorts(connection, cdmDatabaseSchema, oracleTempSchema = NULL, vocabularyDatabaseSchema = cdmDatabaseSchema, cohortDatabaseSchema, targetCohortTable, ingredientConceptIds, measurementConceptIds, labWindow = 35, targetCohortId = targetCohortId)
Extract the cohort table from the sql database and store as an R data.frame. Because subject_id is not unique, we create a unique identifier rowId in the cohort table.
sql = "select * from @cohort_database_schema.@target_cohort_table where cohort_definition_id = @target_cohort_id;" sql<-SqlRender::render(sql, cohort_database_schema = cohortDatabaseSchema, target_cohort_table = targetCohortTable, target_cohort_id = targetCohortId ) sql<-SqlRender::translate(sql, targetDialect = connectionDetails$dbms) cohort<-DatabaseConnector::querySql(connection, sql) # Create unique id: rowId colnames(cohort)<-SqlRender::snakeCaseToCamelCase(colnames(cohort)) cohort$rowId <- seq(nrow(cohort)) #num records nrow(cohort) #num patients length(unique(cohort$subjectId))
To extract features for this cohort, we first specify the settings for measurement and drug separately using functions in the FeatureExtraction package. For example, we extract any measurement from the list measurementConceptIds that appears either between -35 and -1 day(s) prior to treatments, or between 1 and 35 day(s) after treatments.
The drugs at the day of exposure will be extracted.
# get features measCovariateSettings <- FeatureExtraction::createTemporalCovariateSettings(useMeasurementValue = TRUE, temporalStartDays = c(-35,1), temporalEndDays = c(-1,35), includedCovariateConceptIds = measurementConceptIds ) drugCovariateSettings <- FeatureExtraction::createCovariateSettings(useDrugEraShortTerm =TRUE, shortTermStartDays = 0, endDays = 0, includedCovariateConceptIds = ingredientConceptIds )
We execute the above settings using getPlpData from the PatientLevelPrediction package. Depends on the size of the cohort, this step can take a very long time so we suggest sample a fraction first to test the script. Here we sample 100,000 records. Make sure to set seed so that you sample the same records when extract measurements and drugs. To extract features for the entire cohort, set sampleSize= NULL.
options(fftempdir = tempdir()) # memory.limit(size=1024*12) # memory.size(max=NA) set.seed(1) plpData.meas <- PatientLevelPrediction::getPlpData(connectionDetails = connectionDetails, cdmDatabaseSchema = cdmDatabaseSchema, cohortDatabaseSchema = cohortDatabaseSchema, cohortTable = targetCohortTable, cohortId = targetCohortId, covariateSettings = measCovariateSettings, outcomeDatabaseSchema = cohortDatabaseSchema, outcomeTable = targetCohortTable, outcomeIds = targetCohortId, sampleSize = 1e+5#NULL ) # saveRDS(plpData.meas, file='dat/plpData_meas_sampleSize1e5.rds') plpData.drug <- PatientLevelPrediction::getPlpData(connectionDetails = connectionDetails, cdmDatabaseSchema = cdmDatabaseSchema, cohortDatabaseSchema = cohortDatabaseSchema, cohortTable = targetCohortTable, cohortId = targetCohortId, covariateSettings = drugCovariateSettings, outcomeDatabaseSchema = cohortDatabaseSchema, outcomeTable = targetCohortTable, outcomeIds = targetCohortId, sampleSize = 1e+5#NULL ) # saveRDS(plpData.drug, file='dat/plpData_drug_sampleSize1e5.rds') length(unique(plpData.meas$cohorts$subjectId)) length(unique(plpData.drug$cohorts$subjectId))
Convert drug and measurement to two sparse matrices.
drugSparseMat <- toSparseM(plpData.drug, map=plpMap$map) saveRDS(drugSparseMat, file="drugSparseMat.rds") measSparseMat <- toSparseM(plpData.meas, map=plpMap$map) saveRDS(measSparseMat, file="measSparseMat.rds")
Fit a deconfounder model, ie. a probabilistic PCA.
# specify the python to use, i.e. from a conda env reticulate::use_condaenv("deconfounder_py3", required = TRUE) # Parameters for the deconfounder learning_rate <- 0.01 max_steps <- as.integer(5000) latent_dim <- as.integer(1) batch_size <- as.integer(1024) num_samples <- as.integer(1) holdout_portion <- 0.2 print_steps <- as.integer(50) tolerance <- as.integer(3) num_confounder_samples <- as.integer(100) CV <- as.integer(5) outcome_type <- "linear" project_dir <- "C:/Users/lz2629/git/zhangly811/MvDeconfounder" MvDeconfounder::fitDeconfounder(learning_rate, max_steps, latent_dim, batch_size, num_samples, holdout_portion, print_steps, tolerance, num_confounder_samples, CV, outcome_type, project_dir)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.