R/getDenominatorCohorts.R
In IncidencePrevalence: Estimate Incidence and Prevalence using the OMOP Common Data Model

# Copyright 2025 DARWIN EU®
#
# This file is part of IncidencePrevalence
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' @importFrom rlang .data
#' @importFrom rlang ":="
getDenominatorCohorts <- function(cdm,
                                  startDate,
                                  endDate,
                                  minAge,
                                  maxAge,
                                  daysPriorObservation,
                                  targetCohortTable,
                                  targetCohortId,
                                  intermediateTable) {
  # make sure names are lowercase and keep variables required
  personDb <- dplyr::rename_with(cdm$person, tolower) %>%
    dplyr::select(
      "person_id", "gender_concept_id",
      "year_of_birth", "month_of_birth", "day_of_birth"
    )
  observationPeriodDb <- dplyr::rename_with(
    cdm$observation_period, tolower
  ) %>%
    dplyr::select(
      "person_id", "observation_period_id",
      "observation_period_start_date",
      "observation_period_end_date"
    )

  # stratify population on cohort
  if (!is.na(targetCohortTable)) {
    targetDb <- cdm[[targetCohortTable]] %>%
      dplyr::filter(.data$cohort_definition_id == .env$targetCohortId)

    # drop anyone not in the target cohort
    personDb <- personDb %>%
      dplyr::inner_join(
        targetDb %>%
          dplyr::rename("person_id" = "subject_id") %>%
          dplyr::select("person_id") %>%
          dplyr::distinct(),
        by = "person_id"
      )

    personDb <- personDb %>%
      dplyr::compute(
        name = paste0(intermediateTable, "_working_person"),
        temporary = FALSE,
        logPrefix = "IncidencePrevalence_getDenominatorCohorts_batch_count_pop_"
      )

    observationPeriodDb <- observationPeriodDb %>%
      dplyr::inner_join(
        targetDb %>%
          dplyr::rename("person_id" = "subject_id") %>%
          dplyr::select("person_id") %>%
          dplyr::distinct(),
        by = "person_id"
      )

    # update observation start date to cohort start date
    # if cohort start date is after observation start date
    # update observation end date to match cohort end date
    # if cohort end date is before observation start date
    observationPeriodDb <- observationPeriodDb %>%
      dplyr::inner_join(
        targetDb %>%
          dplyr::rename("person_id" = "subject_id") %>%
          dplyr::select(
            "person_id",
            "cohort_start_date",
            "cohort_end_date"
          ),
        by = "person_id"
      )

    # to deal with potential multiple observation periods
    # make sure cohort started during joined observation period
    # if not, drop
    observationPeriodDb <- observationPeriodDb %>%
      dplyr::filter(.data$observation_period_start_date <=
        .data$cohort_start_date &
        .data$observation_period_end_date >= .data$cohort_start_date)

    # set observation end to whatever came first of cohort end or obs end
    observationPeriodDb <- observationPeriodDb %>%
      dplyr::mutate(
        observation_period_end_date =
          dplyr::if_else(.data$observation_period_end_date >=
            .data$cohort_end_date,
          .data$cohort_end_date,
          .data$observation_period_end_date
          )
      ) %>%
      dplyr::select(!c("cohort_end_date")) %>%
      dplyr::rename("target_cohort_start_date" = "cohort_start_date")

    personDb <- personDb %>%
      dplyr::compute(
        name = paste0(intermediateTable, "_working_obs_period"),
        temporary = FALSE,
        logPrefix = "IncidencePrevalence_getDenominatorCohorts_person_"
      )

    observationPeriodDb <- observationPeriodDb %>%
      dplyr::compute(
        name = paste0(intermediateTable, "_i_0"),
        temporary = FALSE,
        logPrefix = "IncidencePrevalence_getDenominatorCohorts_obs_period_"
      )
  }

  ## Identifying population of interest
  # filtering on database side
  # drop anyone missing year_of_birth or gender_concept_id
  studyPopDb <- personDb %>%
    dplyr::left_join(observationPeriodDb,
      by = "person_id"
    )

  attrition <- recordAttrition(
    table = studyPopDb,
    id = "person_id",
    reasonId = 1,
    reason = "Starting population"
  )

  studyPopDb <- studyPopDb %>%
    dplyr::filter(!is.na(.data$year_of_birth))

  attrition <- recordAttrition(
    table = studyPopDb,
    id = "person_id",
    reasonId = 2,
    reason = "Missing year of birth",
    existingAttrition = attrition
  )

  studyPopDb <- studyPopDb %>%
    dplyr::mutate(sex = dplyr::if_else(.data$gender_concept_id == "8507",
      "Male",
      dplyr::if_else(.data$gender_concept_id == "8532", "Female", NA)
    )) %>%
    dplyr::filter(!is.na(.data$sex))

  studyPopDb <- studyPopDb %>%
    dplyr::compute(
      name = paste0(intermediateTable, "_i_1"),
      temporary = FALSE,
      logPrefix = "IncidencePrevalence_getDenominatorCohorts_i_1_"
    )

  attrition <- recordAttrition(
    table = studyPopDb,
    id = "person_id",
    reasonId = 3,
    reason = "Missing sex",
    existingAttrition = attrition
  )

  # add date of birth
  # fill in missing day to start of month if only day missing,
  # month (January) if only month missing,
  # month (January) and day (to 1st of month) if both missing
  # ie to impute to the center of the period
  studyPopDb <- studyPopDb %>%
    PatientProfiles::addDateOfBirthQuery(dateOfBirthName = "dob")

  # filter for those within the age limits (of all the age strata)
  # during the study
  lowerAgeLimit <- min(minAge)
  upperAgeLimit <- max(maxAge)

  # as character so we can insert into db
  startDateChar <- as.character(startDate)
  endDateChar <- as.character(endDate)
  studyPopDb <- studyPopDb %>%
    dplyr::mutate(
      lower_age_check = !!CDMConnector::dateadd("dob",
        {{ lowerAgeLimit }},
        interval = "year"
      ),
      upper_age_check = !!CDMConnector::dateadd("dob",
        {{ upperAgeLimit + 1L }},
        interval = "year"
      ),
      start_date = as.Date(.env$startDateChar),
      end_date = as.Date(.env$endDateChar),
    ) %>%
    dplyr::filter(
      # drop people too old even at study start
      .data$upper_age_check >= .data$start_date,
      # drop people too young even at study end
      .data$lower_age_check <= .data$end_date
    ) %>%
    dplyr::select(-c("lower_age_check", "upper_age_check"))

  studyPopDb <- studyPopDb %>%
    dplyr::compute(
      name = paste0(intermediateTable, "_i_2"),
      temporary = FALSE,
      logPrefix = "IncidencePrevalence_getDenominatorCohorts_i_2_"
    )

  attrition <- recordAttrition(
    table = studyPopDb,
    id = "person_id",
    reasonId = 4,
    reason = "Cannot satisfy age criteria during the study period based on year of birth",
    existingAttrition = attrition
  )

  studyPopDb <- studyPopDb %>%
    dplyr::filter(
      # drop people with observation_period_start_date after study end
      .data$observation_period_start_date <= .data$end_date &
        # drop people with observation_period_end_date before study start
        .data$observation_period_end_date >= .data$start_date
    )

  studyPopDb <- studyPopDb %>%
    dplyr::compute(
      name = paste0(intermediateTable, "_i_3"),
      temporary = FALSE,
      logPrefix = "IncidencePrevalence_getDenominatorCohorts_i_3_"
    )

  attrition <- recordAttrition(
    table = studyPopDb,
    id = "person_id",
    reasonId = 5,
    reason = "No observation time available during study period",
    existingAttrition = attrition
  )

  # finalise population (if we still have people)
  if ((studyPopDb %>% dplyr::count() %>% dplyr::pull()) > 0) {
    minAge <- as.integer(minAge)
    maxAge <- as.integer(maxAge)
    daysPriorObservation <- as.integer(daysPriorObservation)

    # for each min age, add the date at which they reach it
    # for each max age, add the date at which they reach it
    # the day before their next birthday
    maxAgePlusOne <- as.integer(maxAge + 1)
    minAgeDates <- addDaysQuery(
      cdm = cdm,
      variable = "dob",
      number = minAge[seq_along(minAge)],
      type = "year",
      name_style = "date_min_age_{number}"
    )
    maxAgeDates <- addDaysQuery(
      cdm = cdm,
      variable = "dob",
      number = maxAgePlusOne,
      type = "year",
      name_style = "date_max_age_{number}"
    )
    maxAgeDatesMinusDay <- minusDaysQuery(
      cdm = cdm,
      variable = glue::glue("date_max_age_{maxAgePlusOne}"),
      number = -1,
      type = "day",
      names = glue::glue("date_max_age_{maxAgePlusOne-1}")
    )

    # for each prior_history requirement,
    # add the date at which they reach
    # observation start date + prior_history requirement
    priorHistoryDates <- addDaysQuery(
      cdm = cdm,
      variable = "observation_period_start_date",
      number = daysPriorObservation,
      type = "day",
      name_style = "date_with_prior_history_{number}"
    )

    studyPopDb <- studyPopDb %>%
      dplyr::mutate(!!!minAgeDates, !!!maxAgeDates, !!!priorHistoryDates) %>%
      # dplyr::collapse() %>%
      dplyr::mutate(!!!maxAgeDatesMinusDay)

    studyPopDb <- studyPopDb %>%
      dplyr::compute(
        name = paste0(intermediateTable, "_i_4"),
        temporary = FALSE,
        logPrefix = "IncidencePrevalence_getDenominatorCohorts_i_4_"
      )

    # keep people only if they
    # satisfy age criteria at some point in the study
    varLowerAgeLimit <- glue::glue("date_min_age_{lowerAgeLimit}")
    varUpperAgeLimit <- glue::glue("date_max_age_{upperAgeLimit}")
    studyPopDb <- studyPopDb %>%
      dplyr::filter(
        .data[[!!rlang::sym(varLowerAgeLimit)]] <=
          .data$end_date &
          .data[[!!rlang::sym(varUpperAgeLimit)]] >=
            .data$start_date
      )

    attrition <- recordAttrition(
      table = studyPopDb,
      id = "person_id",
      reasonId = 6,
      reason = "Doesn't satisfy age criteria during the study period",
      existingAttrition = attrition
    )


    varLowerPriorHistory <-
      glue::glue("date_with_prior_history_{min(daysPriorObservation)}")

    if (!is.na(targetCohortTable)) {
      # update prior history date to whatever came first, that or target entry
      studyPopDb <- studyPopDb %>%
        dplyr::mutate(
          !!varLowerPriorHistory :=
            as.Date(dplyr::if_else(.data$target_cohort_start_date >=
              .data[[varLowerPriorHistory]],
            .data$target_cohort_start_date,
            .data[[varLowerPriorHistory]]
            ))
        )
    }

    # satisfy prior history criteria at some point in the study
    studyPopDb <- studyPopDb %>%
      dplyr::filter(
        .data[[!!rlang::sym(varLowerPriorHistory)]] <=
          .data$end_date,
        .data[[!!rlang::sym(varLowerPriorHistory)]] <=
          .data$observation_period_end_date
      )

    studyPopDb <- studyPopDb %>%
      dplyr::compute(
        name = paste0(intermediateTable, "_i_5"),
        temporary = FALSE,
        logPrefix = "IncidencePrevalence_getDenominatorCohorts_i_5_"
      )


    attrition <- recordAttrition(
      table = studyPopDb,
      id = "person_id",
      reasonId = 7,
      reason = "Prior history requirement not fulfilled during study period",
      existingAttrition = attrition
    )

    ## Get cohort start and end dates
    # Start date:
    # study start_date,
    # date_min_age,
    # date_with_prior_history
    # (whichever comes last)

    # cohort start dates
    # for every combination of min age and prior history required
    ageHistCombos <- expand.grid(
      minAge = minAge,
      daysPriorObservation = daysPriorObservation
    )

    minAgeHistDates <- glue::glue("as.Date(dplyr::if_else(date_min_age_{ageHistCombos$minAge} < date_with_prior_history_{ageHistCombos$daysPriorObservation},
                                      date_with_prior_history_{ageHistCombos$daysPriorObservation},
                                      date_min_age_{ageHistCombos$minAge}))") %>%
      rlang::parse_exprs() %>%
      rlang::set_names(glue::glue("last_of_min_age_{ageHistCombos$minAge}_prior_history_{ageHistCombos$daysPriorObservation}"))


    minAgeHistStartDates <- glue::glue("dplyr::if_else(last_of_min_age_{ageHistCombos$minAge}_prior_history_{ageHistCombos$daysPriorObservation} < .data$start_date,
                                       .data$start_date,
                                       last_of_min_age_{ageHistCombos$minAge}_prior_history_{ageHistCombos$daysPriorObservation})") %>%
      rlang::parse_exprs() %>%
      rlang::set_names(glue::glue("date_min_age_{ageHistCombos$minAge}_prior_history_{ageHistCombos$daysPriorObservation}"))

    studyPopDb <- studyPopDb %>%
      dplyr::mutate(!!!minAgeHistDates) %>%
      # dplyr::collapse() %>%
      dplyr::mutate(!!!minAgeHistStartDates)

    studyPopDb <- studyPopDb %>%
      dplyr::compute(
        name = paste0(intermediateTable, "_i_6"),
        temporary = FALSE,
        logPrefix = "IncidencePrevalence_getDenominatorCohorts_i_6_"
      )

    # cohort end dates
    # study end date,
    # end of observation,
    # max.age
    # (whichever comes first)
    maxAgeObsPeriodDates <- glue::glue("dplyr::if_else(date_max_age_{maxAge} < .data$observation_period_end_date,
                                       date_max_age_{maxAge},
                                       .data$observation_period_end_date)") %>%
      rlang::parse_exprs() %>%
      rlang::set_names(glue::glue("first_of_max_age_{maxAge}_obs_period"))

    maxAgeObsPeriodEndDates <- glue::glue("dplyr::if_else(first_of_max_age_{maxAge}_obs_period < .data$end_date,
                                       first_of_max_age_{maxAge}_obs_period,
                                       .data$end_date)") %>%
      rlang::parse_exprs() %>%
      rlang::set_names(glue::glue("date_max_age_{maxAge}"))

    studyPopDb <- studyPopDb %>%
      dplyr::mutate(!!!maxAgeObsPeriodDates) %>%
      # dplyr::collapse() %>%
      dplyr::mutate(!!!maxAgeObsPeriodEndDates)

    studyPopDb <- studyPopDb %>%
      dplyr::compute(
        name = paste0(intermediateTable, "cohorts"),
        temporary = FALSE,
        logPrefix = "IncidencePrevalence_getDenominatorCohorts_ageHist_"
      )
  }

  # table to return
  studyPopDb <- studyPopDb %>%
    dplyr::compute(
      name = intermediateTable,
      temporary = FALSE,
      logPrefix = "IncidencePrevalence_getDenominatorCohorts_final_"
    )

  # remove intermediate tables
  omopgenerics::dropSourceTable(
    cdm = cdm,
    name = dplyr::starts_with(paste0(intermediateTable, "_"))
  )

  # return list with population and attrition
  dpop <- list()
  dpop[["denominator_population"]] <- studyPopDb
  dpop[["attrition"]] <- attrition

  return(dpop)
}
Any scripts or data that you put into this service are public.
IncidencePrevalence documentation built on Aug. 8, 2025, 6:38 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
IncidencePrevalence
Estimate Incidence and Prevalence using the OMOP Common Data Model

R/getDenominatorCohorts.R
In IncidencePrevalence: Estimate Incidence and Prevalence using the OMOP Common Data Model

Defines functions getDenominatorCohorts

Try the IncidencePrevalence package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

IncidencePrevalence Estimate Incidence and Prevalence using the OMOP Common Data Model

R/getDenominatorCohorts.R In IncidencePrevalence: Estimate Incidence and Prevalence using the OMOP Common Data Model

Defines functions getDenominatorCohorts

Try the IncidencePrevalence package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

IncidencePrevalence
Estimate Incidence and Prevalence using the OMOP Common Data Model

R/getDenominatorCohorts.R
In IncidencePrevalence: Estimate Incidence and Prevalence using the OMOP Common Data Model