R/getPrevalence.R
In IncidencePrevalence: Estimate Incidence and Prevalence using the OMOP Common Data Model

# Copyright 2025 DARWIN EU®
#
# This file is part of IncidencePrevalence
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

getPrevalence <- function(cdm,
                          denominatorTable,
                          denominatorCohortId,
                          outcomeTable,
                          outcomeCohortId,
                          type,
                          weeks,
                          months,
                          quarters,
                          years,
                          overall,
                          completeDatabaseIntervals,
                          timePoint,
                          fullContribution,
                          level,
                          tablePrefix,
                          analysisId,
                          strata,
                          includeOverallStrata) {
  # keeping outcome of interest
  # of people in the denominator of interest
  studyPop <- cdm[[denominatorTable]] %>%
    dplyr::mutate(
      cohort_start_date = as.Date(.data$cohort_start_date),
      cohort_end_date = as.Date(.data$cohort_end_date)
    ) %>%
    dplyr::filter(.data$cohort_definition_id ==
      .env$denominatorCohortId) %>%
    dplyr::select(-"cohort_definition_id") %>%
    dplyr::left_join(
      cdm[[outcomeTable]] %>%
        dplyr::filter(.data$cohort_definition_id == .env$outcomeCohortId) %>%
        dplyr::rename(
          "outcome_start_date" = "cohort_start_date",
          "outcome_end_date" = "cohort_end_date"
        ) %>%
        dplyr::select(
          "subject_id", "outcome_start_date",
          "outcome_end_date"
        ),
      by = "subject_id"
    )

  studyPop <- studyPop %>%
    dplyr::collect()

  attrition <- recordAttrition(
    table = studyPop,
    id = "subject_id",
    reasonId = 11,
    reason = "Starting analysis population"
  )

  startEnd <- studyPop %>%
    dplyr::summarise(
      min = min(.data$cohort_start_date, na.rm = TRUE),
      max = max(.data$cohort_end_date, na.rm = TRUE)
    ) %>%
    dplyr::collect()

  # get studyDays as a function of inputs
  interval <- c(
    "weeks" = weeks,
    "months" = months,
    "quarters" = quarters,
    "years" = years,
    "overall" = overall
  )
  interval <- names(interval[interval])
  # study dates
  # based on the earliest start and latest end of those
  # in the relevant denominator
  studyDays <- list()
  if (nrow(studyPop) > 0) {
    startEnd <- studyPop %>%
      dplyr::summarise(
        min = as.Date(min(.data$cohort_start_date, na.rm = TRUE)),
        max = as.Date(max(.data$cohort_end_date, na.rm = TRUE))
      )
    if ("weeks" %in% interval) {
      studyDays[["weeks"]] <- getStudyDays(
        startDate = startEnd$min,
        endDate = startEnd$max,
        timeInterval = "weeks",
        completeDatabaseIntervals = completeDatabaseIntervals,
        type = type,
        timePoint = timePoint
      )
    }
    if ("months" %in% interval) {
      studyDays[["months"]] <- getStudyDays(
        startDate = startEnd$min,
        endDate = startEnd$max,
        timeInterval = "months",
        completeDatabaseIntervals = completeDatabaseIntervals,
        type = type,
        timePoint = timePoint
      )
    }
    if ("quarters" %in% interval) {
      studyDays[["quarters"]] <- getStudyDays(
        startDate = startEnd$min,
        endDate = startEnd$max,
        timeInterval = "quarters",
        completeDatabaseIntervals = completeDatabaseIntervals,
        type = type,
        timePoint = timePoint
      )
    }
    if ("years" %in% interval) {
      studyDays[["years"]] <- getStudyDays(
        startDate = startEnd$min,
        endDate = startEnd$max,
        timeInterval = "years",
        completeDatabaseIntervals = completeDatabaseIntervals,
        type = type,
        timePoint = timePoint
      )
    }
    if ("overall" %in% interval) {
      # note, full periods argument does not apply
      # for overall we just go from start to end
      studyDays[["overall"]] <- dplyr::tibble(
        time = "overall",
        start_time = startEnd$min,
        end_time = startEnd$max
      )
    }
  } else {
    # empty population
    studyDays[["none"]] <- dplyr::tibble()
  }

  if (nrow(dplyr::bind_rows(studyDays)) == 0) {
    # no study days
    studyDays[["none"]] <- dplyr::tibble()
  }

  if ("none" %in% names(studyDays)) {
    # if no study days we<U+00B4>ll return an empty tibble
    pr <- dplyr::tibble()

    attrition <- recordAttrition(
      table = dplyr::tibble(subject_id = integer()),
      id = "subject_id",
      reasonId = 12,
      reason = "Not observed during the complete database interval",
      existingAttrition = attrition
    )
    attrition <- recordAttrition(
      table = dplyr::tibble(subject_id = integer()),
      id = "subject_id",
      reasonId = 13,
      reason = "Do not satisfy full contribution requirement for any time interval",
      existingAttrition = attrition
    )
  }

  if (!"none" %in% names(studyDays)) {
    # drop for complete database intervals requirement
    minStartDate <- min(dplyr::bind_rows(studyDays)$start_time)
    maxStartDate <- max(dplyr::bind_rows(studyDays)$end_time)

    minStartDateChar <- as.character(minStartDate)
    maxStartDateChar <- as.character(maxStartDate)

    allstudyDays <- dplyr::bind_rows(studyDays)

    studyPop <- studyPop %>%
      dplyr::filter(
        .data$cohort_end_date >= as.Date(.env$minStartDateChar),
        .data$cohort_start_date <= as.Date(.env$maxStartDateChar)
      )

    attrition <- recordAttrition(
      table = studyPop,
      id = "subject_id",
      reasonId = 12,
      reason = "Not observed during the complete database interval",
      existingAttrition = attrition
    )

    # drop people who never fulfill contribution requirement
    if (fullContribution == TRUE) {
      checkExpression <- glue::glue("(.data$cohort_end_date >= allstudyDays$end_time[{seq_along(allstudyDays$end_time)}]) &
           (.data$cohort_start_date <= allstudyDays$start_time[{seq_along(allstudyDays$start_time)}])") %>%
        paste0(collapse = "|") %>%
        rlang::parse_expr()

      studyPop <- studyPop %>%
        dplyr::mutate(
          has_full_contribution = dplyr::if_else(!!checkExpression,
            1L,
            0L
          )
        ) %>%
        dplyr::filter(.data$has_full_contribution >= 1) %>%
        dplyr::select(-"has_full_contribution")

      attrition <- recordAttrition(
        table = studyPop,
        id = "subject_id",
        reasonId = 13,
        reason = "Do not satisfy full contribution requirement for an interval",
        existingAttrition = attrition
      )
    } else if (fullContribution == FALSE) { # otherwise nobody excluded
      attrition <- recordAttrition(
        table = studyPop,
        id = "subject_id",
        reasonId = 14,
        reason = "Do not satisfy full contribution requirement for an interval",
        existingAttrition = attrition
      )
    }

    # fetch prevalence
    # looping through each time interval
    pr <- list()
    for (j in seq_along(interval)) {
      workingInterval <- interval[j]
      workingStudyDays <- studyDays[[workingInterval]]
      for (i in seq_len(nrow(workingStudyDays))) {
        workingStart <- workingStudyDays$start_time[i]
        workingEnd <- workingStudyDays$end_time[i]

        if (fullContribution == TRUE) {
          # require presence for all of period
          # drop people with end_date not after workingEnd
          # and start_date not before workingStart
          workingPop <- studyPop %>%
            dplyr::filter(
              .data$cohort_end_date >= .env$workingEnd,
              .data$cohort_start_date <= .env$workingStart
            )
        } else {
          # otherwise include people if they can contribute a day
          # drop people with end_date prior to workingStart
          # and start_date after workingEnd
          workingPop <- studyPop %>%
            dplyr::filter(
              .data$cohort_end_date >= .env$workingStart,
              .data$cohort_start_date <= .env$workingEnd
            )
        }

        workingPop <- workingPop %>%
          dplyr::mutate(
            # individuals start date for this period
            # which could be start of the period or later
            cohort_start_date =
              dplyr::if_else(.data$cohort_start_date <= .env$workingStart,
                .env$workingStart,
                as.Date(.data$cohort_start_date)
              ),
            # individuals end date for this period
            # end of the period or earlier
            cohort_end_date =
              dplyr::if_else(.data$cohort_end_date >= .env$workingEnd,
                .env$workingEnd,
                as.Date(.data$cohort_end_date)
              )
          )

        if (length(strata) == 0 || includeOverallStrata == TRUE) {
          # include ongoing in current time of interest
          if(level == "person"){
          result <- workingPop %>%
            dplyr::summarise(
              denominator_count = dplyr::n_distinct(.data$subject_id),
              outcome_count = dplyr::n_distinct(.data$subject_id[
                !is.na(.data$outcome_start_date) &
                  .data$outcome_start_date <= .data$cohort_end_date &
                  .data$outcome_end_date >= .data$cohort_start_date
              ])
            )
          } else { # record level
            result <-  cbind(
              # count of cohort entries
              workingPop |>
              dplyr::select("subject_id", "cohort_start_date") |>
              dplyr::distinct() |>
              dplyr::summarise(denominator_count = dplyr::n()),
              # count of outcomes per cohort entry
                workingPop |>
                  dplyr::mutate(
                    has_outcome = dplyr::if_else(!is.na(.data$outcome_start_date) &
                                                   .data$outcome_start_date <= .data$cohort_end_date &
                                                   .data$outcome_end_date >= .data$cohort_start_date,
                                                 1, 0)
                  ) |>
                  dplyr::select("subject_id", "cohort_start_date", "has_outcome") |>
                  dplyr::distinct() |>
                  dplyr::summarise(outcome_count = sum(.data$has_outcome)))
          }

          pr[[paste0(i, "_", j)]] <- dplyr::tibble(
            denominator_count = result$denominator_count,
            outcome_count = result$outcome_count
          ) %>%
            dplyr::mutate(
              prevalence_start_date = .env$workingStart,
              prevalence_end_date = .env$workingEnd,
              analysis_interval = .env$workingInterval
            )
        } else {
          pr[[paste0(i, "_", j)]] <- dplyr::tibble()
        }

        if (length(strata) >= 1) {
          pr[[paste0(i, "_", j)]] <- pr[[paste0(i, "_", j)]] %>%
            omopgenerics::uniteStrata()
          for (k in seq_along(strata)) {
            pr[[paste0(i, "_", j, "_", k)]] <- dplyr::bind_rows(
              pr[[paste0(i, "_", j)]],
              getStratifiedPrevalenceResult(workingPop,
                workingStrata = strata[[k]], level
              )
            ) %>%
              dplyr::mutate(
                prevalence_start_date = .env$workingStart,
                prevalence_end_date = .env$workingEnd,
                analysis_interval = .env$workingInterval
              )
          }
        }
      }
    }
    pr <- dplyr::bind_rows(pr) %>%
      dplyr::mutate(prevalence = round((.data$outcome_count / .data$denominator_count), 5)) %>%
      dplyr::select(dplyr::any_of(c(
        "outcome_count", "denominator_count",
        "prevalence", "prevalence_start_date", "prevalence_end_date",
        "strata_name", "strata_level", "analysis_interval"
      )))
  }

  results <- list()
  results[["pr"]] <- pr
  results[["attrition"]] <- attrition

  return(results)
}

getStratifiedPrevalenceResult <- function(workingPop, workingStrata, level) {
  # include ongoing in current time of interest
  if(level == "person"){
  result <- workingPop %>%
    dplyr::group_by(dplyr::pick(.env$workingStrata)) %>%
    dplyr::summarise(
      denominator_count = dplyr::n_distinct(.data$subject_id),
      outcome_count = dplyr::n_distinct(.data$subject_id[
        !is.na(.data$outcome_start_date) &
          .data$outcome_start_date <= .data$cohort_end_date &
          .data$outcome_end_date >= .data$cohort_start_date
      ])
    ) %>%
    dplyr::ungroup()
  } else { # record level
    result <-  workingPop |>
      dplyr::group_by(dplyr::pick(.env$workingStrata)) |>
      dplyr::select(.env$workingStrata, "subject_id", "cohort_start_date") |>
      dplyr::distinct() |>
      dplyr::summarise(denominator_count = dplyr::n()) |>
      dplyr::left_join(
        workingPop |>
      dplyr::group_by(dplyr::pick(.env$workingStrata)) |>
      dplyr::mutate(
        has_outcome = dplyr::if_else(!is.na(.data$outcome_start_date) &
          .data$outcome_start_date <= .data$cohort_end_date &
          .data$outcome_end_date >= .data$cohort_start_date,
        1, 0)
      ) |>
      dplyr::select(.env$workingStrata, "subject_id", "cohort_start_date", "has_outcome") |>
      dplyr::distinct() |>
      dplyr::summarise(outcome_count = sum(.data$has_outcome)),
      by = workingStrata)

  }

  result <- result %>%
    omopgenerics::uniteStrata(cols = workingStrata)

  return(result)
}
Any scripts or data that you put into this service are public.
IncidencePrevalence documentation built on Aug. 8, 2025, 6:38 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
IncidencePrevalence
Estimate Incidence and Prevalence using the OMOP Common Data Model

R/getPrevalence.R
In IncidencePrevalence: Estimate Incidence and Prevalence using the OMOP Common Data Model

Defines functions getStratifiedPrevalenceResult getPrevalence

Try the IncidencePrevalence package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

IncidencePrevalence Estimate Incidence and Prevalence using the OMOP Common Data Model

R/getPrevalence.R In IncidencePrevalence: Estimate Incidence and Prevalence using the OMOP Common Data Model

Defines functions getStratifiedPrevalenceResult getPrevalence

Try the IncidencePrevalence package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

IncidencePrevalence
Estimate Incidence and Prevalence using the OMOP Common Data Model

R/getPrevalence.R
In IncidencePrevalence: Estimate Incidence and Prevalence using the OMOP Common Data Model