R/estimateSurvival.R
In CohortSurvival: Estimate Survival from Common Data Model Cohorts

Documented in estimateCompetingRiskSurvival estimateSingleEventSurvival

# Copyright 2023 DARWIN EU®
#
# This file is part of CohortSurvival
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


#' Estimate survival for a given event of interest using cohorts in the OMOP Common Data Model
#'
#' @param cdm CDM reference
#' @param targetCohortTable targetCohortTable
#' @param outcomeCohortTable The outcome cohort table of interest.
#' @param targetCohortId targetCohortId
#' @param outcomeCohortId ID of event cohorts to include. Only one outcome
#' (and so one ID) can be considered.
#' @param outcomeDateVariable  Variable containing date of outcome event
#' @param outcomeWashout Washout time in days for the outcome
#' @param censorOnCohortExit If TRUE, an individual's follow up will be
#' censored at their cohort exit
#' @param censorOnDate if not NULL, an individual's follow up will be censored
#' at the given date
#' @param followUpDays Number of days to follow up individuals (lower bound 1,
#' upper bound Inf)
#' @param strata strata
#' @param eventGap Days between time points for which to report survival
#' events, which are grouped into the specified intervals.
#' @param estimateGap Days between time points for which to report survival
#' estimates. First day will be day zero with risk estimates provided
#' for times up to the end of follow-up, with a gap in days equivalent
#' to eventGap.
#' @param restrictedMeanFollowUp number of days of follow-up to take into account
#' when calculating restricted mean for all cohorts
#' @param minimumSurvivalDays Minimum number of days required for the main cohort
#' to have survived
#'
#' @return tibble with survival information for desired cohort, including:
#' time, people at risk, survival probability, cumulative incidence,
#' 95 CIs, strata and outcome. A tibble with the number of events is
#' outputted as an attribute of the output
#' @export
#'
#' @examples
#' \donttest{
#' cdm <- mockMGUS2cdm()
#' surv <- estimateSingleEventSurvival(
#'   cdm = cdm,
#'   targetCohortTable = "mgus_diagnosis",
#'   targetCohortId = 1,
#'   outcomeCohortTable = "death_cohort",
#'   outcomeCohortId = 1,
#'   eventGap = 7
#' )
#' }
#'
estimateSingleEventSurvival <- function(cdm,
                                        targetCohortTable,
                                        outcomeCohortTable,
                                        targetCohortId = NULL,
                                        outcomeCohortId = NULL,
                                        outcomeDateVariable = "cohort_start_date",
                                        outcomeWashout = Inf,
                                        censorOnCohortExit = FALSE,
                                        censorOnDate = NULL,
                                        followUpDays = Inf,
                                        strata = NULL,
                                        eventGap = 30,
                                        estimateGap = 1,
                                        restrictedMeanFollowUp = NULL,
                                        minimumSurvivalDays = 1) {
  # Get ids of interest
    if (is.null(targetCohortId)) {
    targetCohortId <- getCohortId(targetCohortTable, cdm)
  }
  if (is.null(outcomeCohortId)) {
    outcomeCohortId <- getCohortId(outcomeCohortTable, cdm)
  }

  # Make sure attrition is up to date for our outcome cohort
  cdm[[outcomeCohortTable]] <- cdm[[outcomeCohortTable]] %>%
    omopgenerics::recordCohortAttrition("Update attrition")

  emptyOutcomes <- omopgenerics::settings(cdm[[outcomeCohortTable]]) %>%
    dplyr::left_join(
      omopgenerics::cohortCount(cdm[[outcomeCohortTable]]),
      by = "cohort_definition_id") %>%
    dplyr::filter(.data$number_records == 0)
  if(nrow(emptyOutcomes) > 0){
    emptyOutcomenames <- emptyOutcomes %>% dplyr::pull("cohort_name")
    cli::cli_warn("Outcome cohort{?s} {emptyOutcomenames} {?is/are} empty")
  }

  emptyTargets <- omopgenerics::settings(cdm[[targetCohortTable]]) %>%
    dplyr::left_join(
      omopgenerics::cohortCount(cdm[[targetCohortTable]]),
      by = "cohort_definition_id") %>%
    dplyr::filter(.data$number_records == 0)
  if(nrow(emptyTargets) > 0){
    emptyTargetnames <- emptyTargets %>% dplyr::pull("cohort_name")
    cli::cli_warn("Target cohort{?s} {emptyTargetnames} {?is/are} empty")
  }

  outcomeCohortId <- outcomeCohortId[!(outcomeCohortId %in% (emptyOutcomes %>% dplyr::pull("cohort_definition_id")))]

  # Apply the survival estimation over all combinations of target and outcome cohorts
  results <- expand.grid(target_idx = seq_along(targetCohortId), outcome_idx = seq_along(outcomeCohortId)) %>%
    purrr::pmap(function(target_idx, outcome_idx) {

      # Get the target and outcome cohort names
      working_target_id <- targetCohortId[target_idx]
      working_target <- omopgenerics::settings(cdm[[targetCohortTable]]) %>%
        dplyr::filter(.data$cohort_definition_id == working_target_id) %>%
        dplyr::pull("cohort_name")
      working_outcome_id <- outcomeCohortId[outcome_idx]
      working_outcome <- omopgenerics::settings(cdm[[outcomeCohortTable]]) %>%
        dplyr::filter(.data$cohort_definition_id == working_outcome_id) %>%
        dplyr::pull("cohort_name")
      cli::cli_inform("- Getting survival for target cohort '{working_target}' and outcome cohort '{working_outcome}'")

      # Estimate survival and collect results
      surv <- estimateSurvival(
        cdm = cdm,
        targetCohortTable = targetCohortTable,
        targetCohortId = working_target_id,
        outcomeCohortTable = outcomeCohortTable,
        outcomeCohortId = working_outcome_id,
        outcomeDateVariable = outcomeDateVariable,
        outcomeWashout = outcomeWashout,
        competingOutcomeCohortTable = NULL,
        competingOutcomeCohortId = 1,
        competingOutcomeDateVariable = "cohort_start_date",
        censorOnCohortExit = censorOnCohortExit,
        censorOnDate = censorOnDate,
        followUpDays = followUpDays,
        strata = strata,
        eventGap = eventGap,
        estimateGap = estimateGap,
        restrictedMeanFollowUp = restrictedMeanFollowUp,
        minimumSurvivalDays = minimumSurvivalDays
      )

      # Extract attrition, events, and summary
      attrition <- attr(surv, "cohort_attrition") %>%
        dplyr::mutate(target_cohort = working_target, outcome = working_outcome) %>%
        dplyr::collect() %>%
        dplyr::filter(.data$cohort_definition_id == working_target_id)
      events <- attr(surv, 'events')
      summary <- attr(surv, 'summary')

      # Return the result as a list
      return(list(surv = surv, attrition = attrition, events = events, summary = summary))
    })

  # Extract the results into separate lists
  surv <- lapply(results, `[[`, "surv")
  attrition <- lapply(results, `[[`, "attrition")
  events <- lapply(results, `[[`, "events")
  summary <- lapply(results, `[[`, "summary")

  # Remove empty elements
  surv[lengths(surv) == 0] <- NULL
  events[lengths(events) == 0] <- NULL
  attrition[lengths(attrition) == 0] <- NULL
  summary[lengths(summary) == 0] <- NULL

  # Bind rows for different components
  estimates <- dplyr::bind_rows(surv)
  events <- dplyr::bind_rows(events)
  attrition <- dplyr::bind_rows(attrition)
  summary <- dplyr::bind_rows(summary)

  if(nrow(attrition) > 0) {
    # Clean attrition data
    attrition <- attrition %>%
      dplyr::distinct() %>%
      dplyr::mutate(
        cdm_name = attr(cdm, "cdm_name"),
        package_name = "CohortSurvival",
        package_version = as.character(utils::packageVersion("CohortSurvival")),
        result_type = "survival_attrition",
        group_name = "target_cohort",
        variable_level = .data$outcome,
        analysis_type = "single_event",
        estimate_name = "count"
      ) %>%
      tidyr::pivot_longer(
        cols = c(
          "number_records", "number_subjects", "excluded_records",
          "excluded_subjects"
        ),
        names_to = "variable_name",
        values_to = "estimate_value"
      ) %>%
      dplyr::mutate(
        "estimate_value" = as.character(.data$estimate_value),
        "estimate_type" = "integer",
        competing_outcome = "none"
      ) %>%
      omopgenerics::uniteStrata("reason") %>%
      omopgenerics::uniteAdditional("reason_id")

    if (attrition %>% dplyr::group_by("target_cohort") %>% dplyr::tally() %>% dplyr::pull("n") ==
        attrition %>% dplyr::group_by("target_cohort", "cohort_definition_id") %>% dplyr::tally() %>% dplyr::pull("n")) {
      attrition <- attrition %>%
        dplyr::mutate(target_cohort = paste0(.data$target_cohort, "_", .data$cohort_definition_id),
                      group_level = .data$target_cohort)
    }

    attrition <- attrition %>%
      dplyr::select(-c("cohort_definition_id"))
  }

  # Check if estimates are not empty
  if(nrow(estimates) > 0) {
    settings <- estimates %>%
      dplyr::select("result_type",
                    "package_name",
                    "package_version",
                    "analysis_type",
                    "outcome",
                    "competing_outcome") %>%
      dplyr::distinct()

    settings <- settings %>%
      dplyr::mutate(eventgap = NA) %>%
      dplyr::union_all(
        events %>%
          dplyr::select("result_type",
                        "package_name",
                        "package_version",
                        "analysis_type",
                        "outcome",
                        "competing_outcome",
                        "eventgap") %>%
          dplyr::distinct()
      )

    settings <- settings %>%
      dplyr::union_all(
        summary %>%
          dplyr::select("result_type",
                        "package_name",
                        "package_version",
                        "analysis_type",
                        "outcome",
                        "competing_outcome") %>%
          dplyr::mutate(eventgap = NA) %>%
          dplyr::distinct()
      )

    settings <- settings %>%
      dplyr::union_all(
        attrition %>%
          dplyr::select("result_type",
                        "analysis_type",
                        "package_name",
                        "package_version",
                        "outcome") %>%
          dplyr::mutate(eventgap = NA,
                        competing_outcome = "none") %>%
          dplyr::distinct()
      )

    settings <- settings %>%
      dplyr::mutate(result_id = c(1:nrow(settings))) %>%
      dplyr::relocate("result_id", .before = "result_type")

    estimates <- estimates %>%
      dplyr::mutate(additional_name = "time",
                    time = as.character(.data$time)) %>%
      dplyr::rename("additional_level" = "time")

    events <- events %>%
      dplyr::mutate(additional_name = "time",
                    time = as.character(.data$time)) %>%
      dplyr::rename("additional_level" = "time")

    complete_results <- estimates %>%
      dplyr::bind_rows(events) %>%
      dplyr::bind_rows(summary) %>%
      dplyr::mutate(estimate_value = as.character(.data$estimate_value)) %>%
      dplyr::bind_rows(attrition) %>%
      dplyr::left_join(settings,
                       by = c("analysis_type", "package_name", "package_version", "result_type", "outcome", "competing_outcome", "eventgap")) %>%
      dplyr::mutate(
        additional_name = dplyr::if_else(is.na(.data$additional_name), "overall", .data$additional_name),
        additional_level = dplyr::if_else(is.na(.data$additional_level), "overall", .data$additional_level)
      )

    complete_results <- complete_results %>%
      dplyr::select(omopgenerics::resultColumns())

    settings <- settings %>%
      dplyr::mutate(
        outcome_date_variable = .env$outcomeDateVariable,
        outcome_washout = .env$outcomeWashout,
        censor_on_cohort_exit = .env$censorOnCohortExit,
        censor_on_date = .env$censorOnDate,
        follow_up_days = .env$followUpDays,
        restricted_mean_follow_up = .env$restrictedMeanFollowUp,
        minimum_survival_days = .env$minimumSurvivalDays
      )

    surv_estimates <- omopgenerics::newSummarisedResult(complete_results,
                                                        settings = settings)

    attr(surv_estimates, "cohort_attrition") <- NULL
    attr(surv_estimates, "summary") <- NULL
    attr(surv_estimates, "events") <- NULL

    # Suppress results with omopgenerics
    surv_estimates <- surv_estimates %>%
      dplyr::mutate(estimate_name = dplyr::if_else(
        .data$estimate_name %in% c("n_risk", "n_events", "n_censor", "number_records"),
        paste0(.data$estimate_name,"_count"), .data$estimate_name
      ))
  } else if (nrow(estimates) == 0 & nrow(attrition) > 0) {
    settings <- attrition %>%
      dplyr::select("result_type",
                    "package_name",
                    "package_version",
                    "analysis_type",
                    "outcome",
                    "competing_outcome") %>%
      dplyr::distinct() %>%
      dplyr::mutate(eventgap = NA,
                    competing_outcome = "none")
    settings <- settings %>%
      dplyr::mutate(result_id = c(1:nrow(settings))) %>%
      dplyr::relocate("result_id", .before = "result_type")

    complete_results <- attrition %>%
      dplyr::mutate(eventgap = NA) %>%
      dplyr::left_join(settings,
                       by = c("analysis_type", "package_name", "package_version", "result_type", "outcome", "competing_outcome", "eventgap")) %>%
      dplyr::mutate(
        additional_name = dplyr::if_else(is.na(.data$additional_name), "overall", .data$additional_name),
        additional_level = dplyr::if_else(is.na(.data$additional_level), "overall", .data$additional_level)
      )

    complete_results <- complete_results %>%
      dplyr::select(omopgenerics::resultColumns())

    settings <- settings %>%
      dplyr::mutate(
        outcome_date_variable = .env$outcomeDateVariable,
        outcome_washout = .env$outcomeWashout,
        censor_on_cohort_exit = .env$censorOnCohortExit,
        censor_on_date = .env$censorOnDate,
        follow_up_days = .env$followUpDays,
        restricted_mean_follow_up = .env$restrictedMeanFollowUp,
        minimum_survival_days = .env$minimumSurvivalDays
      )

    surv_estimates <- omopgenerics::newSummarisedResult(complete_results,
                                                        settings = settings)
  } else {
    complete_results <- omopgenerics::emptySummarisedResult()
    surv_estimates <- omopgenerics::newSummarisedResult(complete_results)
  }

  return(surv_estimates)
}

#' Estimate survival for a given event and competing risk of interest using
#' cohorts in the OMOP Common Data Model
#'
#' @param cdm CDM reference
#' @param targetCohortTable targetCohortTable
#' @param outcomeCohortTable The outcome cohort table of interest.
#' @param competingOutcomeCohortTable The competing outcome cohort table of interest.
#' @param targetCohortId targetCohortId
#' @param outcomeCohortId ID of event cohorts to include. Only one outcome
#' (and so one ID) can be considered.
#' @param outcomeDateVariable  Variable containing date of outcome event
#' @param outcomeWashout Washout time in days for the outcome
#' @param competingOutcomeCohortId ID of event cohorts to include. Only one competing outcome
#' (and so one ID) can be considered.
#' @param competingOutcomeDateVariable Variable containing date of competing outcome event
#' @param competingOutcomeWashout Washout time in days for the competing outcome
#' @param censorOnCohortExit If TRUE, an individual's follow up will be
#' censored at their cohort exit
#' @param censorOnDate if not NULL, an individual's follow up will be censored
#' at the given date
#' @param followUpDays Number of days to follow up individuals (lower bound 1,
#' upper bound Inf)
#' @param strata strata
#' @param eventGap Days between time points for which to report survival
#' events, which are grouped into the specified intervals.
#' @param estimateGap Days between time points for which to report survival
#' estimates. First day will be day zero with risk estimates provided
#' for times up to the end of follow-up, with a gap in days equivalent
#' to eventGap.
#' @param restrictedMeanFollowUp number of days of follow-up to take into account
#' when calculating restricted mean for all cohorts
#' @param minimumSurvivalDays Minimum number of days required for the main cohort
#' to have survived
#'
#' @return tibble with survival information for desired cohort, including:
#' time, people at risk, survival probability, cumulative incidence,
#' 95 CIs, strata and outcome. A tibble with the number of events is
#' outputted as an attribute of the output
#' @export
#'
#' @examples
#' \donttest{
#' cdm <- mockMGUS2cdm()
#' surv <- estimateCompetingRiskSurvival(
#'   cdm = cdm,
#'   targetCohortTable = "mgus_diagnosis",
#'   targetCohortId = 1,
#'   outcomeCohortTable = "progression",
#'   outcomeCohortId = 1,
#'   competingOutcomeCohortTable = "death_cohort",
#'   competingOutcomeCohortId = 1,
#'   eventGap = 7
#' )
#' }
#'
estimateCompetingRiskSurvival <- function(cdm,
                                          targetCohortTable,
                                          outcomeCohortTable,
                                          competingOutcomeCohortTable,
                                          targetCohortId = NULL,
                                          outcomeCohortId = NULL,
                                          outcomeDateVariable = "cohort_start_date",
                                          outcomeWashout = Inf,
                                          competingOutcomeCohortId = NULL,
                                          competingOutcomeDateVariable = "cohort_start_date",
                                          competingOutcomeWashout = Inf,
                                          censorOnCohortExit = FALSE,
                                          censorOnDate = NULL,
                                          followUpDays = Inf,
                                          strata = NULL,
                                          eventGap = 30,
                                          estimateGap = 1,
                                          restrictedMeanFollowUp = NULL,
                                          minimumSurvivalDays = 1) {
  # Get ids of interest
  if (is.null(targetCohortId)) {
    targetCohortId <- getCohortId(targetCohortTable, cdm)
  }
  if (is.null(outcomeCohortId)) {
    outcomeCohortId <- getCohortId(outcomeCohortTable, cdm)
  }
  if (is.null(competingOutcomeCohortId)) {
    competingOutcomeCohortId <- getCohortId(competingOutcomeCohortTable, cdm)
  }

  # Make sure attrition is up to date for our outcome cohorts
  cdm[[outcomeCohortTable]] <- cdm[[outcomeCohortTable]] %>%
    omopgenerics::recordCohortAttrition("update attrition")
  cdm[[competingOutcomeCohortTable]] <- cdm[[competingOutcomeCohortTable]] %>%
    omopgenerics::recordCohortAttrition("update attrition")

  # Handle empty cohorts
  emptyOutcomes <- omopgenerics::settings(cdm[[outcomeCohortTable]]) %>%
    dplyr::left_join(
      omopgenerics::cohortCount(cdm[[outcomeCohortTable]]),
      by = "cohort_definition_id") %>%
    dplyr::filter(.data$number_records == 0)
  emptyCompetingOutcomes <- omopgenerics::settings(cdm[[competingOutcomeCohortTable]]) %>%
    dplyr::left_join(
      omopgenerics::cohortCount(cdm[[competingOutcomeCohortTable]]),
      by = "cohort_definition_id") %>%
    dplyr::filter(.data$number_records == 0)
  if(nrow(emptyOutcomes) > 0){
    emptyOutcomenames <- emptyOutcomes %>% dplyr::pull("cohort_name")
    cli::cli_warn("Outcome cohort{?s} {emptyOutcomenames} {?is/are} empty")
  }
  if(nrow(emptyCompetingOutcomes) > 0){
    emptyCompetingOutcomenames <- emptyCompetingOutcomes %>% dplyr::pull("cohort_name")
    cli::cli_warn("Competing outcome cohort{?s} {emptyCompetingOutcomenames} {?is/are} empty")
  }

  emptyTargets <- omopgenerics::settings(cdm[[targetCohortTable]]) %>%
    dplyr::left_join(
      omopgenerics::cohortCount(cdm[[targetCohortTable]]),
      by = "cohort_definition_id") %>%
    dplyr::filter(.data$number_records == 0)
  if(nrow(emptyTargets) > 0){
    emptyTargetnames <- emptyTargets %>% dplyr::pull("cohort_name")
    cli::cli_warn("Target cohort{?s} {emptyTargetnames} {?is/are} empty")
  }

  outcomeCohortId <- outcomeCohortId[!(outcomeCohortId %in% (emptyOutcomes %>% dplyr::pull("cohort_definition_id")))]
  competingOutcomeCohortId <- competingOutcomeCohortId[!(competingOutcomeCohortId %in% (emptyCompetingOutcomes %>% dplyr::pull("cohort_definition_id")))]

  # Apply survival to all combinations of ids
  results <- expand.grid(target_idx = seq_along(targetCohortId), outcome_idx = seq_along(outcomeCohortId),
                         competing_outcome_idx = seq_along(competingOutcomeCohortId)) %>%
    purrr::pmap(function(target_idx, outcome_idx, competing_outcome_idx) {

      # Get the target and outcome cohorts names
      working_target_id <- targetCohortId[target_idx]
      working_target <- omopgenerics::settings(cdm[[targetCohortTable]]) %>%
        dplyr::filter(.data$cohort_definition_id == working_target_id) %>%
        dplyr::pull("cohort_name")
      working_outcome_id <- outcomeCohortId[outcome_idx]
      working_outcome <- omopgenerics::settings(cdm[[outcomeCohortTable]]) %>%
        dplyr::filter(.data$cohort_definition_id == working_outcome_id) %>%
        dplyr::pull("cohort_name")
      working_competing_outcome_id <- competingOutcomeCohortId[competing_outcome_idx]
      working_competing_outcome <- omopgenerics::settings(cdm[[competingOutcomeCohortTable]]) %>%
        dplyr::filter(.data$cohort_definition_id == working_competing_outcome_id) %>%
        dplyr::pull("cohort_name")
      cli::cli_inform("- Getting survival for target cohort '{working_target}', outcome cohort '{working_outcome}' and
                      competing outcome cohort '{working_competing_outcome}'")

      # Estimate survival and collect results
      surv <- estimateSurvival(
        cdm = cdm,
        targetCohortTable = targetCohortTable,
        targetCohortId = working_target_id,
        outcomeCohortTable = outcomeCohortTable,
        outcomeCohortId = working_outcome_id,
        outcomeDateVariable = outcomeDateVariable,
        outcomeWashout = outcomeWashout,
        competingOutcomeCohortTable = competingOutcomeCohortTable,
        competingOutcomeCohortId = working_competing_outcome_id,
        competingOutcomeDateVariable = competingOutcomeDateVariable,
        competingOutcomeWashout = competingOutcomeWashout,
        censorOnCohortExit = censorOnCohortExit,
        censorOnDate = censorOnDate,
        followUpDays = followUpDays,
        strata = strata,
        eventGap = eventGap,
        estimateGap = estimateGap,
        restrictedMeanFollowUp = restrictedMeanFollowUp,
        minimumSurvivalDays = minimumSurvivalDays
      )

      # Extract attrition, events, and summary
      if(!is.null(attr(surv, "cohort_attrition"))) {
        attrition <- attr(surv, "cohort_attrition") %>%
          dplyr::mutate(target_cohort = working_target, outcome = working_outcome,
                        competing_outcome = working_competing_outcome) %>%
          dplyr::collect() %>%
          dplyr::filter(.data$cohort_definition_id == working_target_id)
      } else {
        attrition <- attr(surv, "cohort_attrition")
      }
      events <- attr(surv, 'events')
      summary <- attr(surv, 'summary')

      # Return the result as a list
      return(list(surv = surv, attrition = attrition, events = events, summary = summary))
    })

  # Extract the results into separate lists
  surv <- lapply(results, `[[`, "surv")
  attrition <- lapply(results, `[[`, "attrition")
  events <- lapply(results, `[[`, "events")
  summary <- lapply(results, `[[`, "summary")

  # Remove empty elements
  surv[lengths(surv) == 0] <- NULL
  events[lengths(events) == 0] <- NULL
  attrition[lengths(attrition) == 0] <- NULL
  summary[lengths(summary) == 0] <- NULL

  # Bind rows for different components
  estimates <- dplyr::bind_rows(surv)
  events <- dplyr::bind_rows(events)
  attrition <- dplyr::bind_rows(attrition)
  summary <- dplyr::bind_rows(summary)

  if(nrow(attrition) > 0) {
    # Clean attrition data
    attrition <- attrition %>%
      dplyr::distinct() %>%
      dplyr::mutate(
        cdm_name = attr(cdm, "cdm_name"),
        package_name = "CohortSurvival",
        package_version = as.character(utils::packageVersion("CohortSurvival")),
        result_type = "survival_attrition",
        group_name = "target_cohort",
        variable_level = .data$outcome,
        analysis_type = "competing_risk",
        estimate_name = "count"
      ) %>%
      tidyr::pivot_longer(
        cols = c(
          "number_records", "number_subjects", "excluded_records",
          "excluded_subjects"
        ),
        names_to = "variable_name",
        values_to = "estimate_value"
      ) %>%
      dplyr::mutate(
        "estimate_value" = as.character(.data$estimate_value),
        "estimate_type" = "integer",
        competing_outcome = "none"
      ) %>%
      omopgenerics::uniteStrata("reason") %>%
      omopgenerics::uniteAdditional("reason_id")

    if (attrition %>% dplyr::group_by("target_cohort") %>% dplyr::tally() %>% dplyr::pull("n") ==
        attrition %>% dplyr::group_by("target_cohort", "cohort_definition_id") %>% dplyr::tally() %>% dplyr::pull("n")) {
      attrition <- attrition %>%
        dplyr::mutate(target_cohort = paste0(.data$target_cohort, "_", .data$cohort_definition_id),
                      group_level = .data$target_cohort)
    }

    attrition <- attrition %>%
      dplyr::select(-c("cohort_definition_id"))
  }

  # Check if estimates are not empty
  if(nrow(estimates) > 0) {
    settings <- estimates %>%
      dplyr::select("result_type",
                    "package_name",
                    "package_version",
                    "analysis_type",
                    "outcome",
                    "competing_outcome") %>%
      dplyr::distinct()

    settings <- settings %>%
      dplyr::mutate(eventgap = NA) %>%
      dplyr::union_all(
        events %>%
          dplyr::select("result_type",
                        "package_name",
                        "package_version",
                        "analysis_type",
                        "outcome",
                        "competing_outcome",
                        "eventgap") %>%
          dplyr::distinct()
      )

    settings <- settings %>%
      dplyr::union_all(
        summary %>%
          dplyr::select("result_type",
                        "package_name",
                        "package_version",
                        "analysis_type",
                        "outcome",
                        "competing_outcome") %>%
          dplyr::mutate(eventgap = NA) %>%
          dplyr::distinct()
      )

    settings <- settings %>%
      dplyr::union_all(
        attrition %>%
          dplyr::select("result_type",
                        "analysis_type",
                        "package_name",
                        "package_version",
                        "outcome") %>%
          dplyr::mutate(eventgap = NA,
                        competing_outcome = "none") %>%
          dplyr::distinct()
      )

    settings <- settings %>%
      dplyr::mutate(result_id = c(1:nrow(settings))) %>%
      dplyr::relocate("result_id", .before = "result_type")

    estimates <- estimates %>%
      dplyr::mutate(additional_name = "time",
                    time = as.character(.data$time)) %>%
      dplyr::rename("additional_level" = "time")

    events <- events %>%
      dplyr::mutate(additional_name = "time",
                    time = as.character(.data$time)) %>%
      dplyr::rename("additional_level" = "time")

    complete_results <- estimates %>%
      dplyr::bind_rows(events) %>%
      dplyr::bind_rows(summary) %>%
      dplyr::mutate(estimate_value = as.character(.data$estimate_value)) %>%
      dplyr::bind_rows(attrition) %>%
      dplyr::left_join(settings,
                       by = c("analysis_type", "package_name", "package_version", "result_type", "outcome", "competing_outcome", "eventgap")) %>%
      dplyr::mutate(
        additional_name = dplyr::if_else(is.na(.data$additional_name), "overall", .data$additional_name),
        additional_level = dplyr::if_else(is.na(.data$additional_level), "overall", .data$additional_level)
      )

    complete_results <- complete_results %>%
      dplyr::select(omopgenerics::resultColumns())

    settings <- settings %>%
      dplyr::mutate(
        outcome_date_variable = .env$outcomeDateVariable,
        outcome_washout = .env$outcomeWashout,
        censor_on_cohort_exit = .env$censorOnCohortExit,
        censor_on_date = .env$censorOnDate,
        follow_up_days = .env$followUpDays,
        restricted_mean_follow_up = .env$restrictedMeanFollowUp,
        minimum_survival_days = .env$minimumSurvivalDays
      )

    surv_estimates <- omopgenerics::newSummarisedResult(complete_results,
                                                        settings = settings)

    attr(surv_estimates, "cohort_attrition") <- NULL
    attr(surv_estimates, "summary") <- NULL
    attr(surv_estimates, "events") <- NULL

    # Suppress results with omopgenerics
    surv_estimates <- surv_estimates %>%
      dplyr::mutate(estimate_name = dplyr::if_else(
        .data$estimate_name %in% c("n_risk", "n_events", "n_censor", "number_records"),
        paste0(.data$estimate_name,"_count"), .data$estimate_name
      ))
  } else if (nrow(estimates) == 0 & nrow(attrition) > 0) {
    settings <- attrition %>%
      dplyr::select("result_type",
                    "package_name",
                    "package_version",
                    "analysis_type",
                    "outcome",
                    "competing_outcome") %>%
      dplyr::distinct() %>%
      dplyr::mutate(eventgap = NA,
                    competing_outcome = "none")
    settings <- settings %>%
      dplyr::mutate(result_id = c(1:nrow(settings))) %>%
      dplyr::relocate("result_id", .before = "result_type")

    complete_results <- attrition %>%
      dplyr::mutate(eventgap = NA) %>%
      dplyr::left_join(settings,
                       by = c("analysis_type", "package_name", "package_version", "result_type", "outcome", "competing_outcome", "eventgap")) %>%
      dplyr::mutate(
        additional_name = dplyr::if_else(is.na(.data$additional_name), "overall", .data$additional_name),
        additional_level = dplyr::if_else(is.na(.data$additional_level), "overall", .data$additional_level)
      )

    complete_results <- complete_results %>%
      dplyr::select(omopgenerics::resultColumns())

    settings <- settings %>%
      dplyr::mutate(
        outcome_date_variable = .env$outcomeDateVariable,
        outcome_washout = .env$outcomeWashout,
        censor_on_cohort_exit = .env$censorOnCohortExit,
        censor_on_date = .env$censorOnDate,
        follow_up_days = .env$followUpDays,
        restricted_mean_follow_up = .env$restrictedMeanFollowUp,
        minimum_survival_days = .env$minimumSurvivalDays
      )

    surv_estimates <- omopgenerics::newSummarisedResult(complete_results,
                                                        settings = settings)
  } else {
    complete_results <- omopgenerics::emptySummarisedResult()
    surv_estimates <- omopgenerics::newSummarisedResult(complete_results)
  }

  return(surv_estimates)
}


estimateSurvival <- function(cdm,
                             targetCohortTable,
                             targetCohortId = NULL,
                             outcomeCohortTable,
                             outcomeCohortId = NULL,
                             outcomeDateVariable = "cohort_start_date",
                             outcomeWashout = Inf,
                             competingOutcomeCohortTable = NULL,
                             competingOutcomeCohortId = 1,
                             competingOutcomeDateVariable = "cohort_start_date",
                             competingOutcomeWashout = Inf,
                             censorOnCohortExit = FALSE,
                             censorOnDate = NULL,
                             followUpDays = Inf,
                             strata = NULL,
                             eventGap = 30,
                             estimateGap = 1,
                             restrictedMeanFollowUp = NULL,
                             minimumSurvivalDays = 1) {

  # check inputs
  validateInputSurvival(cdm, targetCohortTable, targetCohortId, outcomeCohortTable,
                        outcomeCohortId, outcomeDateVariable, outcomeWashout,
                        competingOutcomeCohortTable, competingOutcomeCohortId,
                        competingOutcomeDateVariable, competingOutcomeWashout,
                        censorOnCohortExit, censorOnDate, followUpDays,
                        strata, eventGap, estimateGap, restrictedMeanFollowUp,
                        minimumSurvivalDays)

# extract and prepare exposure data
  workingExposureTable <- cdm[[targetCohortTable]] %>%
    dplyr::filter(.data$cohort_definition_id == .env$targetCohortId) %>%
    addCohortSurvival(cdm, outcomeCohortTable, outcomeCohortId, outcomeDateVariable,
      outcomeWashout, censorOnCohortExit, censorOnDate, followUpDays) %>%
    dplyr::rename( "outcome_time" = "time", "outcome_status" = "status") %>%
    dplyr::compute(temporary = FALSE)

  # handle competing risks
  if (!is.null(competingOutcomeCohortTable)) {
    workingExposureTable <- workingExposureTable %>%
      addCohortSurvival(cdm, competingOutcomeCohortTable, competingOutcomeCohortId,
        competingOutcomeDateVariable, competingOutcomeWashout, censorOnCohortExit,
        censorOnDate, followUpDays) %>%
      dplyr::rename("competing_risk_time" = "time", "competing_risk_status" = "status") %>%
      dplyr::compute(temporary = FALSE)
  }

  # collect
  workingExposureTable <- workingExposureTable %>%
    dplyr::filter(!is.na(.data$outcome_time) &&
                    !is.na(.data$outcome_status)) %>%
    dplyr::compute(temporary = FALSE) %>%
    omopgenerics::recordCohortAttrition(reason = "No outcome event in washout period")

  workingExposureTable <- workingExposureTable %>%
    dplyr::filter(.data$outcome_time >= .env$minimumSurvivalDays) %>%
  omopgenerics::recordCohortAttrition(reason = paste0("Survival days for outcome less than ", minimumSurvivalDays))

  if (!is.null(competingOutcomeCohortTable)) {
    workingExposureTable <- workingExposureTable %>%
      dplyr::filter(.data$competing_risk_time >= .env$minimumSurvivalDays) %>%
      omopgenerics::recordCohortAttrition(reason = paste0("Survival days for competing outcome less than ", minimumSurvivalDays))
  }

  survData <- workingExposureTable %>%
    dplyr::collect()

  if (!is.null(competingOutcomeCohortTable)) {
    survData <- addCompetingRiskVars(data = survData, time1 = "outcome_time",
      status1 = "outcome_status", time2 = "competing_risk_time",
      status2 = "competing_risk_status", nameOutTime = "outcome_or_competing_time",
      nameOutStatus = "outcome_or_competing_status")
  }

  # time points to extract survival estimates
  if(survData %>% dplyr::tally() %>% dplyr::pull() != 0) {
    timepoints <- seq(0, if (followUpDays == "Inf") max(survData$outcome_time) else followUpDays, by = estimateGap)
  } else {
    working_target <- omopgenerics::settings(cdm[[targetCohortTable]]) %>%
      dplyr::filter(.data$cohort_definition_id == targetCohortId) %>%
      dplyr::pull("cohort_name")
    working_outcome <- omopgenerics::settings(cdm[[outcomeCohortTable]]) %>%
      dplyr::filter(.data$cohort_definition_id == outcomeCohortId) %>%
      dplyr::pull("cohort_name")
    if(!is.null(competingOutcomeCohortTable)) {
    working_competing_outcome <- omopgenerics::settings(cdm[[competingOutcomeCohortTable]]) %>%
      dplyr::filter(.data$cohort_definition_id == competingOutcomeCohortId) %>%
      dplyr::pull("cohort_name")
      cli::cli_warn("Survival estimation is not possible as there are no individuals
                  available for target {working_target}, outcome
                  {working_outcome} and competing outcome {working_competing_outcome}.
                    Only attrition returned.")
    } else {
      cli::cli_warn("Survival estimation is not possible as there are no individuals
                  available for target {working_target} and outcome
                  {working_outcome}. Only attrition returned.")
    }
    surv <- empty_estimates()
    attr(surv, "cohort_attrition") <- attr(workingExposureTable, "cohort_attrition")
    return(surv)
    }

  # fit survival, with strata
  if (is.null(competingOutcomeCohortTable)) {
    surv <- singleEventSurvival(survData, timepoints, strata, eventGap, restrictedMeanFollowUp)
  } else {
    surv <- competingRiskSurvival(survData, timepoints, strata, eventGap, restrictedMeanFollowUp)
  }

  # process and summarise results
  if (nrow(surv) > 0) {
    survivalEstimates <- addCohortDetails(surv, cdm, targetCohortId, targetCohortTable,
      outcomeCohortId, outcomeCohortTable, competingOutcomeCohortId, competingOutcomeCohortTable,
      "survival") %>%
      tidyr::pivot_longer(cols = "outcome", names_to = "variable_name",
        values_to = "variable_level") %>%
      dplyr::mutate(result_type = dplyr::if_else(
        .data$analysis_type == "competing_risk", "cumulative_failure_probability",
        "survival_probability")) %>%
      dplyr::select(!c("n_risk","variable_type", "n_censor")) %>%
      tidyr::pivot_longer(
        cols = c("estimate", "estimate_95CI_lower", "estimate_95CI_upper"),
        names_to = "estimate_name",
        values_to = "estimate_value"
      ) %>%
      dplyr::mutate(
        outcome = omopgenerics::settings(cdm[[outcomeCohortTable]]) %>%
          dplyr::filter(.data$cohort_definition_id == .env$outcomeCohortId) %>%
          dplyr::pull("cohort_name")
      )

    if(!is.null(competingOutcomeCohortTable)) {
      survivalEstimates <- survivalEstimates %>%
        dplyr::mutate(competing_outcome = omopgenerics::settings(cdm[[competingOutcomeCohortTable]]) %>%
                                         dplyr::filter(.data$cohort_definition_id == .env$competingOutcomeCohortId) %>%
                                         dplyr::pull("cohort_name"))
    } else {
      survivalEstimates <- survivalEstimates %>%
        dplyr::mutate(competing_outcome = "none")
    }

    survivalEstimates <- dplyr::distinct(survivalEstimates)

    # add attributes
    events <- attr(surv, "events")

    attr(survivalEstimates, "events") <- addCohortDetails(events, cdm,
      targetCohortId, targetCohortTable, outcomeCohortId,
      outcomeCohortTable, competingOutcomeCohortId, competingOutcomeCohortTable,
      "survival_events") %>%
      dplyr::select(!"variable_type") %>%
      tidyr::pivot_longer(cols = c("n_risk", "n_events", "n_censor"),
                          names_to = "estimate_name",
                          values_to = "estimate_value") %>%
      dplyr::mutate(
        estimate_type = "numeric",
        variable_name = "outcome"
      ) %>%
      dplyr::rename(variable_level = "outcome") %>%
      dplyr::mutate(outcome = omopgenerics::settings(cdm[[outcomeCohortTable]]) %>%
                      dplyr::filter(.data$cohort_definition_id == .env$outcomeCohortId) %>%
                      dplyr::pull("cohort_name")) %>%
      dplyr::rename("eventgap" = "eventGap")

    if(!is.null(competingOutcomeCohortTable)) {
      attr(survivalEstimates, "events") <- attr(survivalEstimates, "events") %>%
        dplyr::mutate(competing_outcome = omopgenerics::settings(cdm[[competingOutcomeCohortTable]]) %>%
                        dplyr::filter(.data$cohort_definition_id == .env$competingOutcomeCohortId) %>%
                        dplyr::pull("cohort_name"))
    } else {
      attr(survivalEstimates, "events") <- attr(survivalEstimates, "events") %>%
        dplyr::mutate(competing_outcome = "none")
    }

    attr(survivalEstimates, "cohort_attrition") <- attr(workingExposureTable, "cohort_attrition")

    if (is.null(competingOutcomeCohortTable)) {
      summary <- addCohortDetails(
        x = attr(surv, "summary"), cdm, targetCohortId, targetCohortTable,
        outcomeCohortId, outcomeCohortTable, resultType = "survival_summary"
      ) %>%
        dplyr::mutate(analysis_type = "single_event")
    } else {
      attr(surv, "summary") <- attr(surv, "summary") %>%
        dplyr::filter(.data$outcome != "none")

      summary <- addCohortDetails(
        x = attr(surv, "summary"), cdm, targetCohortId, targetCohortTable,
        outcomeCohortId, outcomeCohortTable, competingOutcomeCohortId,
        competingOutcomeCohortTable, "survival_summary"
      ) %>%
        dplyr::mutate(analysis_type = "competing_risk")
    }

    if(is.null(competingOutcomeCohortTable)) {
      summary <- summary %>%
        dplyr::mutate(
          restricted_mean_survival_95CI_upper = .data$restricted_mean_survival + stats::qnorm(0.975) * .data$restricted_mean_survival_se,
          restricted_mean_survival_95CI_lower = .data$restricted_mean_survival - stats::qnorm(0.975) * .data$restricted_mean_survival_se
        ) %>%
        dplyr::select(-"restricted_mean_survival_se")
    }
    summary <- summary %>%
      dplyr::filter(.data$outcome != "none") %>%
      dplyr::mutate(
        variable_name = "outcome",
        variable_level = .data$outcome,
        estimate_type = "numeric"
      ) %>%
      dplyr::select(-c("variable_type", "outcome")) %>%
      tidyr::pivot_longer(
        cols = -c("cdm_name", "package_name", "package_version", "result_type", "group_name", "group_level",
                  "strata_name", "strata_level", "variable_name", "variable_level", "estimate_type", "analysis_type"),
        names_to = "estimate_name",
        values_to = "estimate_value"
      ) %>%
      dplyr::mutate(estimate_value = round(.data$estimate_value),
                    outcome = omopgenerics::settings(cdm[[outcomeCohortTable]]) %>%
                      dplyr::filter(.data$cohort_definition_id == .env$outcomeCohortId) %>%
                      dplyr::pull("cohort_name"))

    if(!is.null(competingOutcomeCohortTable)) {
      summary <- summary %>%
        dplyr::mutate(competing_outcome = omopgenerics::settings(cdm[[competingOutcomeCohortTable]]) %>%
                        dplyr::filter(.data$cohort_definition_id == .env$competingOutcomeCohortId) %>%
                        dplyr::pull("cohort_name"))
    } else {
      summary <- summary %>%
        dplyr::mutate(competing_outcome = "none")
    }

    # median_survival and its two confidence intervals must be all num or all NA
    summary <- summary %>%
      dplyr::group_by(.data$group_level, .data$variable_level, .data$strata_name, .data$strata_level) %>%
      dplyr::mutate(
        has_na = any(is.na(.data$estimate_value) & .data$estimate_name %in% c("median_survival", "median_survival_95CI_higher", "median_survival_95CI_lower")),
        estimate_value = dplyr::if_else(.data$has_na & .data$estimate_name %in% c("median_survival", "median_survival_95CI_higher", "median_survival_95CI_lower"), NA, .data$estimate_value)
      ) %>%
      dplyr::select(-"has_na") %>%
      dplyr::ungroup()

    # change restrictedMeanSurvival to NA if follow-up for the specific group is shorter than input asked
    if(!is.null(restrictedMeanFollowUp)) {
      summary <- summary %>%
        dplyr::left_join(
          survivalEstimates %>% dplyr::group_by(.data$variable_level,
                                               .data$strata_name,
                                               .data$strata_level) %>%
            dplyr::filter(.data$time == max(.data$time)) %>%
            dplyr::select("variable_level",
                          "strata_name",
                          "strata_level",
                          "time") %>%
            dplyr::distinct() %>%
            dplyr::filter(.data$time < .env$restrictedMeanFollowUp) %>%
            dplyr::mutate(to_na = 1),
          by = c("variable_level", "strata_name", "strata_level")) %>%
        dplyr::mutate(estimate_value = dplyr::if_else(
          .data$estimate_name %in% c("restricted_mean_survival", "restricted_mean_survival_se") &
            !is.na(.data$to_na),
          NA_real_,
          .data$estimate_value
        )) %>%
        dplyr::select(-"to_na")
    }

    attr(survivalEstimates, "summary") <- summary

    # round estimates
    survivalEstimates <- survivalEstimates %>%
      dplyr::mutate(estimate_value = round(.data$estimate_value, 4))

  } else {
    survivalEstimates <- surv
  }

  return(survivalEstimates)
}

addCompetingRiskVars <- function(data, time1, status1, time2, status2,
                                 nameOutTime, nameOutStatus) {
  # - add competing risk variables (time and status)
  # 0: no event, 1: event 1, 2: event 2
  data %>%
    dplyr::mutate(
      !!nameOutTime := pmin(.data[[time1]], .data[[time2]]),
      !!nameOutStatus := factor(dplyr::if_else(.data[[time2]] <= .data[[time1]], 2 * .data[[status2]], .data[[status1]]))
    )
}

singleEventSurvival <- function(survData, times, variables, eventGap,
                                restrictedMeanFollowUp = NULL) {
  estimates <- list()
  fitSummary <- list()

  var_columns <- unlist(variables) %>% unique()

  cli::cli_progress_message("Getting overall estimates")
  fit <- survival::survfit(survival::Surv(outcome_time, outcome_status) ~ 1,
                           data = survData
  )

  # Calculate quantiles
  q0 <- stats::quantile(fit, probs = 0)
  q25 <- stats::quantile(fit, probs = 0.25)
  q75 <- stats::quantile(fit, probs = 0.75)
  q100 <- stats::quantile(fit, probs = 1)

  # Create summary with selected and renamed columns
  fitSummary[[1]] <- as.data.frame(t(summary(fit, rmean = restrictedMeanFollowUp)$table)) %>%
    dplyr::select(!dplyr::any_of(c("n.max", "n.start"))) %>%
    dplyr::rename(
      "number_records" = "records",
      "n_events" = "events",
      "median_survival" = "median",
      "median_survival_95CI_lower" = "0.95LCL",
      "median_survival_95CI_higher" = "0.95UCL",
      "restricted_mean_survival" = "rmean",
      "restricted_mean_survival_se" = "se(rmean)"
    ) %>%
    dplyr::mutate(
      q0_survival = .env$q0$quantile,
      q0_survival_95CI_lower = .env$q0$lower,
      q0_survival_95CI_higher = .env$q0$upper,
      q25_survival = .env$q25$quantile,
      q25_survival_95CI_lower = .env$q25$lower,
      q25_survival_95CI_higher = .env$q25$upper,
      q75_survival = .env$q75$quantile,
      q75_survival_95CI_lower = .env$q75$lower,
      q75_survival_95CI_higher = .env$q75$upper,
      q100_survival = .env$q100$quantile,
      q100_survival_95CI_lower = .env$q100$lower,
      q100_survival_95CI_higher = .env$q100$upper,
      analysis_type = "single event",
      strata_name = "overall",
      strata_level = "overall",
      outcome = "outcome"
    )

  summ <- summary(fit, times = times, extend = TRUE)
  estimates[[1]] <- dplyr::tibble(
      outcome = "outcome",
      time = summ$time,
      n_event = summ$n.event,
      n_censor = summ$n.censor,
      n_risk = summ$n.risk,
      estimate_type = "numeric",
      estimate = summ$surv,
      estimate_95CI_lower = summ$lower,
      estimate_95CI_upper = summ$upper,
      analysis_type = "single_event",
      strata_name = "overall",
      strata_level = "overall"
    )

  # Add strata estimates if required
  if (!is.null(variables)) {
    cli::cli_progress_bar(
      total = length(variables),
      format = " -- Getting estimates for {cli::pb_bar} {cli::pb_current} of {cli::pb_total} strata"
    )
    for (i in seq_along(variables)) {
      cli::cli_progress_update()
      # Get formula for the model
      name <- variables[[i]]
      uniqueVals <- survData %>%
        dplyr::select(.env$name) %>%
        tidyr::unite(col = "vars") %>%
        dplyr::distinct() %>%
        dplyr::pull()

      if(length(uniqueVals) == 1){
        cli::cli_inform("All values of {name} are equal to {uniqueVals}
                        and stratified results will not be returned for
                        this variable")
      } else {
        expr <- stats::as.formula(paste(c(
          "survival::Surv(outcome_time, outcome_status) ~ 1",
          name
        ), collapse = " + "))
        fit <- survival::survfit(expr, data = survData)
        tidyFit <- broom::tidy(fit)

        format_surv_strata_quantile <- function(x) {
          data.frame(x) %>%
            tibble::rownames_to_column(var = "strata")
        }
        q0 <- stats::quantile(fit, probs = 0)
        q0 <- lapply(q0, format_surv_strata_quantile)
        q25 <- stats::quantile(fit, probs = 0.25)
        q25 <- lapply(q25, format_surv_strata_quantile)
        q75 <- stats::quantile(fit, probs = 0.75)
        q75 <- lapply(q75, format_surv_strata_quantile)
        q100 <- stats::quantile(fit, probs = 1)
        q100 <- lapply(q100, format_surv_strata_quantile)

        fitSummary[[i + 1]] <- as.data.frame(summary(fit, rmean = restrictedMeanFollowUp)$table) %>%
          dplyr::select(!dplyr::any_of(c("n.max", "n.start"))) %>%
          dplyr::rename(
            "number_records" = "records",
            "n_events" = "events",
            "median_survival" = "median",
            "median_survival_95CI_lower" = "0.95LCL",
            "median_survival_95CI_higher" = "0.95UCL",
            "restricted_mean_survival" = "rmean",
            "restricted_mean_survival_se" = "se(rmean)"
          ) %>%
          tibble::rownames_to_column(var = "strata")

        fitSummary[[i + 1]] <- fitSummary[[i + 1]] %>%
          dplyr::left_join(q0$quantile, by = "strata") %>%
          dplyr::rename("q0_survival" = "X0") %>%
          dplyr::left_join(q0$lower, by = "strata") %>%
          dplyr::rename("q0_survival_95CI_lower" = "X0") %>%
          dplyr::left_join(q0$upper, by = "strata") %>%
          dplyr::rename("q0_survival_95CI_higher" = "X0") %>%
          dplyr::left_join(q25$quantile, by = "strata") %>%
          dplyr::rename("q25_survival" = "X25") %>%
          dplyr::left_join(q25$lower, by = "strata") %>%
          dplyr::rename("q25_survival_95CI_lower" = "X25") %>%
          dplyr::left_join(q25$upper, by = "strata") %>%
          dplyr::rename("q25_survival_95CI_higher" = "X25") %>%
          dplyr::left_join(q75$quantile, by = "strata") %>%
          dplyr::rename("q75_survival" = "X75") %>%
          dplyr::left_join(q75$lower, by = "strata") %>%
          dplyr::rename("q75_survival_95CI_lower" = "X75") %>%
          dplyr::left_join(q75$upper, by = "strata") %>%
          dplyr::rename("q75_survival_95CI_higher" = "X75") %>%
          dplyr::left_join(q100$quantile, by = "strata") %>%
          dplyr::rename("q100_survival" = "X100") %>%
          dplyr::left_join(q100$lower, by = "strata") %>%
          dplyr::rename("q100_survival_95CI_lower" = "X100") %>%
          dplyr::left_join(q100$upper, by = "strata") %>%
          dplyr::rename("q100_survival_95CI_higher" = "X100")

        fitSummary[[i + 1]] <- fitSummary[[i + 1]] %>%
          dplyr::left_join(
            tidyFit %>%
              dplyr::group_by(.data$strata) %>%
              dplyr::summarise(max_time = max(.data$time, na.rm = TRUE)),
            by = "strata"
          ) %>%
          dplyr::mutate(
            analysis_type = "single event",
            outcome = "outcome"
          )

        summ <- summary(fit, times = times, extend = TRUE)
        estimates[[i + 1]] <- dplyr::tibble(
            strata = summ$strata,
            outcome = "outcome",
            time = summ$time,
            n_event = summ$n.event,
            n_censor = summ$n.censor,
            n_risk = summ$n.risk,
            estimate_type = "numeric",
            estimate = summ$surv,
            estimate_95CI_lower = summ$lower,
            estimate_95CI_upper = summ$upper,
            analysis_type = "single_event")

        # Add strata variable columns in a good format
        for (j in seq_along(name)) {
          name_w <- name
          estimates[[i + 1]] <- estimates[[i + 1]] %>%
            dplyr::mutate(
              strata_name = paste(name_w, collapse = " &&& "),
              strata_level = rep(gsub(", ", " &&& ", gsub(
                paste(paste0(name_w, "="),
                      collapse = "|"
                ), "",
                summ$strata
              )), 1)
            )

          fitSummary[[i + 1]] <- fitSummary[[i + 1]] %>%
            dplyr::mutate(
              strata_name = paste(name_w, collapse = " &&& "),
              strata_level = gsub(", ", " &&& ", gsub(
                paste(paste0(name_w, "="),
                      collapse = "|"
                ), "",
                fitSummary[[i + 1]]$strata
              ))
            )
        }

        # keep estimated probabilities only up to the end of follow up for that group
        maxTimePoints <- fitSummary[[i + 1]] %>%
          dplyr::select("strata", "max_time")

        estimates[[i + 1]] <- estimates[[i + 1]] %>%
          dplyr::left_join(maxTimePoints,
                           by = "strata"
          ) %>%
          dplyr::filter(.data$time <= .data$max_time) %>%
          dplyr::select(!c("max_time", "strata"))

        fitSummary[[i + 1]] <- fitSummary[[i + 1]] %>%
          dplyr::select(!c("max_time", "strata"))
      }
    }
    cli::cli_progress_done()
  }

  estimates <- dplyr::bind_rows(estimates) %>%
    dplyr::mutate(
      strata_level = stringr::str_trim(.data$strata_level)
    )

  # Helper function to calculate events for a given eventGap
  calculate_events <- function(data, gap) {
    data %>%
      dplyr::filter(.data$estimate_type == "numeric") %>%
      dplyr::group_by(.data$strata_name, .data$strata_level) %>%
      dplyr::mutate(
        n_events = cumsum(.data$n_event),
        n_censor = cumsum(.data$n_censor)
      ) %>%
      dplyr::filter(.data$time %% gap == 0 | .data$time == max(.data$time)) %>%
      dplyr::mutate(
        n_events = c(.data$n_events[1], diff(.data$n_events)),
        n_censor = c(.data$n_censor[1], diff(.data$n_censor)),
        eventGap = gap,
        outcome = "outcome"
      ) %>%
      dplyr::ungroup() %>%
      dplyr::select("time", "n_risk", "n_events", "n_censor", "eventGap", "outcome", "strata_name", "strata_level")
  }

  # Calculate number of events for all eventGaps
  number_events <- purrr::map_dfr(eventGap, ~calculate_events(estimates, .x))

  estimates <- estimates %>%
    dplyr::select(!"n_event")

  attr(estimates, "events") <- number_events
  attr(estimates, "summary") <- dplyr::as_tibble(dplyr::bind_rows(fitSummary))

  return(estimates)
}

competingRiskSurvival <- function(survData, times, variables, eventGap,
                                  restrictedMeanFollowUp = NULL) {
  if (!length(unique(as.character(survData %>%
                                  dplyr::pull("outcome_or_competing_status")))) == 3) {
    cli::cli_h1("No results for competing risk analysis")
    cli::cli_text(c(
      "Competing risk variable must have three levels.",
      "Do you have at least 1 individual for:"
    ))
    cli::cli_li("1) censored without event,")
    cli::cli_li("2) censored at outcome event of intest, and")
    cli::cli_li("3) censored at outcome competing event?")

    return(empty_estimates())
  }

  estimates <- list()
  fitSummary <- list()

  var_columns <- unlist(variables) %>% unique()

  cli::cli_progress_message("Getting overall estimates")
  fit <- survival::survfit(
    formula = survival::Surv(
      outcome_or_competing_time,
      outcome_or_competing_status
    ) ~ 1,
    data = survData
  )
  summ <- summary(fit, times = times, extend = TRUE, data.frame = TRUE)

  if(summ %>% dplyr::select("state") %>% dplyr::distinct() %>% dplyr::pull() %>% length() < 3){
    cli::cli_h1("No results for competing risk analysis")
    cli::cli_text(c(
      "Competing risk variable must have three levels.",
      "Do you have at least 1 individual for:"
    ))
    cli::cli_li("1) censored without event,")
    cli::cli_li("2) censored at outcome event of intest, and")
    cli::cli_li("3) censored at outcome competing event?")

    return(empty_estimates())
  }

  fitSummary[[1]] <- as.data.frame(summary(fit, rmean = restrictedMeanFollowUp)$table) %>%
    dplyr::rename(
      "number_records" = "n",
      "n_events" = "nevent",
      "restricted_mean_survival" = "rmean"
    ) %>%
    dplyr::mutate(analysis_type = "competing_risk") %>%
    dplyr::mutate(
      strata_name = "overall",
      strata_level = "overall"
    ) %>%
    tibble::rownames_to_column(var = "outcome") %>%
    dplyr::mutate(outcome = dplyr::if_else(.data$outcome == "(s0)", "none",
                                           dplyr::if_else(.data$outcome == "1",
                                                          "outcome", "competing_outcome"
                                           )
    ))

  estimates[[1]] <- summ %>%
    dplyr::rename(
      "estimate" = "pstate",
      "estimate_95CI_lower" = "lower",
      "estimate_95CI_upper" = "upper",
      "outcome" = "state",
      "n_risk" = "n.risk",
      "n_event" = "n.event",
      "n_censor" = "n.censor"
    ) %>%
    dplyr::mutate(outcome = dplyr::if_else(.data$outcome == 1,
                                           "outcome", dplyr::if_else(.data$outcome == 2,
                                                                     "competing_outcome", "none"))
    ) %>%
    dplyr::mutate(analysis_type = "competing_risk",
                  estimate_type = "numeric") %>%
    dplyr::mutate(
      strata_name = "overall",
      strata_level = "overall"
    )

  # Add strata estimates if required
  if (!is.null(variables)) {
    cli::cli_progress_bar(
      total = length(variables),
      format = " -- Getting estimates for {cli::pb_bar} {cli::pb_current} of {cli::pb_total} strata"
    )
    for (i in seq_along(variables)) {
      cli::cli_progress_update()
      # Get formula for the model
      name <- variables[[i]]
      uniqueVals <- survData %>%
        dplyr::select(.env$name) %>%
        tidyr::unite(col = "vars") %>%
        dplyr::distinct() %>%
        dplyr::pull()
      if(length(uniqueVals) == 1){
        cli::cli_inform("All values of {name} are equal to {uniqueVals}
                        and stratified results will not be returned for
                        this variable")
      } else {
        expr <- stats::as.formula(paste(c(
          "survival::Surv(outcome_or_competing_time, outcome_or_competing_status) ~ 1",
          name
        ), collapse = " + "))
        fit <- survival::survfit(
          formula = expr,
          data = survData %>%
            dplyr::filter(dplyr::if_any(.env$name, ~ !is.na(.x)))
        )
        summ <- summary(fit, times = times, extend = TRUE, data.frame = TRUE)
        tidyFit <- broom::tidy(fit)

        maxTimes <- tidyFit %>%
          dplyr::group_by(.data$strata, .data$state) %>%
          dplyr::summarise(max_time = max(.data$time, na.rm = TRUE))

        fitSummary[[i + 1]] <- as.data.frame(summary(fit, rmean = restrictedMeanFollowUp)$table) %>%
          dplyr::rename(
            "number_records" = "n",
            "n_events" = "nevent",
            "restricted_mean_survival" = "rmean"
          ) %>%
          dplyr::mutate(analysis_type = "competing_risk") %>%
          tibble::rownames_to_column(var = "rowname") %>%
          tidyr::separate_wider_delim("rowname",
                                      delim = ", ",
                                      names = c(variables[[i]], "state")) %>%
          tidyr::unite(col = "strata", variables[[i]], sep = ", ") %>%
          dplyr::left_join(maxTimes,
                           by = c("strata", "state")) %>%
          dplyr::rename("outcome" = "state") %>%
          dplyr::mutate(
            strata_name = paste(name, collapse = " &&& "),
            strata_level = gsub(", ", " &&& ", gsub(
              paste(paste0(name, "="),
                    collapse = "|"
              ), "",
              .data$strata
            ))
          )  %>%
          dplyr::mutate(
            strata_level = gsub("and ([^and ]*)$", "", .data$strata_level)
          ) %>%
          dplyr::mutate(strata_level = gsub("[[:space:]]*$", "", .data$strata_level)) %>%
          dplyr::mutate(outcome = dplyr::if_else(.data$outcome == "(s0)", "none",
                                                 dplyr::if_else(.data$outcome == "1",
                                                                "outcome", "competing_outcome"
                                                 )
          )) %>%
          dplyr::mutate(
            strata = sub(",([^,]*)$", "", .data$strata)
          )

        estimates[[i + 1]] <-  summ %>%
          dplyr::rename(
            "estimate" = "pstate",
            "estimate_95CI_lower" = "lower",
            "estimate_95CI_upper" = "upper",
            "outcome" = "state",
            "n_risk" = "n.risk",
            "n_event" = "n.event",
            "n_censor" = "n.censor"
          ) %>%
          dplyr::mutate(outcome = dplyr::if_else(.data$outcome == 1,
                                                 "outcome", dplyr::if_else(.data$outcome == 2,
                                                                           "competing_outcome", "none")),
                        analysis_type = "competing_risk",
                        estimate_type = "numeric",
                        strata_name = paste(name, collapse = " &&& "),
                        strata_level = .data$strata) # to use in below join

        for (j in seq_along(name)) {
          estimates[[i + 1]] <- estimates[[i + 1]] %>%
            dplyr::mutate(strata_level = stringr::str_replace(
              string = .data$strata_level,
              pattern = paste0(name[j], "="), replacement = ""
            )) %>%
            dplyr::mutate(strata_level = stringr::str_replace(
              string = .data$strata_level,
              pattern = ",",
              replacement = " &&&"
            ))
        }

        # keep estimated probabilities only up to the end of follow up for that group
        maxTimePoints <- fitSummary[[i + 1]] %>%
          dplyr::select("strata_level", "outcome", "max_time")

        estimates[[i + 1]] <- estimates[[i + 1]] %>%
          dplyr::left_join(maxTimePoints,
                           by = c("strata_level", "outcome")
          ) %>%
          dplyr::filter(.data$time <= .data$max_time) %>%
          dplyr::select(!c("max_time", "strata"))

        fitSummary[[i + 1]] <- fitSummary[[i + 1]] %>%
          dplyr::select(!c("max_time", "strata"))
      }}
    cli::cli_progress_done()
  }

  # Output as tibble
  estimates <- dplyr::bind_rows(estimates) %>%
    dplyr::as_tibble() %>%
    dplyr::select(-"std.err")

  # Helper function to calculate events for a specific eventGap
  calculate_number_events <- function(data, gap) {
    data %>%
      dplyr::select("time", "n_event", "outcome", "strata_name", "strata_level") %>%
      dplyr::filter(.data$outcome != "none") %>%
      dplyr::left_join(
        data %>%
          dplyr::filter(.data$outcome == "none") %>%
          dplyr::select("time", "n_risk", "n_censor", "strata_name", "strata_level"),
        by = c("time", "strata_name", "strata_level")
      ) %>%
      dplyr::mutate(eventGap = gap) %>%
      dplyr::group_by(.data$strata_name, .data$strata_level, .data$outcome) %>%
      dplyr::mutate(
        n_events = cumsum(.data$n_event),
        n_censor = cumsum(.data$n_censor)
      ) %>%
      dplyr::filter(.data$time %% gap == 0 | .data$time == max(.data$time)) %>%
      dplyr::mutate(
        n_events = c(.data$n_events[1], diff(.data$n_events)),
        n_censor = c(.data$n_censor[1], diff(.data$n_censor))
      ) %>%
      dplyr::ungroup() %>%
      dplyr::select(-"n_event")
  }

  # Apply the function for all eventGaps and combine results
  number_events <- purrr::map_dfr(eventGap, ~calculate_number_events(estimates, .x))

  estimates <- estimates %>%
    dplyr::select(-c("n_event")) %>%
    dplyr::filter(.data$outcome != "none")

  attr(estimates, "events") <- number_events
  attr(estimates, "summary") <- dplyr::as_tibble(dplyr::bind_rows(fitSummary))

  return(estimates)
}

addCohortDetails <- function(x,
                             cdm,
                             targetCohortId,
                             targetCohortTable,
                             outcomeCohortId,
                             outcomeCohortTable,
                             competingOutcomeCohortId,
                             competingOutcomeCohortTable = NULL,
                             resultType) {
  outcomeCohortName <- omopgenerics::settings(cdm[[outcomeCohortTable]]) %>%
    dplyr::filter(.data$cohort_definition_id ==
                    .env$outcomeCohortId) %>%
    dplyr::pull("cohort_name")

  x <- x %>%
    dplyr::mutate(
      cdm_name = attr(cdm, "cdm_name"),
      package_name = "CohortSurvival",
      package_version = as.character(utils::packageVersion("CohortSurvival")),
      result_type = .env$resultType,
      group_name = "target_cohort",
      group_level =
        omopgenerics::settings(cdm[[targetCohortTable]]) %>%
        dplyr::filter(.data$cohort_definition_id ==
                        .env$targetCohortId) %>%
        dplyr::pull("cohort_name"),
      variable_type = NA,
    )

  if (!is.null(competingOutcomeCohortTable)) {
    competingOutcomeCohortName <- omopgenerics::settings(cdm[[competingOutcomeCohortTable]]) %>%
      dplyr::filter(.data$cohort_definition_id ==
                      .env$competingOutcomeCohortId) %>%
      dplyr::pull("cohort_name")

    if (competingOutcomeCohortName == outcomeCohortName) {
      competingOutcomeCohortName <- paste0(competingOutcomeCohortName, "_competing_outcome")
    }
    x <- x %>%
      dplyr::mutate(outcome = dplyr::if_else(
        .data$outcome == "outcome", outcomeCohortName, competingOutcomeCohortName
      ))
    x <- x %>%
      dplyr::mutate(analysis_type = "competing_risk")
  } else {
    x <- x %>%
      dplyr::mutate(outcome = dplyr::if_else(
        .data$outcome == "outcome", outcomeCohortName, "no_competing_outcome"
      ))
    x <- x %>%
      dplyr::mutate(analysis_type = "single_event")
  }

  return(x)
}

empty_estimates <- function() {
  dplyr::tibble()
}

validateInputSurvival <- function(cdm,
                                  targetCohortTable,
                                  targetCohortId,
                                  outcomeCohortTable,
                                  outcomeCohortId,
                                  outcomeDateVariable,
                                  outcomeWashout,
                                  competingOutcomeCohortTable,
                                  competingOutcomeCohortId,
                                  competingOutcomeDateVariable,
                                  competingOutcomeWashout,
                                  censorOnCohortExit,
                                  censorOnDate,
                                  followUpDays,
                                  strata,
                                  eventGap,
                                  estimateGap,
                                  restrictedMeanFollowUp,
                                  minimumSurvivalDays) {

  omopgenerics::assertCharacter(targetCohortTable, length = 1)
  omopgenerics::assertCharacter(outcomeCohortTable, length = 1)
  omopgenerics::assertCharacter(competingOutcomeCohortTable, length = 1, null = TRUE)
  omopgenerics::validateCdmArgument(cdm)
  omopgenerics::validateCohortArgument(cdm[[targetCohortTable]])
  omopgenerics::validateCohortArgument(cdm[[outcomeCohortTable]])
  omopgenerics::assertNumeric(targetCohortId, integerish = TRUE, length = 1, min = 1)
  omopgenerics::assertList(strata, null = TRUE)
  omopgenerics::assertChoice(unlist(strata), choices = colnames(cdm[[targetCohortTable]]), null = TRUE)
  omopgenerics::assertNumeric(outcomeCohortId, length = 1, min = 1)
  omopgenerics::assertNumeric(competingOutcomeCohortId, length = 1, min = 1)
  omopgenerics::assertCharacter(outcomeDateVariable, length = 1)
  omopgenerics::assertCharacter(competingOutcomeDateVariable, length = 1)
  omopgenerics::assertLogical(censorOnCohortExit, length = 1)
  omopgenerics::assertDate(censorOnDate, null = TRUE)
  omopgenerics::assertNumeric(followUpDays, length = 1, min = 1, integerish = TRUE)
  omopgenerics::assertNumeric(outcomeWashout, length = 1, min = 0, integerish = TRUE)
  omopgenerics::assertNumeric(competingOutcomeWashout, length = 1, min = 0, integerish = TRUE)
  omopgenerics::assertNumeric(eventGap, integerish = TRUE, min = 1)
  omopgenerics::assertNumeric(estimateGap, integerish = TRUE, min = 1)
  omopgenerics::assertNumeric(restrictedMeanFollowUp, integerish = TRUE, min = 1, length = 1, null = TRUE)
  omopgenerics::assertNumeric(minimumSurvivalDays, integerish = TRUE, min = 0, length = 1)

}

# Helper function to get cohort id
getCohortId <- function(cohortTable, cdm) {
  omopgenerics::assertTable(cdm[[cohortTable]])
  cohortId <- omopgenerics::cohortCount(cdm[[cohortTable]]) %>%
    dplyr::filter(.data$number_records > 0) %>%
    dplyr::pull("cohort_definition_id")
  return(cohortId)
}
Any scripts or data that you put into this service are public.
CohortSurvival documentation built on June 16, 2025, 5:10 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
CohortSurvival
Estimate Survival from Common Data Model Cohorts

R/estimateSurvival.R
In CohortSurvival: Estimate Survival from Common Data Model Cohorts

Defines functions getCohortId validateInputSurvival empty_estimates addCohortDetails competingRiskSurvival singleEventSurvival addCompetingRiskVars estimateSurvival estimateCompetingRiskSurvival estimateSingleEventSurvival

Documented in estimateCompetingRiskSurvival estimateSingleEventSurvival

Try the CohortSurvival package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CohortSurvival Estimate Survival from Common Data Model Cohorts

R/estimateSurvival.R In CohortSurvival: Estimate Survival from Common Data Model Cohorts

Defines functions getCohortId validateInputSurvival empty_estimates addCohortDetails competingRiskSurvival singleEventSurvival addCompetingRiskVars estimateSurvival estimateCompetingRiskSurvival estimateSingleEventSurvival

Documented in estimateCompetingRiskSurvival estimateSingleEventSurvival

Try the CohortSurvival package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CohortSurvival
Estimate Survival from Common Data Model Cohorts

R/estimateSurvival.R
In CohortSurvival: Estimate Survival from Common Data Model Cohorts