# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortMethod
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#' Create simulation profile
#'
#' @description
#' Creates a profile based on the provided
#' [CohortMethodData] object, which can be used to generate simulated data that has similar
#' characteristics.
#'
#' @template CohortMethodData
#'
#' @details
#' The output of this function is an object that can be used by the [simulateCohortMethodData()]
#' function to generate a cohortMethodData object.
#'
#' @return
#' An object of type `CohortDataSimulationProfile`.
#'
#' @export
createCohortMethodDataSimulationProfile <- function(cohortMethodData) {
errorMessages <- checkmate::makeAssertCollection()
checkmate::assertClass(cohortMethodData, "CohortMethodData", add = errorMessages)
checkmate::reportAssertions(collection = errorMessages)
if (nrow_temp(cohortMethodData$cohorts) == 0) {
stop("Cohorts are empty")
}
if (nrow_temp(cohortMethodData$covariates) == 0) {
stop("Covariates are empty")
}
if (sum(cohortMethodData$cohorts %>% select("daysToCohortEnd") %>% pull()) == 0) {
warning("Cohort data appears to be limited, check daysToCohortEnd which appears to be all zeros")
}
message("Computing covariate prevalence")
# (Note: currently limiting to binary covariates)
populationSize <- nrow(cohortMethodData$cohorts)
covariatePrevalence <- cohortMethodData$covariates %>%
group_by(.data$covariateId) %>%
summarise(sum = sum(.data$covariateValue, na.rm = TRUE)) %>%
mutate(prevalence = .data$sum / populationSize) %>%
ungroup() %>%
inner_join(cohortMethodData$covariateRef, by = "covariateId") %>%
inner_join(cohortMethodData$analysisRef, by = "analysisId") %>%
filter(.data$isBinary == "Y") %>%
select("covariateId", "prevalence") %>%
collect()
message("Computing propensity model")
propensityScore <- createPs(cohortMethodData,
maxCohortSizeForFitting = 25000,
prior = Cyclops::createPrior("laplace", 0.1, exclude = 0)
)
propensityModel <- attr(propensityScore, "metaData")$psModelCoef
message("Fitting outcome model(s)")
outcomeIds <- attr(cohortMethodData, "metaData")$outcomeIds
outcomeModels <- vector("list", length(outcomeIds))
for (i in 1:length(outcomeIds)) {
outcomeId <- outcomeIds[i]
studyPop <- createStudyPopulation(
cohortMethodData = cohortMethodData,
population = propensityScore,
minDaysAtRisk = 1,
removeSubjectsWithPriorOutcome = FALSE,
outcomeId = outcomeId
)
studyPop <- matchOnPs(studyPop, caliper = 0.25, caliperScale = "standardized", maxRatio = 1)
outcomeModel <- fitOutcomeModel(
population = studyPop,
cohortMethodData = cohortMethodData,
modelType = "poisson",
stratified = FALSE,
useCovariates = TRUE,
prior = Cyclops::createPrior("laplace", 0.1, exclude = 0)
)
outcomeModels[[i]] <- outcomeModel$outcomeModelCoefficients[outcomeModel$outcomeModelCoefficients != 0]
}
message("Computing rates of prior outcomes")
totalTime <- cohortMethodData$cohorts %>%
summarise(time = sum(.data$daysFromObsStart, na.rm = TRUE)) %>%
pull()
preIndexOutcomeRates <- cohortMethodData$outcomes %>%
filter(.data$daysToEvent < 0) %>%
group_by(.data$outcomeId) %>%
summarise(n = n_distinct(.data$rowId)) %>%
mutate(rate = .data$n / totalTime) %>%
select("outcomeId", "rate") %>%
collect()
message("Fitting models for time to observation period start, end and time to cohort end")
cohorts <- cohortMethodData$cohorts %>%
collect()
obsEnd <- cohorts$daysToObsEnd
cohortEnd <- cohorts$daysToCohortEnd
event <- as.integer(cohortEnd < obsEnd)
time <- cohortEnd
time[cohortEnd > obsEnd] <- obsEnd[cohortEnd > obsEnd]
data <- tibble(time = time, event = event)
data <- data[data$time > 0, ]
fitCohortEnd <- survival::survreg(survival::Surv(
time,
event
) ~ 1, data = data, dist = "exponential")
fitObsEnd <- survival::survreg(survival::Surv(obsEnd[obsEnd > 0]) ~ 1, dist = "exponential")
data <- cohorts
minObsTime <- min(data$daysFromObsStart)
data$time <- data$daysFromObsStart - minObsTime + 1
fitObsStart <- survival::survreg(survival::Surv(time) ~ 1, data = data, dist = "exponential")
result <- list(
covariatePrevalence = covariatePrevalence,
propensityModel = propensityModel,
outcomeModels = outcomeModels,
preIndexOutcomeRates = preIndexOutcomeRates,
metaData = attr(cohortMethodData, "metaData"),
covariateRef = collect(cohortMethodData$covariateRef),
analysisRef = collect(cohortMethodData$analysisRef),
cohortEndRate = 1 / exp(coef(fitCohortEnd)),
obsStartRate = 1 / exp(coef(fitObsStart)),
minObsTime = minObsTime,
obsEndRate = 1 / exp(coef(fitObsEnd))
)
class(result) <- "CohortDataSimulationProfile"
return(result)
}
#' Generate simulated data
#'
#' @description
#' Creates a [CohortMethodData] object with simulated data.
#'
#' @param profile An object of type `CohortMethodDataSimulationProfile` as generated using the
#' [createCohortMethodDataSimulationProfile()] function.
#' @param n The size of the population to be generated.
#'
#' @details
#' This function generates simulated data that is in many ways similar to the original data on which
#' the simulation profile is based. The contains same outcome, comparator, and outcome concept IDs,
#' and the covariates and their 1st order statistics should be comparable.
#'
#' @return
#' An object of type [CohortMethodData].
#'
#' @export
simulateCohortMethodData <- function(profile, n = 10000) {
errorMessages <- checkmate::makeAssertCollection()
checkmate::assertClass(profile, "CohortDataSimulationProfile", add = errorMessages)
checkmate::assertInt(n, lower = 1, add = errorMessages)
checkmate::reportAssertions(collection = errorMessages)
message("Generating covariates")
# Treatment variable is generated elsewhere:
covariatePrevalence <- profile$covariatePrevalence[names(profile$covariatePrevalence) != "1"]
personsPerCov <- rpois(n = nrow(covariatePrevalence), lambda = covariatePrevalence$prevalence * n)
personsPerCov[personsPerCov > n] <- n
rowId <- sapply(personsPerCov, function(x, n) sample.int(size = x, n), n = n)
rowId <- do.call("c", rowId)
covariateIds <- covariatePrevalence$covariateId
covariateId <- sapply(1:length(personsPerCov),
function(x, personsPerCov, covariateIds) {
rep(
covariateIds[x],
personsPerCov[x]
)
},
personsPerCov = personsPerCov,
covariateIds = covariateIds
)
covariateId <- do.call("c", covariateId)
covariateValue <- rep(1, length(covariateId))
covariates <- tibble(
rowId = rowId,
covariateId = covariateId,
covariateValue = covariateValue
)
message("Generating treatment variable")
betas <- profile$propensityModel
intercept <- betas[1]
betas <- betas[2:length(betas)]
betas <- tibble(
beta = as.numeric(betas),
covariateId = as.numeric(names(betas))
)
treatmentVar <- covariates %>%
inner_join(betas, by = "covariateId") %>%
mutate(value = .data$covariateValue * .data$beta) %>%
group_by(.data$rowId) %>%
summarise(value = sum(.data$value) + intercept)
link <- function(x) {
return(1 / (1 + exp(-x)))
}
treatmentVar$value <- link(treatmentVar$value)
treatmentVar$rand <- runif(nrow(treatmentVar))
treatmentVar$covariateValue <- as.integer(treatmentVar$rand < treatmentVar$value)
treatmentVar <- treatmentVar[, c("rowId", "covariateValue")]
treatmentVar$covariateId <- 1
message("Generating cohorts")
cohorts <- tibble(
rowId = treatmentVar$rowId,
treatment = treatmentVar$covariateValue,
personId = treatmentVar$rowId,
personSeqId = treatmentVar$rowId,
cohortStartDate = as.Date("2000-01-01"),
daysFromObsStart = profile$minObsTime + round(rexp(
n,
profile$obsStartRate
)) - 1,
daysToCohortEnd = round(rexp(n, profile$obsEndRate)),
daysToObsEnd = round(rexp(n, profile$cohortEndRate))
)
message("Generating outcomes after index date")
allOutcomes <- tibble()
for (i in 1:length(profile$metaData$outcomeIds)) {
betas <- profile$outcomeModels[[i]]
intercept <- betas[1]
betas <- betas[2:length(betas)]
betas <- tibble(beta = as.numeric(betas), covariateId = as.numeric(names(betas)))
temp <- merge(covariates, betas)
temp$value <- temp$covariateValue * temp$beta # Currently pointless, since covariateValue is always 1
temp <- aggregate(value ~ rowId, data = temp, sum)
temp$value <- temp$value + intercept
temp$value <- exp(temp$value) # Value is now the rate
temp <- merge(temp, cohorts[, c("rowId", "daysToObsEnd")])
temp$value <- temp$value * temp$daysToObsEnd # Value is lambda
temp$nOutcomes <- rpois(n, temp$value)
temp$nOutcomes[temp$nOutcomes > temp$daysToObsEnd] <- temp$daysToObsEnd[temp$nOutcomes > temp$daysToObsEnd]
outcomeRows <- sum(temp$nOutcomes)
outcomes <- tibble(
rowId = rep(0, outcomeRows),
outcomeId = rep(profile$metaData$outcomeIds[i], outcomeRows),
daysToEvent = rep(0, outcomeRows)
)
cursor <- 1
for (j in 1:nrow(temp)) {
nOutcomes <- temp$nOutcomes[j]
if (nOutcomes != 0) {
outcomes$rowId[cursor:(cursor + nOutcomes - 1)] <- temp$rowId[j]
outcomes$daysToEvent[cursor:(cursor + nOutcomes - 1)] <- sample.int(
size = nOutcomes,
temp$daysToObsEnd[j]
)
cursor <- cursor + nOutcomes
}
}
allOutcomes <- rbind(allOutcomes, outcomes)
}
message("Generating outcomes before index date")
for (i in 1:length(profile$metaData$outcomeIds)) {
outcomeId <- profile$metaData$outcomeIds[i]
rate <- profile$preIndexOutcomeRates$rate[profile$preIndexOutcomeRates$outcomeId == outcomeId]
nOutcomes <- rpois(nrow(cohorts), rate * cohorts$daysFromObsStart)
nOutcomes[nOutcomes > cohorts$daysFromObsStart] <- cohorts$daysFromObsStart[nOutcomes > cohorts$daysFromObsStart]
outcomeRows <- sum(nOutcomes)
outcomes <- tibble(
rowId = rep(0, outcomeRows),
outcomeId = rep(outcomeId, outcomeRows),
daysToEvent = rep(0, outcomeRows)
)
cursor <- 1
for (j in 1:length(nOutcomes)) {
if (nOutcomes[j] != 0) {
outcomes$rowId[cursor:(cursor + nOutcomes[j] - 1)] <- cohorts$rowId[j]
outcomes$daysToEvent[cursor:(cursor + nOutcomes[j] - 1)] <- -sample.int(
size = nOutcomes[j],
cohorts$daysFromObsStart[j]
)
cursor <- cursor + nOutcomes[j]
}
}
allOutcomes <- rbind(allOutcomes, outcomes)
}
result <- Andromeda::andromeda(
outcomes = allOutcomes,
cohorts = cohorts,
covariates = covariates,
covariateRef = profile$covariateRef,
analysisRef = profile$analysisRef
)
metaData <- profile$metaData
metaData$populationSize <- n
attr(result, "metaData") <- metaData
class(result) <- "CohortMethodData"
attr(class(result), "package") <- "CohortMethod"
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.