##
## intsurv: Integrative Survival Models
## Copyright (C) 2017-2021 Wenjie Wang <wang@wwenjie.org>
##
## This file is part of the R package intsurv.
##
## The R package intsurv is free software: You can redistribute it and/or
## modify it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or any later
## version (at your option). See the GNU General Public License at
## <https://www.gnu.org/licenses/> for details.
##
## The R package intsurv is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
##
##' Simulated Survival Data with Uncertain Records
##'
##' Generate survival data with uncertain records. An integrative Cox model can
##' be fitted to the simulated data by the function \code{\link{iCoxph}}.
##'
##' The event times are simulated from a Weibull proportional hazards model
##' with the given shape and baseline scale parameters. The censoring times
##' follow a uniform distribution with the specified boundaries.
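##' More specifically, the baseline hazard function of the true event times is
##' \eqn{h_0(t) = \lambda\rho t^{\rho - 1}}{h0(t) = lambda * rho * t^(rho - 1)},
##' where \eqn{\lambda}{lambda} is the scale parameter and \eqn{\rho}{rho} is
##' the shape parameter.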
##'
##' @param nSubject Number of subjects.
##' @param beta0Vec Time-invariant covariate coefficients.
##' @param xMat Design matrix. By default, three continuous covariates
##' following the standard normal distribution and one binary covariate
##' following a Bernoulli distribution with success probability 0.5 are used.
##' @param maxNum Maximum number of uncertain records per subject.
##' @param nRecordProb A vector of probabilities for the possible numbers of
##' uncertain records (from one to \code{maxNum}) of a matched subject.
##' @param matchCensor The matching rate for subjects actually having censoring
##' times.
##' @param matchEvent The matching rate for subjects actually having event
##' times.
##' @param censorMin The lower boundary of the uniform distribution for
##' generating censoring time.
##' @param censorMax The upper boundary of the uniform distribution for
##' generating censoring time.
##' @param lambda A positive number, scale parameter in baseline rate function
##' for true event times.
##' @param rho A positive number, shape parameter in baseline rate function for
##' true event times.
##' @param fakeLambda1 A positive number, scale parameter in baseline rate
##' function for fake event times from one distribution.
##' @param fakeRho1 A positive number, shape parameter in baseline rate function
##' for fake event times from one distribution.
##' @param fakeLambda2 A positive number, scale parameter in baseline rate
##' function for fake event times from another distribution.
##' @param fakeRho2 A positive number, shape parameter in baseline rate function
##' for fake event times from another distribution.
##' @param mixture The mixture weights, i.e., the probabilities (summing up to
##' one) of fake event times coming from different mixture components.
##' @param randomMiss A logical value specifying whether the labels of the true
##' records are missing completely at random (MCAR) or missing not at random
##' (MNAR). The default value is \code{TRUE} for MCAR.
##' @param eventOnly A logical value specifying whether the uncertain records
##' only include possible events. The default value is \code{FALSE}, which
##' considers the censoring cases as the possible truth in addition to event
##' records.
##' @param ... Other arguments for future usage.
##'
##' @return A data frame with the following columns,
##'
##' \itemize{
##'
##' \item \code{ID}: subject ID
##'
##' \item \code{time}: observed event or censoring times
##'
##' \item \code{event}: event indicators
##'
##' \item \code{isTrue}: latent labels indicating the true records
##'
##' }
##'
##' and the corresponding covariates.
##'
##' @examples
##' ## See examples of function iCoxph
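##'
##' ## a minimal sketch (not from the original package examples): simulate a
##' ## small data set with the default covariates and settings
##' set.seed(123)
##' simDat <- simData4iCoxph(nSubject = 200)
##' head(simDat)
##' ## number of records per subject; subjects with more than one record have
##' ## uncertain records
##' table(table(simDat$ID))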
##' @importFrom stats runif rnorm
##' @export
simData4iCoxph <- function(nSubject = 1e3, beta0Vec, xMat, maxNum = 2,
                           nRecordProb = c(0.9, 0.1),
                           matchCensor = 0.1, matchEvent = 0.1,
                           censorMin = 0.5, censorMax = 12.5,
                           lambda = 0.005, rho = 0.7,
                           fakeLambda1 = lambda * exp(- 3), fakeRho1 = rho,
                           fakeLambda2 = lambda * exp(3), fakeRho2 = rho,
                           mixture = 0.5, randomMiss = TRUE,
                           eventOnly = FALSE, ...)
{
    ## 1. Generate the true data set with a unique event time and a censoring
    ## time per subject.  The censoring rate of the true data set can be
    ## controlled.
    ## 2. Take a sample of subject IDs, where the sample size depends on the
    ## matching rate from the original data to the death database.
    ## 3. Generate fake records for the selected subjects.
    ## 3.1. If the subject's event time is censored, i.e., the truth is the
    ## censoring record, generate the given number of fake event times from
    ## different x's or distribution parameters and only keep the fake events
    ## that happened before the censoring time.  (It is possible that not many
    ## fake events are left, which means that the alternative distribution of
    ## fake events is probably not reasonable for the censoring time picked.)
    ## 3.2. If the subject has an event time recorded, i.e., the truth is the
    ## event time, generate event times from different x's or distribution
    ## parameters.  Take them and the censoring time picked as all the fake
    ## records.  Similarly, only keep event times before the censoring time.

    ## nonsense, just to suppress complaints from R CMD check
    ID <- NULL
    ## set design matrix and coefficients for event times
    if (missing(xMat)) {
        x1 <- round(rnorm(nSubject), 3)
        x2 <- round(rnorm(nSubject), 3)
        x3 <- round(rnorm(nSubject), 3)
        x4 <- sample(c(0, 1), size = nSubject, replace = TRUE)
        xMat <- cbind(x1, x2, x3, x4)
    } else {
        xMat <- as.matrix(xMat)
    }
    beta0Vec <- if (missing(beta0Vec))
                    rep(1, ncol(xMat))
                else
                    as.numeric(beta0Vec)
    ## event times
    eventTime <- denWeib(nSubject, beta0Vec, xMat, lambda, rho, gen = TRUE)
    ## censoring times
    censorTime <- runif(n = nSubject, min = censorMin, max = censorMax)
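    ## event indicator and observed time (the earlier of the event time and
    ## the censoring time)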
    event <- censorTime > eventTime
    time <- ifelse(event, eventTime, censorTime)
    trueDat <- data.frame(ID = seq_len(nSubject), xMat, time = time,
                          eventTime = eventTime, censorTime = censorTime,
                          event, isTrue = 1L)
    ## match records for subjects actually having events
    numSubFake1 <- round(matchEvent * sum(trueDat$event))
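    ## when randomMiss is FALSE, subjects are sampled with probabilities
    ## proportional to their event times, so whether the true label is missing
    ## depends on the outcome (missing not at random)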
    subIDfake1 <- if (randomMiss) {
        with(subset(trueDat, event == 1L),
             sample(ID, size = numSubFake1))
    } else {
        with(subset(trueDat, event == 1L),
             sample(ID, size = numSubFake1,
                    prob = eventTime))
    }
    ## match records for censoring cases
    numSubFake0 <- round(matchCensor * sum(! trueDat$event))
    subIDfake0 <- if (randomMiss) {
        with(subset(trueDat, event == 0L),
             sample(ID, size = numSubFake0))
    } else {
        with(subset(trueDat, event == 0L),
             sample(ID, size = numSubFake0,
                    prob = eventTime))
    }
    if (maxNum == 1) nRecordProb <- 1
    nRecords <- sample(maxNum, size = numSubFake0 + numSubFake1,
                       replace = TRUE, prob = nRecordProb)
    numFake <- sum(nRecords) - numSubFake1
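    ## indices of the fake event records: each matched subject appears
    ## (nRecords - 1) times, and each matched censoring case appears once more
    ## since its true record is the censoring record, so that
    ## length(multiInd) == numFake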
    multiInd <- c(rep(c(subIDfake0, subIDfake1), nRecords - 1), subIDfake0)
    fakeDat <- trueDat[multiInd, ]
    xMat <- xMat[multiInd, ]
    ## fake event times from alternative dist. # 1 and # 2
    fakes1 <- denWeib(numFake, beta0Vec, xMat, lambda = fakeLambda1,
                      rho = fakeRho1, censorTime = fakeDat$censorTime,
                      gen = TRUE)
    fakes2 <- denWeib(numFake, beta0Vec, xMat, lambda = fakeLambda2,
                      rho = fakeRho2, censorTime = fakeDat$censorTime,
                      gen = TRUE)
    ## fake censoring records for all selected subjects with true event times
    fakeCensor <- subset(trueDat, ID %in% subIDfake1 & event)
    if (eventOnly || ! nrow(fakeCensor)) {
        fakeCensor <- NULL
    } else {
        fakeCensor$time <- fakeCensor$censorTime
        fakeCensor$event <- fakeCensor$isTrue <- 0L
    }
    ## set up the mixture of the alternative densities
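    ## each fake event time comes from component 1 with probability `mixture`
    ## and from component 2 otherwise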
    indDist1 <- runif(numFake)
    fakeDat$time <- ifelse(indDist1 < mixture, fakes1, fakes2)
    fakeDat$event <- 1L
    fakeDat$isTrue <- 0L
    ## no need to remove fake events after censorTime here, e.g., by
    ## `fakeDat <- subset(fakeDat, time <= censorTime)`, since the fake event
    ## times are drawn from distributions truncated at the censoring times
    ## combine all parts as output
    outDat <- rbind(trueDat, fakeDat, fakeCensor)
    outDat$event <- as.integer(outDat$event)
    out <- outDat[order(outDat$ID), ]
    row.names(out) <- out$eventTime <- out$censorTime <- NULL
    out
}
### internal function ==========================================================
denWeib <- function(nEvent = 1e3, beta0Vec, xMat, lambda, rho,
                    censorTime = NULL, gen = FALSE, ...) {
    if (missing(beta0Vec))
        beta0Vec <- c(1, 1)
    if (missing(xMat))
        xMat <- rep(0, length(beta0Vec))
    vVec <- runif(nEvent)
    if (gen) {
        expXbeta <- exp(xMat %*% beta0Vec)
        if (is.null(censorTime)) {
            eventTime <- (- log(1 - vVec) / (lambda * expXbeta)) ^ (1 / rho)
            return(eventTime)
        }
        ## generate event times before the given censoring time
        ## using the truncated distribution
        ## reference: Nadarajah and Kotz (2006). JSS.
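        ## Gb is the Weibull CDF evaluated at the censoring time; inverting
        ## F(t) = 1 - exp(- lambda * t^rho * exp(x'beta)) at vVec * Gb gives a
        ## draw from the distribution truncated to [0, censorTime]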
        Gb <- 1 - exp(- lambda * censorTime ^ rho * expXbeta)
        eventTime <- (- log(1 - vVec * Gb) / (lambda * expXbeta)) ^ (1 / rho)
        return(eventTime)
    }
    ## otherwise
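    ## return h(t) * S(t), i.e., the density of the event times, evaluated on
    ## a grid of time points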
    newScale <- lambda * exp(xMat %*% beta0Vec)
    eventTime <- (- log(vVec) / newScale) ^ (1 / rho)
    tVec <- seq.int(0, max(eventTime), by = 0.01)
    htSt <- newScale * rho * tVec ^ (rho - 1) * exp(- newScale * tVec ^ rho)
    list(time = tVec, htSt = htSt, x = xMat, beta = beta0Vec)
}