Nothing
#' @keywords internal
dgenpois <- function(x, mu, phi, log = FALSE){
lx <- length(x); lm <- length(mu); lp <- length(phi)
ch <- lx == lm & lx == lp & lm == lp
if(!ch) stop("Not all supplied inputs are of same length, check.\n")
out <- sapply(1:lx, function(i) GP1_pmf_scalar(mu[i], phi[i], x[i]))
if(log)
return(log(out))
else
return(out)
}
#' Simulate realisations from a generalised poisson distribution
#' @param mu A numeric vector of rates \eqn{\exp{\eta}}, with \eqn{\eta} the linear predictor.
#' @param phi A numeric specifying the dispersion \eqn{\varphi}. If \eqn{\varphi<0} the response
#' will be under-dispersed and overdispersed if \eqn{\varphi>0}.
#'
#' @details Follows the "GP-1" implementation of the generalised Poisson distribution outlined
#' in Zamani & Ismail (2012). The variance of produced \eqn{Y} is \eqn{(1+\varphi)^2\mu}. As such
#' the dispersion parameter is bounded (i.e. not in positive reals as with CMP distribution).
#'
#' @return An appropriately-dimensioned vector of count data.
#'
#' @references
#'
#' Zamani H and Ismail N. Functional Form for the Generalized Poisson Regression Model,
#' \emph{Communications in Statistics - Theory and Methods} 2012; \strong{41(20)}; 3666-3675.
#'
#' @export
rgenpois <- function(mu, phi){
n <- length(mu)
out <- numeric(n)
for(j in 1:n){
ans <- 0;
rand <- runif(1)
kum <- dgenpois(0, mu[j], phi[j])
while(rand > kum){
ans <- ans + 1
kum <- kum + dgenpois(ans, mu[j], phi[j])
# For debugging
if(is.infinite(kum) || is.nan(kum)) cat(sprintf("NaN/Infinite kum: mu = %.2f, phi = %.2f", mu[j], phi[j]))
}
out[j] <- ans
}
out
}
#' Simulate data from a multivariate joint model
#'
#' @description Simulate multivariate longitudinal and survival data from a joint model
#' specification, with potential mixture of response families. Implementation is similar
#' to existing packages (e.g. \code{joineR}, \code{joineRML}).
#'
#' @param n the number of subjects
#' @param ntms the number of time points
#' @param fup the maximum follow-up time, such that t = [0, ..., fup] with length \code{ntms}.
#' In instances where subject \eqn{i} \emph{doesn't} fail before \code{fup}, their censoring
#' time is set as \code{fup + 0.1}.
#' @param family a \eqn{K}-\code{list} of families, see \strong{details}.
#' @param sigma a \eqn{K}-\code{list} of dispersion parameters corresponding to the order of
#' \code{family}, and matching \code{disp.formulas} specification; see \strong{details}.
#' @param beta a \eqn{K \times 4} matrix specifying fixed effects for each \eqn{K} parameter,
#' in the order (Intercept), time, continuous, binary.
#' @param D a positive-definite matrix specifying the variance-covariance matrix for the random
#' effects. If not supplied an identity matrix is assumed.
#' @param gamma a \eqn{K}-vector specifying the association parameters for each longitudinal
#' outcome.
#' @param zeta a vector of length 2 specifying the coefficients for the baseline covariates in
#' the survival sub-model, in the order of continuous and binary.
#' @param theta parameters to control the failure rate, see \strong{baseline hazard}.
#' @param cens.rate parameter for \code{rexp} to generate censoring times for each subject.
#' @param regular.times logical, if \code{regular.times = TRUE} (the default), then
#' \emph{every} subject will have the same follow-up times defined by
#' \code{seq(0, fup, length.out = ntms)}. If \code{regular.times = FALSE} then follow-up times are
#' set as random draws from a uniform distribution with maximum \code{fup}.
#' @param dof integer, specifies the degrees of freedom of the multivariate t-distribution
#' used to generate the random effects. If specified, this t-distribution is used. If left
#' at the default \code{dof=Inf} then the random effects are drawn from a multivariate normal
#' distribution.
#' @param random.formulas allows user to specify if an intercept-and-slope (\code{~ time}) or
#' intercept-only (\code{~1}) random effects structure should be used on a response-by-response
#' basis. Defaults to an intercept-and-slope for all responses.
#' @param disp.formulas allows user to specify the dispersion model simulated. Intended use is
#' to allow swapping between intercept only (the default) and a time-varying one (\code{~ time}).
#' Note that this should be a \eqn{K}-\code{list} of formula objects, so if only one dispersion
#' model is wanted, then an intercept-only should be specified for remaining sub-models. The
#' corresponding item in list of \code{sigma} parameters should be of appropriate size. Defaults
#' to an intercept-only model.
#' @param return.ranefs a logical determining whether the \emph{true} random effects should be
#' returned. This is largely for internal/simulation use. Default \code{return.ranefs = FALSE}.
#'
#' @returns A list of two \code{data.frame}s: One with the full longitudinal data, and another
#' with only survival data. If \code{return.ranefs=TRUE}, a matrix of the true \eqn{b} values is
#' also returned. By default (i.e. no arguments provided), a bivariate Gaussian set of joint
#' data is returned.
#'
#' @details \code{simData} simulates data from a multivariate joint model with a mixture of
#' families for each \eqn{k=1,\dots,K} response. The specification of \code{family} changes
#' requisite dispersion parameter \code{sigma}, if applicable. The \code{family} list can
#' (currently) contain:
#'
#' \describe{
#'
#' \item{\code{"gaussian"}}{Simulated with identity link, corresponding item in \code{sigma}
#' will be the \strong{variance}.}
#' \item{\code{"poisson"}}{Simulated with log link, corresponding dispersion in \code{sigma}
#' can be anything, as it doesn't impact simulation.}
#' \item{\code{"binomial"}}{Simulated with logit link, corresponding dispersion in \code{sigma}
#' can be anything, as it doesn't impact simulation.}
#' \item{\code{"negbin"}}{Simulated with a log link, corresponding item in \code{sigma} will be
#' the \strong{overdispersion} defined on the log scale. Simulated variance is
#' \eqn{\mu+\mu^2/\varphi}.}
#' \item{\code{"genpois"}}{Simulated with a log link, corresponding item in \code{sigma} will be
#' the \strong{dispersion}. Values < 0 correspond to under-dispersion, and values > 0 over-
#' dispersion. See \code{\link{rgenpois}} for more information. Simulated variance is
#' \eqn{(1+\varphi)^2\mu}.}
#' \item{\code{"Gamma"}}{Simulated with a log link, corresponding item in \code{sigma} will be
#' the \strong{shape} parameter, defined on the log-scale.}
#'
#' }
#'
#' Therefore, for families \code{"negbin"}, \code{"Gamma"}, \code{"genpois"}, the user can
#' define the dispersion model desired in \code{disp.formulas}, which creates a data matrix
#' \eqn{W}. For the \code{"negbin"} and \code{"Gamma"} cases, we define
#' \eqn{\varphi_i=\exp\{W_i\sigma_i\}} (i.e. the exponent of the linear predictor of the
#' dispersion model); and for \code{"genpois"} the identity of the linear is used.
#'
#' @section Baseline hazard:
#'
#' When simulating the survival time, the baseline hazard is a Gompertz distribution controlled
#' by \code{theta=c(x,y)}:
#'
#' \deqn{\lambda_0(t) = \exp{x + yt}}
#'
#' where \eqn{y} is the shape parameter, and the scale parameter is \eqn{\exp{x}}.
#'
#' @importFrom MASS mvrnorm
#' @importFrom mvtnorm rmvt
#'
#' @author James Murray (\email{j.murray7@@ncl.ac.uk}).
#' @keywords simulation
#' @seealso \code{\link{joint}}
#'
#' @references
#'
#' Austin PC. Generating survival times to simulate Cox proportional hazards
#' models with time-varying covariates. \emph{Stat Med.} 2012; \strong{31(29)}:
#' 3946-3958.
#'
#' @export
#'
#' @examples
#' # 1) A set of univariate data ------------------------------------------
#' beta <- c(2.0, 0.33, -0.25, 0.15)
#' # Note that by default arguments are bivariate, so need to specify the univariate case
#' univ.data <- simData(beta = beta,
#' gamma = 0.15, sigma = list(0.2), family = list("gaussian"),
#' D = diag(c(0.25, 0.05)))
#'
#' # 2) Univariate data, with failure rate controlled ---------------------
#' # In reality, many facets contribute to the simulated failure rate, in
#' # this example, we'll just atler the baseline hazard via 'theta'.
#' univ.data.highfail <- simData(beta = beta,
#' gamma = 0.15, sigma = list(0.0), family = list("poisson"),
#' D = diag(c(0.40, 0.08)), theta = c(-2, 0.1))
#'
#' # 3) Trivariate (K = 3) mixture of families with dispersion parameters -
#' beta <- do.call(rbind, replicate(3, c(2, -0.1, 0.1, -0.2), simplify = FALSE))
#' gamma <- c(0.3, -0.3, 0.3)
#' D <- diag(c(0.25, 0.09, 0.25, 0.05, 0.25, 0.09))
#' family <- list('gaussian', 'genpois', 'negbin')
#' sigma <- list(.16, 1.5, log(1.5))
#' triv.data <- simData(ntms=15, family = family, sigma = sigma, beta = beta, D = D,
#' gamma = gamma, theta = c(-3, 0.2), zeta = c(0,-.2))
#'
#' # 4) K = 4 mixture of families with/out dispersion ---------------------
#' beta <- do.call(rbind, replicate(4, c(2, -0.1, 0.1, -0.2), simplify = FALSE))
#' gamma <- c(-0.75, 0.3, -0.6, 0.5)
#' D <- diag(c(0.25, 0.09, 0.25, 0.05, 0.25, 0.09, 0.16, 0.02))
#' family <- list('gaussian', 'poisson', 'binomial', 'gaussian')
#' sigma <- list(.16, 0, 0, .05) # 0 can be anything here, as it is ignored internally.
#' mix.data <- simData(ntms=15, family = family, sigma = sigma, beta = beta, D = D, gamma = gamma,
#' theta = c(-3, 0.2), zeta = c(0,-.2))
#'
#' # 5) Bivariate joint model with two dispersion models. -----------------
#' disp.formulas <- list(~time, ~time) # Two time-varying dispersion models
#' sigma <- list(c(0.00, -0.10), c(0.10, 0.15)) # specified in form of intercept, slope
#' D <- diag(c(.25, 0.04, 0.50, 0.10))
#' disp.data <- simData(family = list("genpois", "negbin"), sigma = sigma, D = D,
#' beta = rbind(c(0, 0.05, -0.15, 0.00), 1 + c(0, 0.25, 0.15, -0.20)),
#' gamma = c(1.5, 1.5),
#' disp.formulas = disp.formulas, fup = 5)
#'
#' # 6) Trivariate joint model with mixture of random effects models ------
#' # It can be hard to e.g. fit a binomial model on an intercept and slope, since e.g.
#' # glmmTMB might struggle to accurately fit it (singular fits, etc.). To that end, could
#' # swap the corresponding random effects specification to be an intercept-only.
#' family <- list("gaussian", "binomial", "gaussian")
#' # A list of formulae, even though we want to change the second sub-model's specification
#' # we need to specify the rest of the items, too (same as disp.formulas, sigma).
#' random.formulas <- list(~time, ~1, ~time)
#' beta <- rbind(c(2, -0.2, 0.5, -0.25), c(0, 0.5, 1, -1), c(-2, 0.2, -0.5, 0.25))
#' # NOTE that the specification of RE matrix will need to match.
#' D <- diag(c(0.25, 0.09, 1, 0.33, 0.05))
#' # Simulate data, and return REs as a sanity check...
#' mix.REspec.data <- simData(beta = beta, D = D, family = family,
#' gamma = c(-0.5, 1, 0.5), sigma = list(0.15, 0, 0.15),
#' random.formulas = random.formulas, return.ranefs = TRUE)
simData <- function(n = 250, ntms = 10, fup = 5,
family = list('gaussian', 'gaussian'),
sigma = list(0.16, 0.16),
beta = rbind(c(1, 0.10, 0.33, -0.50), c(1, 0.10, 0.33, -0.50)), D = NULL,
gamma = c(0.5, -0.5), zeta = c(0.05, -0.30),
theta = c(-4, 0.2), cens.rate = exp(-3.5),
regular.times = TRUE,
dof = Inf,
random.formulas = NULL, disp.formulas = NULL,
return.ranefs = FALSE){
random.formula <- random.formulas
# Checks --------------
# Check family is valid option
family <- lapply(family, function(f){
if("function"%in%class(f)) f <- f()$family
if(!f%in%c('gaussian', 'binomial', 'poisson', 'genpois', 'Gamma', 'genpois', 'negbin'))
stop('Family must be one of "gaussian", "binomial", "poisson", "genpois", "negbin" or "Gamma".')
f
})
# Check dispersion supplied if neg. binom.
family <- lapply(family, function(f) match.arg(f, c('gaussian', 'binomial', 'poisson', 'genpois', 'Gamma', 'genpois', 'negbin'), several.ok = F))
funlist <- unlist(family)
# Fixed effects
if(!"matrix"%in%class(beta)) beta <- t(beta) # Assume if a vector is passed as beta, then a univariate set of data is being attempted.
K <- nrow(beta)
if(K != length(funlist)) stop('Incorrect number of families provided and/or beta terms.')
if(K != length(gamma)) stop('Incorrect number of association parameters provided wrt beta terms.')
if(is.null(D)) D <- diag(K * 2) else D <- D
if(!is.null(random.formula)){
if(length(random.formula) != K) stop('Provided random formulas must be a list of length K.')
}else{
if((K*2) != ncol(D)) stop('Incorrect dimension on supplied covariance matrix D, do you need to supply random.formula?')
}
# Check covariance matrix D is positive semi-definite.
if(is.not.SPD(D)) stop('D must be positive semi-definite')
# Parse dispersion formulas
pf <- parent.frame()
if(is.null(disp.formulas)){
disp.formulas <- replicate(K, ~1, simplify = FALSE)
lapply(disp.formulas, function(x) environment(x) <- pf)
}else{
if(sum(!sapply(disp.formulas, is.null)) != K)
stop("Need to supply dispersion formulas for all responses even if not required for all K responses\n",
"(You can just fill-in ~1 for those with no wanted dispersion model.)")
}
# Necessary parameters & data generation ----
if(regular.times){
time <- seq(0, fup, length.out = ntms)
tau <- fup + 0.1
}else{
time <- c(replicate(n, c(0, sort(runif(ntms - 1, max = fup)))))
tau <- fup
}
cont <- rnorm(n); bin <- rbinom(n, 1, 0.5) # Continuous and binary covariates.
df <- data.frame(id = rep(1:n, each = ntms),
time = if(regular.times) rep(time, n) else time,
cont = rep(cont, each = ntms),
bin = rep(bin, each = ntms))
# Design matrices, used across ALL responses ----
X <- model.matrix(~ time + cont + bin, data = df)
W <- lapply(disp.formulas, function(x) model.matrix(x, data = df))
if(!is.null(random.formula)){
Z <- lapply(random.formula, function(x) model.matrix(x, data = df))
}else{
Z <- replicate(K, model.matrix(~ 1 + time, data = df), simplify = F) # assume all intslopes.
}
# Check that the supplied disp.formulas match with elements of sigma
if(!all(sapply(1:K, function(k) ncol(W[[k]])==length(sigma[[k]]))))
stop("Mis-specified sigma or disp.formulas.\n")
# Random effects specification
if(is.infinite(dof))
.simRE <- function(n, mu, Sigma) MASS::mvrnorm(n = n, mu = mu, Sigma = Sigma)
else
.simRE <- function(n, mu, Sigma) mvtnorm::rmvt(n = n, sigma = Sigma, delta = mu, df = dof)
# Linear predictor & response generation ----
if(!is.null(random.formula)){
q <- ncol(do.call(cbind, lapply(Z, head)))
# b <- MASS::mvrnorm(n, mu = rep(0, q), Sigma = D)
b <- .simRE(n, rep(0, q), D)
b.inds <- split(1:q, do.call(c, sapply(1:K, function(k) rep(k, ncol(Z[[k]])), simplify = F)))
}else{
# b <- MASS::mvrnorm(n, mu = rep(0, K * 2), Sigma = D)
b <- .simRE(n, rep(0, K * 2), D)
b.inds <- split(1:(2*K), rep(1:K, each = 2))
}
Y <- sapply(1:K, function(k){
f <- family[[k]]; Zk <- Z[[k]]
betak <- beta[k,,drop=T]
etak <- X %*% betak + rowSums(Zk * b[df$id, b.inds[[k]]])
phik <- W[[k]] %*% sigma[[k]]
switch(f,
gaussian = Y <- rnorm(n * ntms, mean = etak, sd = sqrt(phik[1])),
binomial = Y <- rbinom(n * ntms, 1, plogis(etak)),
poisson = Y <- rpois(n * ntms, exp(etak)),
Gamma = Y <- rgamma(n * ntms, shape = exp(phik), scale = exp(etak)/exp(phik)),
negbin = Y <- MASS::rnegbin(n * ntms, mu = exp(etak), theta = exp(phik)),
genpois = {
maxtry <- 5
try <- 1
e <- T
while(e){ # genpois can be quite sensitive in simulation part; repeat sim up to 5 times.
Y <- tryCatch(rgenpois(exp(etak), phik),
error = function(e) NA)
e <- any(is.na(Y))
try <- try + 1
if(try > maxtry){
stop(sprintf("Issues creating genpois response, this could be due to large values for mu (range: %.2f-%.2f),",
range(exp(etak))[1], range(exp(etak))[2]),
sprintf(" or dispersion (range: %.2f, %.2f); it's recommended that this value is between",
range(phik)[1], range(phik)[2]),
sprintf(" %.2f and %.2f.\n", max(-1, -max(exp(etak)/4)), 1))
}
}
Y
})
})
colnames(Y) <- paste0('Y.', 1:K)
df <- cbind(df, Y)
# Survival ----
theta0 <- theta[1]; theta1 <- theta[2]
Keta <- cbind(cont, bin) %*% zeta
U <- runif(n)
if(!is.null(random.formula)){
zz <- lapply(Z, colnames)
ints <- which(grepl('\\(Intercept\\)', do.call(c, zz))); slopes <- which(grepl('time', do.call(c, zz)))
if(any(!grepl('\\(Intercept\\)|time', do.call(c, zz)))) message('Warning: Only intercept-only or intercept-ands-slope random.formulas allowed.')
# Which of the K responses have intercept/slope
ints.gamma <- do.call(c, lapply(1:K, function(k){
if(any(grepl('\\(Intercept\\)', colnames(Z[[k]])))) return(k) else return(NULL)
}))
slopes.gamma <- do.call(c, lapply(1:K, function(k){
if(any(grepl('time', colnames(Z[[k]])))) return(k) else return(NULL)
}))
}else{
ints <- seq(1,2*K,by=2); slopes <- seq(2, 2*K, by = 2) # intslope on all.
ints.gamma <- slopes.gamma <- 1:K # Each K has both intercept and slope.
}
b0 <- b[, ints, drop = F]; b1 <- b[, slopes, drop = F]
# Generate survival times (Austin et al 2012)
denom <- theta1 + b1 %*% gamma[slopes.gamma]
rhs <- (theta1 + b1 %*% gamma[slopes.gamma]) * log(U)/(exp(theta0 + Keta + b0 %*% gamma[ints.gamma]))
t <- suppressWarnings(log(1-rhs)/denom)
t[is.nan(t)] <- tau
# Collect survival times, and generate cenors times.
cens.time <- rexp(n, cens.rate)
survtime <- pmin(t, cens.time)
survtime[survtime >= tau] <- tau
# Status flag
status <- rep(1, n)
is.censored <- cens.time < survtime
status[which(survtime == tau | is.censored | survtime == cens.time)] <- 0 # Failure flag
# Output Dataframes ----
surv.data <- data.frame(id = 1:n, survtime, status)
long.data <- df
out.data <- merge(df, surv.data, by = 'id')
out.data <- out.data[out.data$time < out.data$survtime, ]
cat(sprintf("%.2f%% failure rate.\n", 100 * sum(surv.data$status)/n))
out <- list(data = out.data,
surv.data = out.data[!duplicated(out.data[,'id']), c('id', 'survtime', 'status', 'cont', 'bin')])
if(return.ranefs) out$ranefs <- b
out
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.