# Copyright (C) 2017,2018 Sam Brilleman
#' Simulate data for a univariate or multivariate joint model
#'
#' Returns a list of data frames containing data simulated from a joint model for
#' longitudinal and time-to-event data.
#'
#' @export
#' @importFrom stats gaussian
#' @importFrom methods is
#'
#' @param n Number of individuals
#' @param M Number of longitudinal markers
#' @param fixed_trajectory The desired type of trajectory in the fixed effects
#' part of the longitudinal model. Can be \code{"none"} (intercept only),
#' \code{"linear"} (intercept and linear slope),
#' \code{"quadratic"} (intercept, linear slope, and quadratic term),
#' or \code{"cubic"} (intercept, linear slope, quadratic term, and
#' cubic term). Can be a single character string, or a character
#' vector of length M (if a different trajectory type is to be used
#' for each longitudinal submodel). Note that in addition to these time
#' effects, two baseline covariates (one binary and one continuous) are
#' always included in the longitudinal submodel as well.
#' @param random_trajectory The desired type of trajectory in the random effects
#' part of the longitudinal model. Can be \code{"none"} (random intercept only),
#' \code{"linear"} (random intercept and linear slope),
#' \code{"quadratic"} (random intercept, linear slope, and quadratic term),
#' or \code{"cubic"} (random intercept, linear slope, quadratic term, and
#' cubic term). Can be a single character string, or a character
#' vector of length M (if a different trajectory type is to be used
#' for each longitudinal submodel). Note that the corresponding
#' \code{fixed_trajectory} argument must be at least as complex as the
#' random effect structure; for example, you cannot specify
#' \code{fixed_trajectory = "linear"} and \code{random_trajectory = "cubic"},
#' because there would be no corresponding fixed effects parameters for
#' the quadratic and cubic random terms in the model.
#' @param assoc A character string, or a character vector of length M,
#' specifying the desired type of association structure for
#' linking each longitudinal outcome to the hazard of the event.
#' Only one association structure type can be specified for each longitudinal
#' outcome. The options are: \code{"null"}, \code{"etavalue"},
#' \code{"etaslope"}, \code{"etaauc"}, \code{"muvalue"},
#' \code{"shared_b(1)"}, \code{"shared_coef(1)"}, \code{"shared_b(2)"},
#' \code{"shared_coef(2)"}.
#' @param basehaz The desired baseline hazard for the event submodel. Can be
#' \code{"weibull"}.
#' @param betaLong_intercept True intercept in the longitudinal submodel. Can
#' be a scalar or a vector of length M.
#' @param betaLong_binary True coefficient for the binary covariate in the
#' longitudinal submodel. Can be a scalar or a vector of length M.
#' @param betaLong_continuous True coefficient for the continuous covariate in the
#' longitudinal submodel. Can be a scalar or a vector of length M.
#' @param betaLong_linear True coefficient for the fixed effect linear term in
#' the longitudinal submodel when \code{fixed_trajectory = "linear"},
#' \code{fixed_trajectory = "quadratic"} or \code{fixed_trajectory = "cubic"}.
#' Can be a scalar or a vector of length M.
#' @param betaLong_quadratic True coefficient for the fixed effect quadratic term
#' in the longitudinal submodel when \code{fixed_trajectory = "quadratic"} or
#' \code{fixed_trajectory = "cubic"}. Can be a scalar or a vector of length M.
#' @param betaLong_cubic True coefficient for the fixed effect cubic term in
#' the longitudinal submodel when \code{fixed_trajectory = "cubic"}.
#' Can be a scalar or a vector of length M.
#' @param betaLong_aux True parameter value for the auxiliary parameter in the
#' longitudinal submodel (sigma for Gaussian models, number of trials for
#' binomial models, size for negative binomial models, shape for Gamma models,
#' lambda for inverse Gaussian models). Can be a scalar or a vector of length M.
#' @param betaEvent_intercept True intercept term (log hazard scale) in the
#' event submodel. Only required for Weibull models.
#' @param betaEvent_binary True coefficient (log hazard ratio) for the binary
#' covariate in the event submodel.
#' @param betaEvent_continuous True coefficient (log hazard ratio) for the
#' continuous covariate in the event submodel.
#' @param betaEvent_assoc True association parameter (log hazard ratio) in the
#' event submodel. Can be a scalar or a vector of length M.
#' @param betaEvent_aux True parameter value(s) for the auxiliary parameter(s) in
#' the event submodel (shape parameter for Weibull models). Can be a scalar
#' or a vector, depending on how many auxiliary parameters are required for
#' specifying the baseline hazard.
#' @param prob_Z1 Probability for the binary covariate included in each of the
#' submodels.
#' @param mean_Z2 Mean of the (normally distributed) continuous covariate included
#' in each of the submodels.
#' @param sd_Z2 Standard deviation of the (normally distributed) continuous
#' covariate included in each of the submodels.
#' @param b_sd A list, with each element of the list containing the vector of
#' standard deviations for the individual-level random effects.
#' @param b_rho Correlation between the individual-level random effects. This is
#' only relevant when there is a total of >1 individual-level random effects in
#' the joint model. This can be specified as a scalar correlation term, which
#' assumes the same (true) correlation
#' between each of the individual-level random effects, or it can be a correlation
#' matrix for the correlation between the individual-level random effects. When
#' simulating data for a multivariate joint model, the structure of the
#' correlation matrix is such that the first \eqn{K_1} columns/rows correspond
#' to the individual-level random effects for the first longitudinal submodel,
#' and the next \eqn{K_2} columns/rows correspond to the individual-level random
#' effects for the second longitudinal submodel, and so on.
#' @param max_yobs The maximum allowed number of longitudinal measurements for
#' each biomarker. The actual number of observed measurements will depend on the
#' individuals event time. Every individual is forced to have at least a baseline
#' measurement for each biomarker (i.e. a biomarker measurement at time 0). The
#' remaining biomarker measurement times will be uniformly distributed between 0
#' and \code{max_fuptime} if \code{balanced = FALSE}, or evenly spaced between
#' 0 and \code{max_fuptime} if \code{balanced = TRUE}.
#' @param max_fuptime The maximum follow up time in whatever the desired time
#' units are. This time will also be used as the censoring time (i.e. for
#' individuals who have a simulated survival time that is after \code{max_fuptime}).
#' @param balanced A logical, specifying whether the timings of the longitudinal
#' measurements should be balanced across individuals. If \code{FALSE} (the
#' default), then each individual will have a baseline longitudinal measurement
#' and the remaining measurement times will be chosen randomly from a uniform
#' distribution on the range \code{[0,max_fuptime]}. If \code{TRUE}, then each
#' individual will have a baseline longitudinal measurement and the remaining
#' measurement times will be at common times for each individual, chosen to be
#' evenly spaced between \code{[0,max_fuptime]}.
#' @param family A family for the the longitudinal submodel, or for a multivariate
#' joint model this can be a list of families. See \code{\link[stats]{glm}} and
#' \code{\link[stats]{family}}.
#' @param clust_control A named list providing the arguments that are
#' necessary for simulating data where the first longitudinal submodel has
#' lower level clustering within individuals. The named list should contain
#' the following elements:
#' \describe{
#' \item{L}{Integer specifying the maximum number of lower level units
#' clustered within an individual. The actual number of lower level units
#' clustered within each individual is taken to be a random uniform (truncated
#' to an integer) on the range \code{[1,L+1]}.}
#' \item{assoc}{Character string specifying the method for combining information
#' across the lower level units clustered within an individual when forming the
#' association structure. Can be \code{"sum"} for specifying which indicates
#' the association structure should be based on a summation across the lower
#' level units clustered within an individual. Can be \code{"mean"} which
#' indicates that the association structure should be based on the mean
#' (i.e. average) taken across the lower level units clustered within an
#' individual.}
#' \item{u_sd}{Numeric vector providing the standard deviations of the random
#' effects at the cluster level.}
#' \item{u_rho}{A scalar or a correlation matrix providing the correlation
#' between the random effects for the lower level clustering factor. This is
#' only relevant if there are >1 random effects for the lower level clustering
#' factor. This is specified in a similar way as the \code{b_rho} argument
#' described above.}
#' \item{random_trajectory}{The desired type of trajectory in the random effects
#' part of the longitudinal model at the cluster level. Can be \code{"none"} to
#' only include a random intercept at the cluster level, or otherwise
#' \code{"linear"} to include a random intercept and linear slope term.}
#' }
#' @param return_eta A logical, if \code{return_eta = TRUE}, then the simulated
#' data will also include the value of longitudinal submodel's linear predictor
#' evaluated at the various measurement times. These will be stored in the
#' variables named \code{Xij_1}, \code{Xij_2}, etc. (Note that if
#' \code{family = "gaussian"} then these are equivalent to the error-free
#' values of the biomarker at the measurement times).
#' @param seed An optional \code{\link[=set.seed]{seed}}.
#' @param interval The interval over which to search for the
#' \code{\link[stats]{uniroot}} corresponding to each simulated event time.
#'
#' @details The \code{simjm} function returns data simulated under a joint
#' longitudinal and time-to-event model. The joint model can be univariate
#' (i.e. one longitudinal outcome) or multivariate (i.e. more than one
#' longitudinal outcome). If more than one longitudinal outcome is specified
#' then the longitudinal outcomes are assumed to be correlated via a joint
#' multivariate normal distribution for the individual-level random effects.
#' Each longitudinal outcome may have a different family and link function.
#' The time-to-event model is assumed to be a parametric proportional hazards
#' model with the baseline hazard specified via the \code{basehaz} argument.
#' The (log) hazard of the event is assumed to be related to each longitudinal
#' outcome via the association structure specified in the \code{assoc} argument.
#' A different association may be specified for linking each longitudinal
#' outcome to the hazard of the event.
#'
#' Note that by default \code{simjm} will simulate data for one longitudinal
#' outcome (i.e. a univariate joint model). If we want to simulate data for
#' a multivariate joint model, or we wish to use one of the non-default
#' association structures (for example "etaslope"), then we may need to adjust
#' the "true" parameters accordingly so that we still get realistic survival
#' times (for example, by changing the "true" association parameter value).
#'
#' @return A list of data frames, one for each longitudinal biomarker (in long
#' format) and one for the event time data. The returned object also has a
#' number of attributes, including a class attribute \code{simjm} and
#' a number of other attributes that record the arguments that were specified in
#' the \code{simjm} call; for example the "true" parameter values, the number
#' of individuals, the number of longitudinal markers, and so on.
#'
#' @examples
#' # Note that throughout the examples we specify 'n = 30' to
#' # ensure a small sample size so that the examples run quickly
#'
#' # Simulate one longitudinal marker (we can just use the defaults):
#' simdat1 <- simjm(n = 30)
#'
#' # Simulate two longitudinal markers and then check the
#' # true parameter values (stored as an attribute):
#' simdat2 <- simjm(M = 2, n = 30, betaEvent_assoc = 0.1)
#' attr(simdat2, "params")
#'
#' # Simulate one longitudinal marker, using "etaslope"
#' # association structure:
#' simdat3 <- simjm(n = 30, assoc = "etaslope", betaEvent_assoc = 0.8)
#'
#' # For one longitudinal marker, with a bernoulli outcome.
#' # (Note that 'betaLong_aux' specifies the number of trials
#' # for the binomial outcome).
#' simdat4 <- simjm(M = 1, n = 30,
#' betaLong_intercept = 1.3,
#' betaLong_binary = -0.6,
#' betaLong_continuous = -0.03,
#' betaLong_linear = 0.05,
#' b_sd = c(1, 0.05),
#' betaEvent_intercept = -9,
#' betaEvent_assoc = 0.3,
#' family = binomial(),
#' betaLong_aux = 1)
#'
#' # For one longitudinal marker, with lower level clustering
#' # within individuals. The model includes only a random intercept
#' # at the individual-level, and then a random intercept and
#' # random slope at the lower cluster level. The association
#' # structure is based on a summation of the expected values for
#' # each of the lower level units.
#' simdat5 <- simjm(M = 1, n = 30,
#' fixed_trajectory = "linear",
#' random_trajectory = "none",
#' betaLong_intercept = -1,
#' b_sd = 1,
#' clust_control = list(
#' L = 6,
#' assoc = "sum",
#' random_trajectory = "linear",
#' u_sd = c(1,1),
#' u_rho = 0.2
#' ))
#'
#' # Simulate two longitudinal markers, where the first longitudinal
#' # marker has a linear trajectory, and the second marker has
#' # quadratic trajectory. We will specify a correlation matrix for
#' # the individual-level random effects (via the 'b_rho' argument).
#' corrmat <- matrix(c(
#' 1.0, 0.2, 0.5, 0.0, 0.0,
#' 0.2, 1.0, 0.0, 0.2, 0.0,
#' 0.5, 0.0, 1.0, 0.3, 0.0,
#' 0.0, 0.2, 0.3, 1.0, -.1,
#' 0.0, 0.0, 0.0, -.1, 1.0), ncol = 5)
#' simdat6 <- simjm(M = 2, n = 30,
#' fixed_trajectory = c("linear", "quadratic"),
#' random_trajectory = c("linear", "quadratic"),
#' b_sd = c(2, 1, 2, 1, 0.5),
#' b_rho = corrmat,
#' betaEvent_assoc = c(0.1, 0.2))
#'
simjm <- function(n = 200, M = 1,
fixed_trajectory = "cubic",
random_trajectory = "linear",
assoc = "etavalue",
basehaz = c("weibull"),
betaLong_intercept = 10,
betaLong_binary = -1,
betaLong_continuous = 1,
betaLong_linear = -0.25,
betaLong_quadratic = 0.03,
betaLong_cubic = -0.0015,
betaLong_aux = 0.5,
betaEvent_intercept = -7.5,
betaEvent_binary = -0.5,
betaEvent_continuous = 0.5,
betaEvent_assoc = 0.5,
betaEvent_aux = 1.2,
b_sd = c(1.5, 0.07), b_rho = -0.2,
prob_Z1 = 0.5,
mean_Z2 = 0, sd_Z2 = 1,
max_yobs = 10,
max_fuptime = 20,
balanced = FALSE,
family = gaussian,
clust_control = list(),
return_eta = FALSE,
seed = sample.int(.Machine$integer.max, 1),
interval = c(1E-8, 200)) {
#----- Preliminaries
set.seed(seed)
basehaz <- match.arg(basehaz)
if (max_yobs < 1)
stop("'max_yobs' must be at least 1.")
# Check input assoc is valid
ok_assocs <- c("etavalue", "etaslope", "etaauc", "muvalue",
"null", "shared_b(1)", "shared_coef(1)",
"shared_b(2)", "shared_coef(2)")
assoc <- maybe_broadcast(assoc, M)
if (!all(assoc %in% ok_assocs))
stop("'assoc' must be one of: ", paste(ok_assocs, collapse = ", "))
# Check input to trajectory type is valid
ok_trajs <- c("none", "linear", "quadratic", "cubic")
fixed_trajectory <- maybe_broadcast(fixed_trajectory, M)
random_trajectory <- maybe_broadcast(random_trajectory, M)
if (!all(fixed_trajectory %in% ok_trajs))
stop("'fixed_trajectory' must be one of: ", paste(ok_trajs, collapse = ", "))
if (!all(random_trajectory %in% ok_trajs))
stop("'random_trajectory' must be one of: ", paste(ok_trajs, collapse = ", "))
if (!length(fixed_trajectory) == M)
stop("'fixed_trajectory' is the wrong length.")
if (!length(random_trajectory) == M)
stop("'random_trajectory' is the wrong length.")
# Check family is valid
if (!is(family, "list"))
family <- list(family)
family <- maybe_broadcast(family, M)
family <- lapply(family, validate_family)
ok_families <- c("gaussian", "binomial")
lapply(family, function(x) if (!x$family %in% ok_families)
stop("'family' must be one of: ", paste(ok_families, collapse = ", ")))
# Error check to ensure that the random effects structure isn't
# more complex than the corresponding fixed effects structure
fixed_traj_index <- match(fixed_trajectory, ok_trajs)
random_traj_index <- match(random_trajectory, ok_trajs)
sel <- which(random_traj_index > fixed_traj_index)
if (length(sel))
stop("The 'random_trajectory' cannot be more complex than the ",
"corresponding 'fixed_trajectory'. This problem was encountered ",
"for longitudinal submodel(s): ", paste(sel, collapse = ", "))
# Error checks for assoc type
for (m in 1:M) {
shared_slope <- (assoc[m] %in% c("shared_b(2)", "shared_coef(2)"))
if (shared_slope && (!random_trajectory[m] == "linear"))
stop("Can only use 'shared_b(2)' or 'shared_coef(2)' when ",
"'random_trajectory' is linear.")
}
# Check specified structure for any lower-level clustering
has_clust <- !missing(clust_control)
if (has_clust) { # has lower-level clustering within the individual
if (!is(clust_control, "list"))
stop("'clust_control' should be a named list.")
clust_nms <- names(clust_control)
ok_clust_args <- c("L", "assoc", "random_trajectory", "u_sd", "u_rho")
if (!all(clust_nms %in% ok_clust_args))
stop("'clust_control' should only include the following named arguments: ",
paste(ok_clust_args, collapse = ", "))
req_clust_args <- c("L", "assoc", "random_trajectory", "u_sd")
if (!all(req_clust_args %in% clust_nms))
stop("'clust_control' must include the following named arguments: ",
paste(req_clust_args, collapse = ", "))
if (clust_control$L < 1)
stop("In clust_control, 'L' should be a positive integer.")
if (!is.numeric(clust_control$u_sd) || any(clust_control$u_sd < 0))
stop("In clust_control, 'u_sd' must be a positive scalar.")
ok_clust_assocs <- c("sum", "mean", "max", "min")
if (!clust_control$assoc %in% ok_clust_assocs)
stop("In clust_control, 'assoc' must be one of: ",
paste(ok_clust_assocs, collapse = ", "))
ok_clust_trajs <- c("none", "linear")
if (!(clust_control$random_trajectory %in% ok_clust_trajs))
stop("'clust_control$random_trajectory' must be one of: ",
paste(ok_clust_trajs, collapse = ", "))
marker1_traj_types <- c(random_trajectory[1L], clust_control$random_trajectory)
if (!any(marker1_traj_types == "none")) {
stop("If lower-level clustering within the individual is specified, ",
"then the random effects must be limited to a random intercept ",
"(ie. 'random_trajectory = none') at either the individual-level ",
"or the clustering-level within individuals.")
}
grp_assoc <- clust_control$assoc
} else {
grp_assoc <- NULL
}
#----- Parameters
# Broadcast user-specified parameters
betaLong_intercept <- maybe_broadcast(betaLong_intercept, M)
betaLong_binary <- maybe_broadcast(betaLong_binary, M)
betaLong_continuous <- maybe_broadcast(betaLong_continuous, M)
betaLong_linear <- maybe_broadcast(betaLong_linear, M)
betaLong_quadratic <- maybe_broadcast(betaLong_quadratic, M)
betaLong_cubic <- maybe_broadcast(betaLong_cubic, M)
betaLong_aux <- maybe_broadcast(betaLong_aux, M)
betaEvent_assoc <- maybe_broadcast(betaEvent_assoc, M)
# Draw individual-level REs
b_dim <- sapply(random_trajectory, function(x)
switch(x,
none = 1L, # random intercepts model
linear = 2L, # random slopes model
quadratic = 3L, # random quadratic terms
cubic = 4L) # random cubic terms
)
b_dim_total <- sum(b_dim) # total num of individual-level REs
# Validate b_sd
if (length(unique(b_dim)) == 1L) { # same num of REs in each submodel
if (length(b_sd) == b_dim[1]) {
b_sd <- rep(b_sd, times = M)
}
}
if (!length(b_sd) == b_dim_total)
stop("'b_sd' appears to be the wrong length.")
# Validate b_rho
if (b_dim_total == 1) { # only one RE, no corr matrix needed
b_corr_mat = matrix(1,1,1)
} else { # >1 RE, requires corr matrix
if (is.scalar(b_rho) && b_dim_total > 1L) {
# user supplied a constant correlation term for REs
b_corr_mat <- matrix(rep(b_rho, b_dim_total ^ 2), ncol = b_dim_total)
diag(b_corr_mat) <- 1
} else {
# user supplied a correlation matrix for REs
b_corr_mat <- validate_corr_matrix(b_rho)
}
}
# Draw standardised REs and scale them by b_sd
b_dd <- MASS::mvrnorm(n = n, mu = rep(0, b_dim_total), Sigma = b_corr_mat)
b <- sapply(1:length(b_sd), function(x) b_sd[x] * b_dd[,x])
colnames(b) <- paste0("b", 1:b_dim_total)
# Draw random effects if lower level clustering within individuals
# NB lower level clustering only applies to the first longitudinal submodel
if (has_clust) {
L <- clust_control$L
u_sd <- clust_control$u_sd
u_dim <- switch(clust_control$random_trajectory,
none = 1L, # random intercepts model
linear = 2L) # random slopes model
if (!length(u_sd) == u_dim)
stop("In clust_control, 'u_sd' appears to be the wrong length. ",
"Should be length ", u_dim, ".")
Li <- as.integer(stats::runif(n, 1, L+1)) # num units within each individual
if (u_dim == 1) { # only one RE, no corr matrix needed
u_corr_mat = matrix(1,1,1)
} else { # >1 RE, requires corr matrix
u_rho <- clust_control$u_rho
if (is.null(u_rho))
stop("'u_rho' must be provided in the clust_control list when there is ",
"more than one random effect for the lower level clustering factor.")
if (is.scalar(u_rho) && u_dim > 1L) {
# user supplied a constant correlation term for REs
u_corr_mat <- matrix(rep(u_rho, u_dim ^ 2), ncol = u_dim)
diag(u_corr_mat) <- 1
} else {
# user supplied a correlation matrix for REs
u_corr_mat <- validate_corr_matrix(u_rho)
}
}
u_dd <- MASS::mvrnorm(n = sum(Li), mu = rep(0, u_dim), Sigma = u_corr_mat)
u <- sapply(1:length(clust_control$u_sd), function(x) u_sd[x] * u_dd[,x])
colnames(u) <- paste0("u", 1:u_dim)
}
# Construct data frame of parameters
betas <- list()
# Longitudinal submodel parameters
for (m in 1:M) {
nm <- paste0("Long", m)
betas[[nm]] <- data.frame(id = 1:n)
# fixed effect parameters
betas[[nm]][[paste0("betaLong_intercept", m)]] <- rep(betaLong_intercept[m], n)
betas[[nm]][[paste0("betaLong_binary", m)]] <- rep(betaLong_binary[m], n)
betas[[nm]][[paste0("betaLong_continuous", m)]] <- rep(betaLong_continuous[m], n)
if (fixed_trajectory[m] %in% c("linear", "quadratic", "cubic")) {
# fixed effect linear
betas[[nm]][[paste0("betaLong_linear", m)]] <- rep(betaLong_linear[m], n)
}
if (fixed_trajectory[m] %in% c("quadratic", "cubic")) {
# fixed effect quadratic
betas[[nm]][[paste0("betaLong_quadratic", m)]] <- rep(betaLong_quadratic[m], n)
}
if (fixed_trajectory[m] %in% c("cubic")) {
# fixed effect cubic
betas[[nm]][[paste0("betaLong_cubic", m)]] <- rep(betaLong_cubic[m], n)
}
# add on subject-specific intercept
shift <- if (m == 1) 0 else sum(b_dim[1:(m - 1)])
b_idx <- shift + 1
betas[[nm]][[paste0("betaLong_intercept", m)]] <-
betas[[nm]][[paste0("betaLong_intercept", m)]] + b[, b_idx]
# add on subject-specific linear term
if (random_trajectory[m] %in% c("linear", "quadratic", "cubic")) {
b_idx <- shift + 2
betas[[nm]][[paste0("betaLong_linear", m)]] <-
betas[[nm]][[paste0("betaLong_linear", m)]] + b[, b_idx]
}
# add on subject-specific quadratic term
if (random_trajectory[m] %in% c("quadratic", "cubic")) {
b_idx <- shift + 3
betas[[nm]][[paste0("betaLong_quadratic", m)]] <-
betas[[nm]][[paste0("betaLong_quadratic", m)]] + b[, b_idx]
}
# add on subject-specific cubic term
if (random_trajectory[m] %in% c("cubic")) {
b_idx <- shift + 4
betas[[nm]][[paste0("betaLong_cubic", m)]] <-
betas[[nm]][[paste0("betaLong_cubic", m)]] + b[, b_idx]
}
}
# Additional clustering factors (only allowed for longitudinal submodel 1)
if (has_clust) { # expand rows and add on cluster-specific random effects
betas[["Long1"]] <-
betas[["Long1"]][rep(row.names(betas[["Long1"]]), Li), , drop = FALSE]
# cluster id within each individual
betas[["Long1"]]$clust <- sequence(Li)
# unique cluster id
betas[["Long1"]]$clust_id <-
paste(betas[["Long1"]][["id"]], betas[["Long1"]]$clust, sep = "_")
# add on cluster-specific intercept
betas[["Long1"]][["betaLong_intercept1"]] <-
betas[["Long1"]][["betaLong_intercept1"]] + u[, 1]
# add on cluster-specific linear slope
if (clust_control$random_trajectory %in% "linear") {
betas[["Long1"]][["betaLong_linear1"]] <-
betas[["Long1"]][["betaLong_linear1"]] + u[, 2]
}
}
# Event submodel parameters
betas[["Event"]] <- data.frame(
id = 1:n,
betaEvent_intercept = rep(betaEvent_intercept, n),
betaEvent_binary = rep(betaEvent_binary, n),
betaEvent_continuous = rep(betaEvent_continuous, n),
betaEvent_aux = rep(betaEvent_aux, n)
)
for (m in 1:M)
betas[["Event"]][[paste0("betaEvent_assoc", m)]] <- rep(betaEvent_assoc[m], n)
#----- Data
# Generate baseline covariates - binary
Z1 <- stats::rbinom(n, 1, prob_Z1) # covariate value for each subject
# Generate baseline covariates - continuous
Z2 <- stats::rnorm(n, mean_Z2, sd_Z2) # covariate value for each subject
# Construct data frame of baseline covariates
covs <- data.frame(id = 1:n, Z1, Z2)
# Generate survival times
ss <- simsurv::simsurv(# the following arguments apply to 'simsurv'
hazard = jm_hazard, x = covs, betas = betas,
idvar = "id", ids = covs$id,
maxt = max_fuptime, interval = interval,
# the following arguments apply to 'jm_hazard'
basehaz = basehaz, M = M, trajectory = fixed_trajectory,
assoc = assoc, family = family, grp_assoc = grp_assoc)
# Construct data frame of event data
dat <- list(
Event = data.frame(betas[["Event"]], covs, ss) # single row per subject
)
# Construct data frame of longitudinal data
for (m in 1:M) {
nm <- paste0("Long", m)
dat[[nm]] <- merge(betas[[nm]], dat[["Event"]])
dat[[nm]] <- merge(dat[[nm]], covs)
dat[[nm]] <- dat[[nm]][rep(row.names(dat[[nm]]), max_yobs), ] # multiple row per subject
# create observation times
if (balanced) { # longitudinal observation times balanced across individuals
tij_seq <- max_fuptime * 0:(max_yobs - 1) / max_yobs # baseline and post-baseline
dat[[nm]]$tij <- rep(tij_seq, each = nrow(betas[[nm]]))
} else { # longitudinal observation times unbalanced across individuals
tij_seq1 <- rep(0, nrow(betas[[nm]])) # baseline
tij_seq2 <- stats::runif(nrow(dat[[nm]]) - nrow(betas[[nm]]), 0, max_fuptime) # post-baseline
dat[[nm]]$tij <- c(tij_seq1, tij_seq2)
}
if (has_clust && m == 1) { # sort on ID, cluster ID and time
dat[[nm]] <- dat[[nm]][order(dat[[nm]]$id, dat[[nm]]$clust_id, dat[[nm]]$tij), ]
} else { # sort on ID and time
dat[[nm]] <- dat[[nm]][order(dat[[nm]]$id, dat[[nm]]$tij), ]
}
dat[[nm]][[paste0("Xij_", m)]] <-
dat[[nm]][[paste0("betaLong_intercept", m)]] +
dat[[nm]][[paste0("betaLong_binary", m)]] * dat[[nm]][["Z1"]] +
dat[[nm]][[paste0("betaLong_continuous", m)]] * dat[[nm]][["Z2"]]
if (fixed_trajectory[m] %in% c("linear", "quadratic", "cubic")) {
dat[[nm]][[paste0("Xij_", m)]] <- dat[[nm]][[paste0("Xij_", m)]] +
dat[[nm]][[paste0("betaLong_linear", m)]] *
dat[[nm]][["tij"]]
}
if (fixed_trajectory[m] %in% c("quadratic", "cubic")) {
dat[[nm]][[paste0("Xij_", m)]] <- dat[[nm]][[paste0("Xij_", m)]] +
dat[[nm]][[paste0("betaLong_quadratic", m)]] *
dat[[nm]][["tij"]] *
dat[[nm]][["tij"]]
}
if (fixed_trajectory[m] %in% c("cubic")) {
dat[[nm]][[paste0("Xij_", m)]] <- dat[[nm]][[paste0("Xij_", m)]] +
dat[[nm]][[paste0("betaLong_cubic", m)]] *
dat[[nm]][["tij"]] *
dat[[nm]][["tij"]] *
dat[[nm]][["tij"]]
}
fam <- family[[m]]$family
invlink <- family[[m]]$linkinv
mu <- invlink(dat[[nm]][[paste0("Xij_", m)]])
if (fam == "gaussian") {
sigma <- betaLong_aux[m]
dat[[nm]][[paste0("Yij_", m)]] <- stats::rnorm(length(mu), mu, sigma)
} else if (fam == "binomial") {
trials <- betaLong_aux[m]
dat[[nm]][[paste0("Yij_", m)]] <- stats::rbinom(length(mu), trials, mu)
} else if (fam == "poisson") {
dat[[nm]][[paste0("Yij_", m)]] <- stats::rpois(length(mu), mu)
} else if (fam == "neg_binomial_2") {
size <- betaLong_aux[m]
dat[[nm]][[paste0("Yij_", m)]] <- stats::rnbinom(length(mu), size, mu)
} else if (fam == "Gamma") {
shape <- betaLong_aux[m]
dat[[nm]][[paste0("Yij_", m)]] <- stats::rgamma(length(mu), shape, rate = shape / mu)
} else if (fam == "inverse.gaussian") {
lambda <- betaLong_aux[m]
dat[[nm]][[paste0("Yij_", m)]] <- .rinvGauss(length(mu), mu, lambda)
}
}
# Prepare final datasets
ret <- lapply(dat, function(x) {
if ("tij" %in% colnames(x))
x <- x[x$tij <= x$eventtime, ] # only keep rows before event time
if (return_eta) {
sel <- grep("^id$|^clust|^Z|^tij|^Yij|^Xij|eventtime|status", colnames(x))
} else {
sel <- grep("^id$|^clust|^Z|^tij|^Yij|eventtime|status", colnames(x))
}
x <- x[, sel, drop = FALSE]
rownames(x) <- NULL
return(x)
})
# Only keep individuals who have at least one measurement for each biomarker
# (NB Following the issues around delayed entry, this is now all individuals,
# since every individual is forced to have a baseline measurement. This
# may change in the future though, if there is a need to test whether
# rstanarm is correctly accommodating delayed entry, i.e. the situation
# in which we exclude individuals who do not have a baseline measurement.)
commonids <- ret[[1L]]$id
for (i in 1:length(ret))
commonids <- intersect(commonids, ret[[i]]$id)
ret <- lapply(ret, function(x) x[x$id %in% commonids, , drop = FALSE])
# Store 'true' parameter values
long_params <- nlist(
betaLong_intercept,
betaLong_binary,
betaLong_continuous,
betaLong_aux
)
if (any(fixed_trajectory %in% c("linear", "quadratic", "cubic")))
long_params$betaLong_linear <- betaLong_linear
if (any(fixed_trajectory %in% c("quadratic", "cubic")))
long_params$betaLong_quadratic <- betaLong_quadratic
if (any(fixed_trajectory %in% c("cubic")))
long_params$betaLong_cubic <- betaLong_cubic
event_params <- nlist(
betaEvent_intercept,
betaEvent_binary,
betaEvent_continuous,
betaEvent_assoc,
betaEvent_aux
)
re_params <- nlist(
b_sd,
b_corr = b_corr_mat
)
cov_params <- nlist(
prob_Z1, mean_Z2, sd_Z2
)
# Return object
structure(ret,
params = c(long_params, event_params, re_params, cov_params),
n = length(unique(ret$Event$id)),
M = M,
max_yobs = max_yobs,
max_fuptime = max_fuptime,
balanced = balanced,
assoc = assoc,
family = family,
fixed_trajectory = fixed_trajectory,
random_trajectory = random_trajectory,
return_eta = return_eta,
clust_control = clust_control,
seed = seed,
class = c("simjm", class(ret)))
}
#--------------------- internal
# Return the hazard at time t, for a shared parameter joint model
#
# @param t The time variable
# @param x A named vector of covariate values
# @param betas A named vector of parameter values
# @param basehaz The type of baseline hazard
# @param M The number of longitudinal submodels
# @param assoc A vector of character strings, indicating the association
# structure to use for each longitudinal submodel
# @param A list of family objects, providing the family (and link function)
# for each longitudinal submodel
# @return A scalar
jm_hazard <- function(t, x, betas, basehaz = "weibull", M = 1,
trajectory = "linear", assoc = "etavalue",
family = list(gaussian()), grp_assoc = NULL) {
if (t == 0)
return(0) # boundary condition
# Baseline hazard
if (basehaz == "weibull") {
h0 <- betas[["Event"]][["betaEvent_aux"]] *
(t ^ (betas[["Event"]][["betaEvent_aux"]] - 1))
}
# Time-invariant part of event submodel eta
etaevent <-
betas[["Event"]][["betaEvent_intercept"]] +
betas[["Event"]][["betaEvent_binary"]] * x[["Z1"]] +
betas[["Event"]][["betaEvent_continuous"]] * x[["Z2"]]
# Association structure
for (m in 1:M) {
nm <- paste0("Long", m)
etabaseline_m <- etavalue_m <-
betas[[nm]][[paste0("betaLong_intercept", m)]] +
betas[[nm]][[paste0("betaLong_binary", m)]] * x[["Z1"]] +
betas[[nm]][[paste0("betaLong_continuous", m)]] * x[["Z2"]]
if (trajectory[m] %in% c("linear", "quadratic", "cubic")) {
etavalue_m <- etavalue_m +
betas[[nm]][[paste0("betaLong_linear", m)]] * t
}
if (trajectory[m] %in% c("quadratic", "cubic")) {
etavalue_m <- etavalue_m +
betas[[nm]][[paste0("betaLong_quadratic", m)]] * (t * t)
}
if (trajectory[m] %in% c("cubic")) {
etavalue_m <- etavalue_m +
betas[[nm]][[paste0("betaLong_cubic", m)]] * (t * t * t)
}
if (!is.null(grp_assoc)) {
if (grp_assoc == "sum") {
res_etavalue_m <- sum(etavalue_m)
} else if (grp_assoc == "mean") {
res_etavalue_m <- mean(etavalue_m)
}
} else res_etavalue_m <- etavalue_m
if (assoc[m] == "etavalue") {
# eta value
res_etavalue_m <- collapse_across_clusters(etavalue_m, grp_assoc)
etaevent <- etaevent +
betas[["Event"]][[paste0("betaEvent_assoc", m)]] * res_etavalue_m
} else if (assoc[m] == "etaslope") {
# eta slope
if (trajectory[m] == "none") {
etaslope_m <- 0
} else if (trajectory[m] == "linear") {
etaslope_m <-
betas[[nm]][[paste0("betaLong_linear", m)]]
} else if (trajectory[m] == "quadratic") {
etaslope_m <-
betas[[nm]][[paste0("betaLong_linear", m)]] +
betas[[nm]][[paste0("betaLong_quadratic", m)]] * 2 * t
} else if (trajectory[m] == "cubic") {
etaslope_m <-
betas[[nm]][[paste0("betaLong_linear", m)]] +
betas[[nm]][[paste0("betaLong_quadratic", m)]] * 2 * t +
betas[[nm]][[paste0("betaLong_cubic", m)]] * 3 * I(t ^ 2)
}
res_etaslope_m <- collapse_across_clusters(etaslope_m, grp_assoc)
etaevent <- etaevent +
betas[["Event"]][[paste0("betaEvent_assoc", m)]] * res_etaslope_m
} else if (assoc[m] == "etaauc") {
# eta auc
if (trajectory[m] == "none") {
etaauc_m <-
etabaseline_m * t
} else if (trajectory[m] == "linear") {
etaauc_m <-
etabaseline_m * t +
I(1/2) * betas[[nm]][[paste0("betaLong_linear", m)]] * I(t ^ 2)
} else if (trajectory[m] == "quadratic") {
etaauc_m <-
etabaseline_m * t +
I(1/2) * betas[[nm]][[paste0("betaLong_linear", m)]] * I(t ^ 2) +
I(1/3) * betas[[nm]][[paste0("betaLong_linear", m)]] * I(t ^ 3)
} else if (trajectory[m] == "cubic") {
etaauc_m <-
etabaseline_m * t +
I(1/2) * betas[[nm]][[paste0("betaLong_linear", m)]] * I(t ^ 2) +
I(1/3) * betas[[nm]][[paste0("betaLong_linear", m)]] * I(t ^ 3) +
I(1/4) * betas[[nm]][[paste0("betaLong_linear", m)]] * I(t ^ 4)
}
res_etaauc_m <- collapse_across_clusters(etaauc_m, grp_assoc)
etaevent <- etaevent +
betas[["Event"]][[paste0("betaEvent_assoc", m)]] * res_etaauc_m
} else if (assoc[m] == "muvalue") {
# mu value
invlink <- family[[m]]$invlink
muvalue_m <- invlink(etavalue_m)
res_muvalue_m <- collapse_across_clusters(muvalue_m, grp_assoc)
etaevent <- etaevent +
betas[["Event"]][[paste0("betaEvent_assoc", m)]] * res_muvalue_m
}
}
# Calculate hazard
h <- h0 * exp(etaevent)
# Validate and return hazard
if (!length(h) == 1) {
stop("Bug found: returned hazard should be a scalar.")
}
return(h)
}
# Apply summary function in association structure when there is lower-level
# clustering within patients
#
# @param x A scalar or numeric vector with the 'etavalue', 'etaslope', 'etaauc'
# value (or whatever quantity is being used in the association structure) for
# one patient.
# @param collapse_fun A function to match and then apply to 'x'. If NULL, then
# 'x' is just returned, in which case 'x' should just be a scalar (i.e. there
# should only be a patient-level value in the association structure and no
# lower-level clusters within a patient).
# @return A scalar, used as the association term in the event submodel.
collapse_across_clusters <- function(x, collapse_fun = NULL) {
if (is.null(collapse_fun)) {
return(x)
} else {
fn <- match.fun(collapse_fun)
return(fn(x))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.