# ##' Process inputs given in linear form
# process_inputs2(formulas, pars, family, link, kwd, ordering=FALSE) {
# if (length(formulas))
# }
##' Sample from a causal model
##'
##' Obtain samples from a causal model using the rejection sampling approach of
##' Evans and Didelez (2024).
##'
##' @param n number of samples required
##' @param formulas list of lists of formulas
##' @param pars list of lists of parameters
## @param data optional data frame of covariates
##' @param family families for Z,X,Y and copula
##' @param link list of link functions
##' @param dat data frame of covariates
##' @param method only `"rejection"` is valid
##' @param control list of options for the algorithm
##' @param seed random seed used for replication
##'
##' @details Samples from a given causal model using rejection sampling (or,
##' if everything is discrete, direct sampling).
##'
##' The entries for `formula` and `family` should each be a
##' list with four entries, corresponding to the \eqn{Z}, \eqn{X}, \eqn{Y} and
##' the copula. `formula` determines the model, so it is crucial that
##' every variable to be simulated is represented there exactly once. Each
##' entry of that list can either be a single formula, or a list of formulae.
##' Each corresponding entry in `family` should be the same length as the
##' list in `formula` or of length 1 (in which case it will be repeated
##' for all the variables therein).
##'
##' We use the following codes for different families of distributions:
##' 0 or 5 = binary;
##' 1 = normal;
##' 2 = t-distribution;
##' 3 = gamma;
##' 4 = beta;
##' 6 = log-normal.
##'
##' The family variables for the copula are also numeric and taken from
##' `VineCopula`.
##' Use, for example, 1 for Gaussian, 2 for t, 3 for Clayton, 4 for Gumbel,
##' 5 for Frank, 6 for Joe and 11 for FGM copulas.
##'
##' `pars` should be a named list containing: either entries `z`,
##' `x`, `y` and `cop`, or variable names that correspond to the
##' LHS of formulae in `formulas`. Each of these should themselves be a list
##' containing `beta` (a vector of regression parameters) and (possibly)
##' `phi`, a dispersion parameter. For any discrete variable that is a
##' treatment, you can also specify `p`, an initial proportion to simulate
##' from (otherwise this defaults to 0.5).
##'
##' Link functions for the Gaussian, t and Gamma distributions can be the
##' identity, inverse or log functions. Gaussian and t-distributions default to
##' the identity, and Gamma to the log link. For the Bernoulli the logit and
##' probit links are available.
##'
##' Control parameters are `oversamp` (default value 10), `trace` (default
##' value 0, increasing to 1 increases verbosity of output),
##' `max_oversamp` (default value 1000), `warn` (which currently does
##' nothing), `max_wt` which is set to 1, and increases each time the function
##' is recalled.
## (if weights empirically appear not to have an upper bound, this warns if set
## to 1 (the default) and stops if set to 2), ...
##' Control parameters also include `cop`, which gives a keyword for the
##' copula that defaults to `"cop"`.
##'
##' This function is kept largely for the replication of simulations from
##' Evans and Didelez (2024).
##'
##' @examples
##' pars <- list(z=list(beta=0, phi=1),
##' x=list(beta=c(0,0.5), phi=1),
##' y=list(beta=c(0,0.5), phi=0.5),
##' cop=list(beta=1))
##' causalSamp(100, pars = pars)
##'
## @importFrom frugalSim sim_chain
##'
##' @return A data frame containing the simulated data.
##'
##' @references Evans, R.J. and Didelez, V. Parameterizing and simulating from
##' causal models (with discussion). _Journal of the Royal Statistical Society, Series B_,
##' 2024.
##'
##' @export
causalSamp <- function(n, formulas = list(list(z ~ 1), list(x ~ z), list(y ~ x), list( ~ 1)),
pars, family, link=NULL, dat=NULL, method="rejection",
control=list(), seed) {
# get control parameters or use defaults
con = list(oversamp = 10, max_oversamp=1000, max_wt = 1, warn = 1, cop="cop", trace=0)
matches = match(names(control), names(con))
con[matches] = control[!is.na(matches)]
if (any(is.na(matches))) warning("Some names in control not matched: ",
paste(names(control[is.na(matches)]),
sep = ", "))
if (round(con$oversamp) != con$oversamp) {
con$oversamp <- ceiling(con$oversamp)
message("oversamp not an integer, rounding up")
}
if (round(con$max_oversamp) != con$max_oversamp) {
con$max_oversamp <- ceiling(con$max_oversamp)
message("max_oversamp not an integer, rounding up")
}
## get keyword for copula formula
kwd <- con$cop
## process the four main arguments
tmp <- process_inputs(formulas=formulas, pars=pars, family=family, link=link,
dat=NULL, kwd=kwd, method=method)
formulas <- tmp$formulas
pars <- tmp$pars
family <- tmp$family
link <- tmp$link
dZ <- tmp$dim[1]; dX <- tmp$dim[2]; dY <- tmp$dim[3]
LHS_Z <- tmp$LHSs$LHS_Z; LHS_X <- tmp$LHSs$LHS_X; LHS_Y <- tmp$LHSs$LHS_Y
famZ <- tmp$family[[1]]; famX <- tmp$family[[2]]; famY <- tmp$family[[3]]; famCop <- tmp$family[[4]]
datNULL <- is.null(dat)
output <- c(LHS_Z, LHS_Y)
vars <- c(LHS_Z, LHS_X, LHS_Y)
if (!datNULL) {
vars <- c(names(dat), vars)
}
if (anyDuplicated(na.omit(vars))) stop("duplicated variable names")
d <- length(vars)
# ## start to simulate
# if (method == "particle") {
# forms <- tidy_formulas(unlist(formulas), kwd)
# form_all <- merge_formulas(forms)
# msks <- masks(forms, family=family, wh=form_all$wh, cp=length(output))
#
# theta <- pars2mask(pars, msks)
#
# ## construct suitable log-likelihood function
# llhd <- function(x) {
# if (!is.matrix(x)) x <- matrix(x, nrow=1)
# colnames(x) <- vars2
# mm <- model.matrix.default(form_all$formula, data=as.data.frame(x))
# ll(x, mm, beta=theta$beta, phi=theta$phi,
# inCop = match(c(LHS_Z,LHS_Y), vars2), fam_cop = unlist(last(family)),
# family = unlist(family)[seq_along(vars)])
# }
#
# vars2 <- vars
#
# dat <- as.data.frame(matrix(NA, n, d))
# names(dat) <- vars2
#
# ## get parameters for particle simulator
# d <- sum(lengths(family[1:3]))
# dc <- d - sum(family[[1]] == 0 | family[[1]] == 5)
# n_state <- rep(2, d-dc)
# prob <- rep(list(c(0.5,0.5)), d-dc)
# attrib <- list(dc=dc, n_state=n_state, prob=prob)
#
# for (i in seq_len(n)) {
# dat[i,] <- sim_chain(n=n, d=d, ltd=llhd, ns=1, sd1=2, attrib,
# sim="importance", control=list(B=10))$x
# if (con$trace > 0) rje::printPercentage(i,n)
# }
#
# return(dat)
# }
## set seed to 'seed'
if (missing(seed)) {
seed <- round(1e9*runif(1))
}
# else .Random.seed <- seed
set.seed(seed)
## set up data frame for output
oversamp <- con$oversamp
if (!datNULL) {
dat2 <- dat[rep(seq_len(nrow(dat)), each=oversamp), ]
}
out <- data.frame(matrix(0, ncol=dZ+dX+dY, nrow=n*oversamp))
names(out) <- c(LHS_Z, LHS_X, LHS_Y)
if (!datNULL) {
out <- cbind(dat2, out)
}
## obtain treatment values
tmp <- gen_X_values(n*oversamp, famX=famX, pars=pars, LHS_X=LHS_X, dX=dX)
out[LHS_X] <- tmp$datX[LHS_X]
qden <- tmp$qden
# ## give default coefficients
# if (is.null(pars2$z$beta)) pars2$z$beta = 0
# if (is.null(pars2$z$phi)) pars2$z$phi = 1
## get linear predictors for outcomes
mms <- vector(mode = "list", length=3)
mms[[3]] = lapply(formulas[[3]], model.matrix, data=out)
for (i in seq_along(mms[[3]])) {
if (ncol(mms[[3]][[i]]) != length(pars[[LHS_Y[i]]]$beta)) stop(paste0("dimension of model matrix for ", LHS_Y[i], " does not match number of coefficients provided"))
}
# etas <- vector(mode="list", length=3)
# for (i in c(1,3)) {
# etas[[i]] <- mapply(function(x, y) x %*% pars[[y]]$beta, mms[[i]], lhs(formulas[[i]]), SIMPLIFY = FALSE)
# }
copMM <- model.matrix(formulas[[4]][[1]], out)
if (is.matrix(pars$cop$beta)) {
if (nrow(pars$cop$beta) != ncol(copMM)) stop(paste0("dimension of model matrix for copula (", ncol(copMM), ") does not match number of coefficients provided (", nrow(pars$cop$beta),")"))
}
else if (is.atomic(pars$cop$beta)) {
if (length(pars$cop$beta) != ncol(copMM)) stop(paste0("dimension of model matrix for copula (", ncol(copMM), ") does not match number of coefficients provided (", length(pars$cop$beta),")"))
}
# eta <- list()
# eta$z <- mms[[1]] %*% pars2$z$beta
# eta$y <- mms[[2]] %*% pars2$y$beta
# mms[[3]] <- model.matrix(update.formula(formulas[[4]], NULL ~ . ), out)
## note that code will be slow if continuous covariates used in vine copula
if (length(famCop) > 1) {
if (nrow(unique(copMM)) > 25) warning("using vine copulas with continuous covariates may be very slow")
}
## get copula data and then modify distributions of Y and Z
out[,output] <- sim_copula(out[,output], family=famCop,
par = pars$cop, par2=pars$cop$par2, model_matrix=copMM)
for (i in seq_along(LHS_Z)) {
mms[[1]][[i]] <- model.matrix(formulas[[1]][[i]], data=out)
out[[LHS_Z[i]]] <- rescale_var(out[[LHS_Z[i]]], X=mms[[1]][[i]],
family=famZ[[i]], pars=pars[[LHS_Z[i]]],
link=link[[1]][i])
}
for (i in seq_along(LHS_Y)) out[[LHS_Y[i]]] <- rescale_var(out[[LHS_Y[i]]], X=mms[[3]][[i]],
family=famY[[i]], pars=pars[[LHS_Y[i]]],
link=link[[3]][i])
mms[[2]] = lapply(formulas[[2]], model.matrix, data=out)
for (i in seq_along(mms[[2]])) {
if (ncol(mms[[2]][[i]]) != length(pars[[LHS_X[i]]]$beta)) stop(paste0("dimension of model matrix for ", LHS_X[i], " does not match number of coefficients provided"))
}
## perform rejection sampling
wts <- rejectionWeights(out[LHS_X], mms[[2]], family=famX, pars=pars[LHS_X], qden = qden, link=link[[2]])
con$max_wt <- max(max(wts), con$max_wt)
wts <- wts/con$max_wt
# if (mean(wts > 0.2) < 0.01) {
# if (con$warn == 1) warning("weights may be unbounded")
# else if (con$warn == 2) stop("weights may be unbounded")
# }
## different behaviour depending upon whether covariates were supplied
if (!datNULL) {
done <- matrix(runif(nrow(out)) < wts, nrow=oversamp)
wh2 <- apply(done, 2, function(x) which(x)[1])
nr <- sum(colSums(done) > 0)
while (any(!done)) {
## try again...
}
}
else {
kp <- runif(nrow(out)) < wts
out2 <- out[kp, ]
nr2 <- nrow(out2)
if (nr2 < n) {
## not enough rows, so consider increasing oversampling rate
if (con$oversamp == con$max_oversamp) {
## already at maximum oversample, so just return what we have
warning(paste0("Only sampled ", nr2, " observations"))
}
else {
con$oversamp <- min(con$max_oversamp, ceiling(con$oversamp*(n/nr2*1.1)))
out2 <- Recall(n, formulas, pars=pars, family=family, link=link, dat=dat, control=con, seed=seed)
}
}
else if (nr2 > n) {
## too many rows, so take a subsample
out2 <- out2[seq_len(n), ]
}
}
rownames(out2) <- NULL
return(out2)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.