Nothing
#' @title Simulate Data from DAG
#' @description Simulates data from an (augmented) DAG, respecting the optional
#' metadata fields `form`, for functional form of relationships, and
#' `distribution`, for the distributions of exogenous nodes and
#' residuals.
#' @details Data is simulated sequentially, first from exogenous nodes and then
#' from their descendants. If `x` is an augmented DAG with metadata indicating
#' the functional `form` of relationships and `distribution` of exogenous nodes
#' and residuals, this information is used. If this information is absent, nodes
#' and residuals are assumed to be normally distributed, and edges are assumed
#' to be linear, with coefficients samples based on `beta_default`.
#'
#' The argument `duplicated` controls how multiplicative terms are merged across
#' edges pointing to the same outcome node. The default `duplicated = "unique"`
#' removes terms that are duplicated across edges (i.e.,
#' if two edges point to node `"O"`, and both edges specify `.5*E`, the resulting
#' function will say `.5*E`. However, if one edge specifies `.2*E` and the other
#' specifies `.3*E`, they are not duplicated and will be added.
#' Alternatively, `duplicated = "add"` just sums terms across all edges pointing
#' into the same outcome node.
#' @param x An object of class `dagitty`.
#' @param beta_default Function used to specify missing edge coefficients.
#' Default: `runif(1, min = -0.6, max = 0.6)`
#' @param n Atomic integer defining the sample size, default: `500`
#' @param run Logical, indicating whether or not to run the simulation. Default:
#' `TRUE`.
#' @param duplicated Atomic character, indicating how to resolve duplicate terms
#' from multiple edges pointing to the same node. Default: `"unique"`. See
#' Details.
#' @return If `run` is `TRUE`, this function returns a `data.frame` with
#' an additional attribute called `attr( , which = "script")` that contains the
#' script for simulating data. If `run` is `FALSE`, this function returns the
#' script as `character` vector.
#' @examples
#' x <- dagitty::dagitty('
#' dag {
#' X [distribution="rbinom(size = 2, prob = .5)"]
#' Z [distribution="rexp()"]
#' Y [distribution="rnorm()"]
#'
#' X -> Y [form="0.5+X"]
#' Z -> Y [form="2*Z"]
#' A -> X
#' }
#' ')
#' txt <- simulate_data(x, n = 5, run = FALSE)
#' df <- simulate_data(x, n = 5, run = TRUE)
#' df_from_txt <- eval(parse(text = txt))
#' @seealso
#' \code{\link[tidySEM]{get_nodes}}, \code{\link[tidySEM]{get_edges}}
#' \code{\link[dagitty]{exogenousVariables}}
#' @rdname simulate_data
#' @export
#' @importFrom tidySEM get_nodes get_edges
#' @importFrom dagitty exogenousVariables
#' @importFrom stats runif
simulate_data <- function (x,
beta_default = round(runif(1, min = -0.6, max = 0.6), 2),
n = 500,
run = TRUE,
duplicated = "unique"){
beta_default <- substitute(beta_default)
duplicated <- duplicated[1]
if(!isTRUE(duplicated %in% c("unique", "add"))){
stop("Argument `duplicated` must be one of 'unique' or 'add'.")
}
nod <- tidySEM::get_nodes(x)
edg <- tidySEM::get_edges(x)
nodes_exo <- dagitty::exogenousVariables(x)
# Create empty script
script <- c("# Set random seed",
paste0("set.seed(", sample.int(.Machine$integer.max, size = 1), ")"),
"# Set simulation parameters",
paste0("n <- ", n),
"# Simulate exogenous nodes")
# Add exogenous nodes
for(thisn in nodes_exo){
simfn <- nod$distribution[which(nod$name == thisn)]
simfn <- try(sim_fun_txt(simfn), silent = TRUE)
if(inherits(simfn, "try-error")){
cli_msg("!" = "No valid distribution for {.val {thisn}}, used `rnorm()` instead.")
simfn <- "rnorm(n)"
}
script <- c(script, paste0(thisn, " <- ", simfn))
}
# For each endogenous node, simulate data
nodes_endo <- setdiff(nod$name, nodes_exo)
nodes_existing <- nodes_exo
endo_deps_exist <- sapply(nodes_endo, function(thisn){
all(edg$from[edg$to == thisn] %in% nodes_existing)
})
script <- c(script, "# Simulate endogenous nodes")
countr <- 1L
while((length(endo_deps_exist) > 0 & countr < 2*nrow(nod))){
if(all(!endo_deps_exist)){
stop(paste0("Could not synthesize data; the dependencies for ", paste0(names(endo_deps_exist[!endo_deps_exist]), collapse = ", "), " do not exist."))
}
thisn <- names(endo_deps_exist)[endo_deps_exist][1]
edg_thisn <- edg[edg$to == thisn, , drop = FALSE]
frm <- edges_to_simfun(edg_thisn, beta_default = beta_default, duplicated = duplicated)
resid <- try(sim_fun_txt(nod$distribution[which(nod$name == thisn)]), silent = TRUE)
if(inherits(resid, "try-error")){
cli_msg("!" = "No valid distribution for {.val {thisn}}, used `rnorm()` instead.")
simfn <- "rnorm(n = n)"
}
script <- c(script, paste0(thisn, " <- ", frm, " + ", resid))
nodes_existing <- c(nodes_existing, thisn)
endo_deps_exist <- sapply(setdiff(nod$name, nodes_existing), function(thisn){
all(edg$from[edg$to == thisn] %in% nodes_existing)
})
countr <- countr + 1L
}
script <- c(script, "df <- data.frame(", paste0(nod$name, " = ", nod$name, c(rep(",", nrow(nod)-1L), "")), ")")
if(run){
eval(parse(text = script))
attr(df, "script") <- script
return(df)
} else {
return(script)
}
}
#' @importFrom methods formalArgs
sim_fun_txt <- function(simfn){
tryCatch({
if(!inherits(simfn, "character")) simfn <- "rnorm()"
simfn <- str2lang(s = simfn)
if(is.null(simfn[["n"]])){
Args <- formalArgs(eval(simfn[[1]]))
if("n" %in% Args){
simfn[["n"]] <- quote(n)
}
}
return(deparse(substitute(simfn)))
}, error = function(e){
cli_msg("!" = "No valid distribution for data, used rnorm() instead.")
return("rnorm(n = n)")
})
}
# simulate_data <- function (x, beta_default = runif(1, min = -0.6, max = 0.6),
# n = 500){
# beta_default <- substitute(beta_default)
# nod <- tidySEM::get_nodes(x)
# edg <- tidySEM::get_edges(x)
# nodes_exo <- dagitty::exogenousVariables(x)
#
# # Create empty environment
# eval_env <- new.env()
# # Add exogenous nodes
# for(thisn in nodes_exo){
# simfn <- nod$distribution[which(nod$name == thisn)]
# assign(thisn, sim_fun(simfn, n = n), envir = eval_env)
# }
# # For each endogenous node, simulate data
# nodes_endo <- setdiff(nod$name, nodes_exo)
# endo_deps_exist <- sapply(nodes_endo, function(thisn){
# all(edg$from[edg$to == thisn] %in% names(eval_env))
# })
#
# while(length(endo_deps_exist) > 0){
# if(all(!endo_deps_exist)){
# stop(paste0("Could not synthesize data; the dependencies for ", paste0(names(endo_deps_exist[!endo_deps_exist]), collapse = ", "), " do not exist."))
# }
# thisn <- sample(names(endo_deps_exist)[endo_deps_exist], 1)
# edg_thisn <- edg[edg$to == thisn, , drop = FALSE]
# frm <- merge_formulas(edg_thisn, beta_default = beta_default)
#
# trueval <- eval(parse(text = frm), envir = eval_env)
# resid <- eval(sim_fun(nod$distribution[which(nod$name == thisn)], n = n), envir = eval_env)
# assign(thisn, trueval + resid, envir = eval_env)
# endo_deps_exist <- sapply(setdiff(nodes_endo, names(eval_env)), function(thisn){
# all(edg$from[edg$to == thisn] %in% names(eval_env))
# })
# }
#
# as.data.frame(as.list(eval_env))
#
# }
#
# sim_fun <- function(simfn, n){
# tryCatch({
# if(is.na(simfn)) simfn <- "rnorm()"
# simfn <- str2lang(s = simfn)
# if(is.null(simfn[["n"]])){
# Args <- formalArgs(eval(simfn[[1]]))
# if("n" %in% Args){
# simfn[["n"]] <- n
# }
# }
# return(eval(simfn))
# }, error = function(e){
# cli_msg("!" = "No valid distribution for data, used rnorm() instead.")
# return(rnorm(n))
# })
# }
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.