Nothing
#' Make data
#'
#' @inheritParams CausalQueries_internal_inherit_params
#' @param param_type A character. String specifying type of parameters to make
#' ("flat", "prior_mean", "posterior_mean", "prior_draw", "posterior_draw", "define").
#' With param_type set to \code{define} use arguments to be passed to \code{make_priors};
#' otherwise \code{flat} sets equal probabilities on each nodal type in each parameter set;
#' \code{prior_mean}, \code{prior_draw}, \code{posterior_mean}, \code{posterior_draw}
#' take parameters as the means or as draws from the prior or posterior.
#' @param n Non negative integer. Number of observations. If not provided it is inferred from the largest n_step.
#' @param n_steps A \code{list}. Number of observations to be observed at each step
#' @param given A string specifying known values on nodes, e.g. "X==1 & Y==1"
#' @param nodes A \code{list}. Which nodes to be observed at each step. If NULL all nodes are observed.
#' @param probs A \code{list}. Observation probabilities at each step
#' @param subsets A \code{list}. Strata within which observations are to be observed at each step. TRUE for all, otherwise an expression that evaluates to a logical condition.
#' @param complete_data A \code{data.frame}. Dataset with complete observations. Optional.
#' @param verbose Logical. If TRUE prints step schedule.
#' @param ... additional arguments that can be passed to \code{link{make_parameters}}
#' @return A \code{data.frame} with simulated data.
#' @export
#' @details
#' Note that default behavior is not to take account of whether a node has already been observed when determining whether to select or not. One can however specifically request observation of nodes that have not been previously observed.
#' @examples
#'
#' # Simple draws
#' model <- make_model("X -> M -> Y")
#' make_data(model)
#' make_data(model, n = 3, nodes = c("X","Y"))
#' make_data(model, n = 3, param_type = "prior_draw")
#' make_data(model, n = 10, param_type = "define", parameters = 0:9)
#'
#' # Data Strategies
#' # A strategy in which X, Y are observed for sure and M is observed
#' # with 50% probability for X=1, Y=0 cases
#'
#' model <- make_model("X -> M -> Y")
#' make_data(
#' model,
#' n = 8,
#' nodes = list(c("X", "Y"), "M"),
#' probs = list(1, .5),
#' subsets = list(TRUE, "X==1 & Y==0"))
#'
#'# n not provided but inferred from largest n_step (not from sum of n_steps)
#' make_data(
#' model,
#' nodes = list(c("X", "Y"), "M"),
#' n_steps = list(5, 2))
#'
#' # Wide then deep
#' make_data(
#' model,
#' n = 8,
#' nodes = list(c("X", "Y"), "M"),
#' subsets = list(TRUE, "!is.na(X) & !is.na(Y)"),
#' n_steps = list(6, 2))
#'
# # Look for X only where X has not already been observed
#'
#' make_data(
#' model,
#' n = 8,
#' nodes = list(c("X", "Y"), c("X", "M")),
#' subsets = list(TRUE, "is.na(X)"),
#' n_steps = list(3, 2))
#'
#'# Example with probabilities at each step
#'
#'make_data(
#' model,
#' n = 8,
#' nodes = list(c("X", "Y"), c("X", "M")),
#' subsets = list(TRUE, "is.na(X)"),
#' probs = list(.5, .2))
#'
#'# Example with given data
#' make_data(model, given = "X==1 & Y==1", n = 5)
make_data <- function(
model,
n = NULL,
parameters = NULL,
param_type = NULL,
nodes = NULL, # nodes revealed at each step
n_steps = NULL, # n at each step
probs = NULL, # probs at each step
subsets = TRUE, # subsets at each step
complete_data = NULL,
given = NULL,
verbose = TRUE,
...){
# n_step, n consistency
if(!is.null(n)) n_check(n)
if(!is.null(n_steps) & !is.null(n)) if(max(n_steps %>% unlist) > n) stop("n_step larger than n")
if(!is.null(n_steps) & is.null(n)) n <- max(n_steps %>% unlist)
if(is.null(n_steps) & is.null(n)) n <- 1
# n_steps and probs reconciliation
if(!is.null(n_steps) & !is.null(probs)) warning("Both `n_steps` and `prob` specified. `n_steps` overrides `probs`.")
if(is.null(n_steps) & is.null(probs)) n_steps <- n
if(is.null(n_steps)) n_steps <- NA
if(is.null(probs)) probs <- 1
# Check that parameters sum to 1 in each param_set
if(!is.null(parameters)) parameters <- clean_param_vector(model, parameters)
# If parameters not provided, make or take from model
if(is.null(parameters)) {
if(!is.null(param_type)){
parameters <- make_parameters(model, param_type = param_type, ...)
} else {
parameters <- get_parameters(model) }}
# Check nodes
if(is.null(nodes)) nodes <- list(model$nodes)
if(!is.list(nodes)) nodes <- list(nodes) # Lets one step nodes be provided as vector
if(!(all(unlist(nodes) %in% model$nodes))) stop("All listed nodes should be in model")
# Complete data
if(is.null(complete_data)) {
complete_data <- make_data_single(model,
n = n,
parameters = parameters,
given = given)
}
# Default behavior is to return complete data --
# triggered if all data and all nodes sought in step 1
if(all(model$nodes %in% nodes[[1]]))
if(probs[1]==1 || n_steps[1]==n) return(complete_data)
# Otherwise, gradually reveal
# Check length consistency
roster <- tibble(node_names = lapply(nodes, paste, collapse = ", ") %>% unlist,
nodes = nodes, n_steps = n_steps %>% unlist, probs = probs%>% unlist, subsets = subsets%>% unlist)
if(verbose) print(roster)
observed <- complete_data
observed[,] <- FALSE
# Go step by step
j = 1
while(j <= length(nodes)) {
pars <- roster[j, ]
observed <- observe_data(
complete_data,
observed = observed,
nodes_to_observe = pars$nodes %>% unlist,
prob = pars$probs,
m = pars$n_steps,
subset = pars$subsets)
j = j + 1
}
observed_data <- complete_data
observed_data[!observed] <- NA
observed_data
}
#' Observe data, given a strategy
#'
#' @param complete_data A \code{data.frame}. Data observed and unobserved.
#' @param observed A \code{data.frame}. Data observed.
#' @param nodes_to_observe A list. Nodes to observe.
#' @param prob A scalar. Observation probability.
#' @param m A integer. Number of units to observe; if specified, \code{m} overrides \code{prob}.
#' @param subset A character. Logical statement that can be applied to rows of complete data. For instance observation for some nodes might depend on observed values of other nodes; or observation may only be sought if data not already observed!
#' @return A \code{data.frame} with logical values indicating which nodes to observe in each row of `complete_data`.
#' @importFrom stats runif
#' @importFrom dplyr tibble
#' @export
#' @examples
#' model <- make_model("X -> Y")
#' df <- make_data(model, n = 8)
#' # Observe X values only
#' observe_data(complete_data = df, nodes_to_observe = "X")
#' # Observe half the Y values for cases with observed X = 1
#' observe_data(complete_data = df,
#' observed = observe_data(complete_data = df, nodes_to_observe = "X"),
#' nodes_to_observe = "Y", prob = .5,
#' subset = "X==1")
# A strategy consists of a. names of types to reveal b. number of these to reveal c. subset from which to reveal them
observe_data <- function(complete_data,
observed = NULL,
nodes_to_observe = NULL,
prob = 1,
m = NULL,
subset = TRUE){
if(is.null(observed)) {observed <- complete_data; observed[,] <- FALSE}
if(is.null(nodes_to_observe)) nodes_to_observe <- names(complete_data)
if(is.null(m)) m <- NA
# Prep observed data dataframe
observed_data <- complete_data
observed_data[!observed] <- NA
# Within which subset to reveal?
if(!is.logical(subset) & subset != "TRUE") {
sub <- with(observed_data, eval(parse(text = subset)))
} else {
sub <- rep(TRUE, nrow(observed_data))
}
if(!any(sub)) message("Empty subset")
# Target to reveal
if(is.na(m)){
E <- prob*sum(sub) # Expected number selected
m <- floor(E) + (runif(1) < E - floor(E)) # Get best m
}
observed[sample((1:length(sub))[sub], m), nodes_to_observe] <- TRUE
observed
}
#' Generate full dataset
#'
#' @inheritParams CausalQueries_internal_inherit_params
#' @param n An integer. Number of observations.
#' @param given A string specifying known values on nodes, e.g. "X==1 & Y==1"
#' @param parameters A numeric vector. Values of parameters may be specified. By default, parameters is drawn from priors.
#' @param param_type A character. String specifying type of parameters to make ("flat", "prior_mean", "posterior_mean", "prior_draw", "posterior_draw", "define). With param_type set to \code{define} use arguments to be passed to \code{make_priors}; otherwise \code{flat} sets equal probabilities on each nodal type in each parameter set; \code{prior_mean}, \code{prior_draw}, \code{posterior_mean}, \code{posterior_draw} take parameters as the means or as draws from the prior or posterior.
#' @param w Vector of event probabilities can be provided directly. This is useful for speed for repeated data draws.
#' @param P A \code{matrix}. Parameter matrix that can be used to generate w if w is not provided
#' @param A A \code{matrix}. Ambiguity matrix that can be used to generate w if w is not provided
#' @return A \code{data.frame} of simulated data.
#' @keywords internal
#'
#' @examples
#'
#' model <- make_model("X -> Y")
#'
#' # Simplest behavior uses by default the parameter vector contained in model
#' CausalQueries:::make_data_single(model, n = 5)
#'
#' CausalQueries:::make_data_single(model, n = 5, param_type = "prior_draw")
#'
#' # Simulate multiple datasets. This is fastest if event probabilities (w) are provided
#' w <- get_event_prob(model)
#' replicate(5, CausalQueries:::make_data_single(model, n = 5, w = w))
#'
make_data_single <- function(
model,
n = 1,
parameters = NULL,
param_type = NULL,
given = NULL,
w = NULL, P = NULL, A = NULL){
# Check that parameters sum to 1 in each param_set
# if(!is.null(parameters)) parameters <- clean_param_vector(model, parameters)
# If parameters not provided, take from model
if(is.null(parameters)) {
if(!is.null(param_type)){
parameters <- make_parameters(model, param_type = param_type)
} else {
parameters <- get_parameters(model) }}
# Generate event probabilities w if missing
if(is.null(w)){
w <- get_event_prob(
model,
parameters = parameters,
A = A,
P = P,
given = given)
}
# Data drawn here
make_events(model, n = n, parameters = parameters, w = w) %>%
expand_data(model)
}
#' simulate_data is an alias for make_data
#' @param ... arguments for \code{\link{make_model}}
#' @return A \code{data.frame} with simulated data.
#' @export
#' @examples
#' simulate_data(make_model("X->Y"))
simulate_data <- function(...) make_data(...)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.