R/distributions.R

Defines functions rdistr.template distr.list rcat.b0 rcat.b1 rcategor.int rcategor rcat.factor rconst rbern

Documented in distr.list rbern rcat.b0 rcat.b1 rcategor rcategor.int rcat.factor rconst rdistr.template

#' Random Sample from Bernoulli Distribution
#'
#' Wrapper for Bernoulli node distribution.
#'
#' @param n Sample size.
#' @param prob A vector of success probabilities.
#' @return Binary vector of length \code{n}.
#' @examples
#'
#'#---------------------------------------------------------------------------------------
#'# Specifying and simulating from a DAG with 3 Bernoulli nodes
#'#---------------------------------------------------------------------------------------
#'D <- DAG.empty()
#'D <- D + node("W1", distr="rbern", prob=0.05)
#'D <- D + node("W2", distr="rbern", prob=ifelse(W1==1,0.5,0.1))
#'D <- D + node("W3", distr="rbern", prob=ifelse(W1==1,0.5,0.1))
#'Dset <- set.DAG(D)
#'simdat <- sim(Dset, n=200, rndseed=1)
#' @export
rbern <- function(n, prob) {
	rbinom(n=n, prob=prob, size=1)
}

#' Constant (Degenerate) Distribution (Returns its Own Argument \code{const})
#'
#' Wrapper for constant value (degenerate) distribution.
#'
#' @param n Sample size.
#' @param const Either a vector with one constant value (replicated \code{n} times)
#' or a vector of length \code{n} or a matrix with \code{n} rows (for a multivariate node).
#' @return A vector of constants of length \code{n}.
#' @examples
#'
#'#---------------------------------------------------------------------------------------
#'# Specifying and simulating from a DAG with 1 Bernoulli and 2 constant nodes
#'#---------------------------------------------------------------------------------------
#'D <- DAG.empty()
#'D <- D + node("W1", distr = "rbern", prob = 0.45)
#'D <- D + node("W2", distr = "rconst", const = 1)
#'D <- D + node("W3", distr = "rconst", const = ifelse(W1 == 1, 5, 10))
#'
#'# TWO equivalent ways of creating a multivariate node (just repeating W1 and W2):
#'create_mat <- function(W1, W2) cbind(W1, W2)
#'vecfun.add("create_mat")
#'
#'D <- D + node(c("W1.copy1", "W2.copy1"), distr = "rconst", const = c(W1, W2))
#'D <- D + node(c("W1.copy2", "W2.copy2"), distr = "rconst", const = create_mat(W1, W2))
#'Dset <- set.DAG(D)
#'sim(Dset, n=10, rndseed=1)
#' @export
rconst <- function(n, const) {
	if (n==0) {
		return(vector(length = n))
	} else if (is.matrix(const)) {
		return(const)
	} else if (length(const) == 1) {
		return(rep.int(const, n))
	} else if (length(const) < n) {
		stop("the length of const arg is not 1 and is less than n; it needs to be one or the other")
	} else if (length(const) > n) {
		warning("the length of const arg is more than n; const was truncated to length n")
		return(const[1:n])
	} else {
		return(const)
	}
}

#' Random Sample for a Categorical Factor
#'
#' Matrix version of the categorical distribution. The argument \code{probs} can be a matrix of n rows,
#' specifying individual (varying in sample) categorical probabilities.
#' The number of categories generated is equal to \code{ncol(probs)+1}, the levels labeled as: \code{1,...,ncol(probs)+1}.
#'
#' @param n Sample size.
#' @param probs Either a vector or a matrix of success probabilities.
#' When \code{probs} is a vector, \code{n} identically distributed random categorical variables
#' are generated with categories: 1, 2, ..., length(probs)+1.
#' When \code{probs} is a matrix, the categorical probabilities of the \code{k}th sample are determined by the
#' \code{k}th row of \code{probs} matrix, i.e., \code{probs[k,]}.
#' @return A factor of length \code{n} with levels: \code{1,2, ...,ncol(probs)+1}.
#' @examples
#'
#'#---------------------------------------------------------------------------------------
#'# Specifying and simulating from a DAG with one categorical node with constant
#'# probabilities
#'#---------------------------------------------------------------------------------------
#'D <- DAG.empty()
#'D <- D + node("race",t=0,distr="rcat.factor",probs=c(0.2,0.1,0.4,0.15,0.05,0.1))
#'Dset <- set.DAG(D)
#'simdat <- sim(Dset, n=200, rndseed=1)
#'
#'#---------------------------------------------------------------------------------------
#'# Specifying and simulating from a DAG with a categorical node with varying
#'# probabilities (probabilities are determined by values sampled for nodes L0 and L1)
#'#---------------------------------------------------------------------------------------
#'D <- DAG.empty()
#'D <- D + node("L0", distr="rnorm", mean=10, sd=5)
#'D <- D + node("L1", distr="rnorm", mean=10, sd=5)
#'D <- D + node("L2", distr="rcat.factor", probs=c(abs(1/L0), abs(1/L1)))
#'Dset <- set.DAG(D)
#'simdat <- sim(Dset, n=200, rndseed=1)
#' @seealso \code{\link{rcat.b1}}, \code{\link{rcat.b0}}
#' @export
rcat.factor <- function(n, probs) {
	as.factor(rcat.b1(n = n, probs = probs))
}

#' @describeIn rcat.factor (Deperecated) Random Sample of a Categorical Factor
#' @export
rcategor <- function(n, probs) {
  warning("This function is deprecated, please use rcat.factor() instead.")
  as.factor(rcategor.int(n = n, probs = probs))
}

#' @describeIn rcat.b1 (Deperecated) Random Sample from Base 1 Categorical (Integer) Distribution
#' @export
rcategor.int <- function(n, probs) {
  warning("This function is deprecated, please use rcat.b1() instead.")
  rcat.b1(n, probs)
}

#' Random Sample from Base 1 (rcat.b1) or Base 0 (rcat.b0) Categorical (Integer) Distribution
#'
#' Same as \code{}, but returning a vector of sampled integers with range 1, 2, ..., \code{ncol(probs)+1} for \code{rcat.b1}
#' or range 0, 1, ..., \code{ncol(probs)} for \code{rcat.b0}. For sampling categorical factors see \link{rcat.factor}.
#' @param n Sample size.
#' @param probs Either a vector or a matrix of success probabilities.
#' When probs is a vector, \code{n} identically distributed random categorical variables are
#' generated.
#' When \code{probs} is a matrix, the categorical probabilities of the \code{k}th
#' sample are determined by the \code{k}th row of probs matrix, i.e., \code{probs[k,]}.
#' @return An integer vector of length \code{n} with range either in \code{0,...,ncol(probs)} or in \code{1,...,ncol(probs)+1}.
#' @seealso \code{\link{rcat.factor}}
#' @describeIn rcat.b1 Random Sample from Base 1 Categorical (Integer) Distribution
#' @export
rcat.b1 <- function(n, probs) {
	if (n==0) {
		probs <- matrix(nrow = n, ncol = length(probs), byrow = TRUE)
	}
	if (is.vector(probs) && n>0) {
		probs <- matrix(data = probs, nrow = n, ncol = length(probs), byrow = TRUE)
	}
	probs <- cbind(probs, 1 - rowSums(probs))	# sum each row and check if some need to be normalized
	pover1_idx <- which(probs[,ncol(probs)] < -(10^-6)) # which rows of probs need to be normalized
	# reset last category to zero if its with numeric epsilon error away from 0:
	restto0 <- which(abs(probs[,ncol(probs)]) <= 10^-6 & abs(probs[,ncol(probs)]) > 0L)
	if (length(restto0)>0) {
		probs[restto0, ncol(probs)] <- 0L
	}
	if (length(pover1_idx)>0) {
		warning("some categorical probabilities add up to more than 1, normalizing to add to 1")
		probs[pover1_idx, ncol(probs)] <- 0
		probs[pover1_idx, ] <- probs[pover1_idx, ,drop = FALSE] / rowSums(probs[pover1_idx, ,drop = FALSE]) # normalize
	}
	probs_cum <- matrix(nrow = nrow(probs), ncol = ncol(probs))
	probs_cum[,1] <- probs[,1]
	for (i in seq(ncol(probs))[-1]) {
		probs_cum[,i] <- rowSums(probs[,c(1:i),drop = FALSE])
	}
	cat_sample <- probs_cum - runif(nrow(probs_cum)) < 0
	if (is.matrix(cat_sample)) {
		samples <- rowSums(cat_sample) + 1
	} else {
		samples <- cat_sample
	}
	as.integer(samples)
}

#' @describeIn rcat.b1 Random Sample from Base 0 Categorical (Integer) Distribution
#' @export
rcat.b0 <- function(n, probs) rcat.b1(n, probs) - 1

#' List All Custom Distribution Functions in \code{simcausal}.
#'
#' @export
distr.list <- function() {
	message("All custom distributions defined in SimCausal:\n")
	print(ls("package:simcausal")[grep(pattern = "^r(?!net)", x = ls("package:simcausal"), perl=TRUE)])
	# print(ls("package:simcausal", pattern="^[r]"))
	# print(ls("package:simcausal", pattern="^[S]L"))
	invisible(ls("package:simcausal"))
}

#' Template for Writing Custom Distribution Functions
#'
#' Template function for writing \code{SimCausal} custom distribution wrappers.
#'
#' One of the named arguments must be 'n', this argument is passed on to the function automatically by
#' the package and is assigned to the number of samples that needs to be generated from this distribution.
#' Other arguments (in this example arg1 and arg2) must be declared by the user as
#' arguments inside the node() function that uses this distribution,
#' e.g., \code{node("Node1"}, \code{distr="distr.template"}, \code{arg1 = ...}, \code{arg2 = ...)}.
#' Both, arg1 and arg2, can be either numeric constants or formulas involving past node names.
#' The constants get passed on to the distribution function unchanged.
#' The formulas are evaluated inside the environment of the simulated data and are passed on to the
#' distribution functions as vectors.
#' The output of the distribution function is expected to be a vector of length n of the sampled covariates.
#'
#' @param n Sample size that needs to be generated
#' @param arg1 Argument 2
#' @param arg2 Argument 1
#' @return A vector of length \code{n}
#' @param ... Additional optional parameters
#' @export
rdistr.template <- function(n, arg1, arg2, ...) {
	if (length(arg1) == 1L) arg1 <- rep.int(arg1, n)
 	if (length(arg2) == 1L) arg2 <- rep.int(arg2, n)
	if (length(arg1) != n || length(arg2) != n) stop("inputs arguments should all have the same length")
	out <- as.numeric(arg1 + arg2)
	stopifnot(length(out)==n)
	# length(out) <- n
	out
}

Try the simcausal package in your browser

Any scripts or data that you put into this service are public.

simcausal documentation built on Oct. 29, 2022, 1:13 a.m.