#' @title Simulate a normal data set \eqn{S = (X)}.
#'
#' @description Creates a toy data set \eqn{S = (X)} where the columns of \eqn{X}
#' are sampled from an independent Gaussian distribution with mean \eqn{\mu_i} and
#' standard deviation \eqn{\sigma_i}, i.e. \eqn{N(\mu_i, \sigma_i^2)}.
#' The final dimension will be \eqn{n \times p},
#' with the number of data points \eqn{n} to be specified.
#'
#' @details All arguments are validated before any sampling takes place and
#' an error is raised if: \code{n} is missing, not a single value, not a finite
#' number, not a whole number or not positive; \code{mu} or \code{sigma}
#' contain \code{NA} or are not numeric; \code{sigma} contains negative
#' entries; \code{mu} and \code{sigma} differ in length or are empty.
#'
#' @param n The desired number of data points in the data set. Must be a
#' single positive whole number.
#' @param mu A non-empty \eqn{p}-dimensional numeric vector of means for
#' \eqn{\mu}.
#' @param sigma A non-empty \eqn{p}-dimensional numeric vector of non-negative
#' standard deviations for \eqn{\sigma}.
#'
#' @return An \eqn{n \times p} dimensional data frame given by \eqn{S = (X)},
#' with columns named \code{X1}, ..., \code{Xp}.
#' In the default case, the columns \eqn{X_i} are sampled from \eqn{N(0,1)}, \eqn{n = 100} and \eqn{p = 10}.
#' @examples
#' generate_X()
#'
#' generate_X(n = 40, mu = 1:10, sigma = rep(1, 10))
#' @export
generate_X <- function(n = 100, mu = rep(0, 10), sigma = rep(1, 10)) {
  # --- Validate n: one non-missing, finite, positive whole number ---------
  if (any(is.na(n))) stop("n must not be NA")
  if (length(n) != 1) {
    stop(
      "The parameter n must be a single value and not a vector. Supplied n: ",
      toString(n)
    )
  }
  # Guard against character/Inf input, which would otherwise crash inside the
  # `%%` and `if` below with an unrelated low-level error.
  if (!is.numeric(n) || !is.finite(n)) {
    stop("n must be a finite number. Supplied n: ", n)
  }
  if (n %% 1 != 0) stop("n must be an integer. Supplied n: ", n)
  if (n <= 0) {
    stop("The parameter n must be a positive integer. Supplied n: ", n)
  }

  # --- Validate mu and sigma ----------------------------------------------
  # toString() is used in the messages because stop() pastes vector elements
  # together without a separator (c(1, -1, 2) would be shown as "1-12").
  if (any(is.na(mu))) stop("mu must not be NA")
  if (!is.numeric(mu)) {
    stop("mu is not a numeric vector. Supplied mu: ", toString(mu))
  }
  if (any(is.na(sigma))) stop("sigma must not be NA")
  if (!is.numeric(sigma)) {
    stop("sigma is not a numeric vector. Supplied sigma: ", toString(sigma))
  }
  if (any(sigma < 0)) {
    stop(
      "The parameter sigma must contain non-negative numbers. ",
      "Supplied sigma: ", toString(sigma)
    )
  }
  if (length(mu) != length(sigma)) {
    stop(
      "mu and sigma do not have the same length. Length of mu: ", length(mu),
      ". Length of sigma: ", length(sigma)
    )
  }
  p <- length(mu)
  # With p = 0 there is no column to sample and the renaming step below would
  # fail with a confusing error, so reject empty input explicitly.
  if (p < 1) stop("mu and sigma must contain at least one entry")

  # --- Sample one column per (mu_i, sigma_i) pair --------------------------
  # The column-binding step emits name-repair messages for the unnamed
  # columns; they are silenced because the columns are renamed right after.
  out <- suppressMessages(
    purrr::map2_dfc(mu, sigma, function(m, s) stats::rnorm(n, m, s))
  )
  colnames(out) <- paste0("X", seq_len(p))
  out
}
#' @title Simulate a normal data set \eqn{S = (X, X_{cat})} that includes categorical variables.
#'
#' @description Creates a toy data set \eqn{S = (X, X_{cat})} where the columns of \eqn{X}
#' are sampled from an independent Gaussian distribution with mean \eqn{\mu_i} and
#' standard deviation \eqn{\sigma_i}, i.e. \eqn{N(\mu_i, \sigma_i^2)},
#' and the columns of \eqn{X_{cat}} are categorical, sampled with replacement from a given number of categories (indexed by integers).
#' The final dimension will be \eqn{n \times (p_1 + p_2)},
#' where \eqn{p_1} is the number of columns in \eqn{X} and \eqn{p_2} is the number of columns in \eqn{X_{cat}},
#' with the number of data points \eqn{n} to be specified.
#'
#' @details \code{no_of_cat} is validated first (no \code{NA}, at least one
#' entry, numeric, finite positive whole numbers); \code{n}, \code{mu} and
#' \code{sigma} are then validated by \code{generate_X()}. A column with
#' \eqn{k} categories is drawn uniformly with replacement from
#' \eqn{1, \dots, k}. The levels of each factor are the category indices
#' actually drawn, so a category that is never sampled does not appear as a
#' level.
#'
#' @param n The desired number of data points in the data set.
#' @param mu A \eqn{p_1}-dimensional vector of means for \eqn{\mu}.
#' @param sigma A \eqn{p_1}-dimensional vector of non-negative standard deviations for \eqn{\sigma}.
#' @param no_of_cat A non-empty \eqn{p_2}-dimensional vector of positive whole
#' numbers where the entries indicate the number of categories desired for
#' each column of \eqn{X_{cat}}.
#'
#' @return An \eqn{n \times (p_1 + p_2)} dimensional data frame given by \eqn{S = (X, X_{cat})}.
#' The categorical columns continue the numbering of the numeric ones, i.e.
#' they are named \code{X(p_1 + 1)}, ..., \code{X(p_1 + p_2)}.
#' In the default case, the columns of \eqn{X} are sampled from \eqn{N(0,1)}, \eqn{n = 100} and \eqn{p_1 = 10, p_2 = 2},
#' i.e. two additional categorical columns of \eqn{X_{cat}} are added. The columns of \eqn{X_{cat}} are factors.
#' @examples
#' generate_X_cat()
#'
#' generate_X_cat(n = 40, mu = 1:6, sigma = rep(1, 6), no_of_cat = c(2,3,5))
#' @export
generate_X_cat <- function(n = 100, mu = rep(0, 10), sigma = rep(1, 10),
                           no_of_cat = c(4, 5)) {
  # --- Validate no_of_cat --------------------------------------------------
  if (any(is.na(no_of_cat))) stop("no_of_cat must be not NA")
  if (length(no_of_cat) < 1) stop("no_of_cat must contain at least one entry")
  # Reject non-numeric input up front; `%%` below would otherwise fail with
  # a low-level "non-numeric argument" error.
  if (!is.numeric(no_of_cat)) {
    stop(
      "no_of_cat is not a numeric vector. Supplied no_of_cat: ",
      toString(no_of_cat)
    )
  }
  # The is.finite() test comes first so that Inf never reaches `%%`, where it
  # would yield NaN and make the `if` condition NA.
  if (any(!is.finite(no_of_cat)) ||
      any(no_of_cat %% 1 != 0) ||
      any(no_of_cat <= 0)) {
    stop(
      "no_of_cat must contain positive integers. Supplied no_of_cat: ",
      toString(no_of_cat)
    )
  }

  # --- Numeric part: delegated to generate_X (which validates n, mu, sigma)
  out <- generate_X(n = n, mu = mu, sigma = sigma)
  p <- ncol(out)

  # --- Categorical part ----------------------------------------------------
  # One column per entry k of no_of_cat, drawn with replacement from 1..k.
  # Name-repair messages for the unnamed columns are silenced because the
  # columns are renamed below.
  out_cat <- suppressMessages(
    purrr::map_dfc(no_of_cat, function(k) sample.int(k, n, replace = TRUE))
  )
  # Convert every sampled integer column to a factor.
  out_cat <- dplyr::mutate(
    out_cat,
    dplyr::across(dplyr::everything(), factor)
  )
  # Continue the X1..Xp numbering of the numeric columns.
  colnames(out_cat) <- paste0("X", p + seq_along(no_of_cat))

  dplyr::bind_cols(out, out_cat)
}
#' @title Simulate a normal data set \eqn{S = (X,Y)}, where \eqn{Y = X^T \beta}.
#'
#' @description Creates a toy data set \eqn{S = (X,Y)} where the columns of \eqn{X}
#' are sampled from an independent Gaussian distribution with mean \eqn{\mu_i} and
#' standard deviation \eqn{\sigma_i}, i.e. \eqn{N(\mu_i, \sigma_i^2)}. The response \eqn{Y}
#' is given by \eqn{Y = X^T \beta}. The final dimension will be \eqn{n \times (p + 1)},
#' with the number of data points \eqn{n} to be specified.
#'
#' @details \code{beta_coefficients} is validated first (no \code{NA},
#' numeric, same length as \code{mu}); \code{n}, \code{mu} and \code{sigma}
#' are then validated by \code{generate_X()}. The response is computed
#' without a noise term, as the matrix product of the sampled columns with
#' \code{beta_coefficients}.
#'
#' @param n desired number of data points in the data set.
#' @param mu a \eqn{p}-dimensional vector of means for \eqn{\mu}.
#' @param sigma a \eqn{p}-dimensional vector of non-negative standard deviations for \eqn{\sigma}.
#' @param beta_coefficients a \eqn{p}-dimensional vector of coefficients for \eqn{\beta}.
#'
#' @return An \eqn{n \times (p+1)} dimensional data frame given by \eqn{S = (X,Y)};
#' the response is stored in the last column, named \code{Y}.
#' In the base case, the columns \eqn{X_i} are sampled from \eqn{N(0,1)}.
#' We also have \eqn{n = 100} and \eqn{p = 10}, with \eqn{\beta}-coefficients 1 to 10.
#' @examples
#' generate_XY()
#'
#' generate_XY(n = 60, mu = 1:4, sigma = rep(1, 4), beta_coefficients = 1:4)
#' @export
generate_XY <- function(n = 100, mu = rep(0, 10), sigma = rep(1, 10),
                        beta_coefficients = 1:10) {
  # --- Validate beta_coefficients ------------------------------------------
  if (any(is.na(beta_coefficients))) stop("beta_coefficients must not be NA")
  if (!is.numeric(beta_coefficients)) {
    # toString() keeps the elements readable; stop() alone would paste them
    # together without any separator.
    stop(
      "beta_coefficients is not a numeric vector. ",
      "Supplied beta_coefficients: ", toString(beta_coefficients)
    )
  }
  if (length(mu) != length(beta_coefficients)) {
    stop(
      "mu and beta_coefficients do not have the same length. Length of mu: ",
      length(mu),
      ". Length of beta_coefficients: ", length(beta_coefficients)
    )
  }

  # --- Covariates: delegated to generate_X (which validates n, mu, sigma) --
  out <- generate_X(n = n, mu = mu, sigma = sigma)

  # --- Response: X %*% beta, flattened from an n x 1 matrix to a vector ----
  response <- as.vector(as.matrix(out) %*% beta_coefficients)

  # Append the response as the last column, named Y.
  dplyr::mutate(out, Y = response)
}
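# NOTE: the two lines below are embed-snippet boilerplate carried over from a
# web page, not R code; they are kept as comments so the file still parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.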