Nothing
#' Simulate Matrix with Metadata
#'
#' @description
#' Generates a matrix of random normal data, then optionally scales values
#' between 0 and 1 column-wise. It also creates corresponding data frames for feature
#' (column) and sample (row) metadata and can optionally introduce `NA` values
#' into a specified proportion of rows. A correlation between columns `rho`
#' (before scaling) can be added.
#'
#' @param n An integer specifying the number of rows (samples). Default is `100`.
#' @param p An integer specifying the number of columns (features). Default is `100`.
#' @param rho Columns correlation before scaling (compound symmetry). Default is `0.5`.
#' @param n_col_groups An integer for the number of groups to assign to features/columns. Default is `2`.
#' @param n_row_groups An integer for the number of groups to assign to samples/rows. Default is `1`.
#' @param perc_total_na Proportion of all cells to set to NA. Default is `0.1`.
#' @param perc_col_na Proportion of columns across which those NAs are spread. Default is `0.5`.
#' @param beta If `TRUE` (default) scale values between 0 and 1 column wise.
#'
#' @returns An object of class `slideimp_sim`. This is a list containing:
#' * `input`: A numeric matrix of dimension \eqn{n \times p} containing
#' the simulated values and injected `NA`s.
#' * `col_group`: A data frame with $p$ rows mapping each `feature`
#' to a `group`.
#' * `row_group`: A data frame with $n$ rows mapping each `sample`
#' to a `group`.
#'
#' @export
#'
#' @examples
#' set.seed(123)
#' sim_data <- sim_mat(n = 50, p = 10, rho = 0.5)
#' sim_data
sim_mat <- function(
n = 100,
p = 100,
rho = 0.5,
n_col_groups = 2,
n_row_groups = 1,
perc_total_na = 0.1,
perc_col_na = 0.5,
beta = TRUE
) {
checkmate::assert_int(n, lower = 2)
checkmate::assert_int(p, lower = 2)
checkmate::assert_number(rho, lower = 0, upper = 1 - .Machine$double.eps^2)
checkmate::assert_int(n_col_groups, lower = 1, upper = p)
checkmate::assert_int(n_row_groups, lower = 1, upper = n)
checkmate::assert_number(perc_total_na, lower = 0, upper = (n - 1) / n)
checkmate::assert_number(perc_col_na, lower = 0, upper = 1)
checkmate::assert_flag(beta)
# NA feasibility checks
n_total_na <- ceiling(perc_total_na * n * p)
n_cols_na <- max(1L, floor(perc_col_na * p))
max_per_col <- n - 1L
if (n_total_na > n_cols_na * max_per_col) {
needed <- ceiling(n_total_na / max_per_col)
cli::cli_abort(
c(
"{.arg perc_col_na} is too small for the requested {.arg perc_total_na}.",
"i" = "Requested {n_total_na} NA{cli::qty(n_total_na)}{?s} across {n_cols_na} column{cli::qty(n_cols_na)}{?s} (max {max_per_col} per column).",
"x" = "Need at least {needed} NA-bearing column{cli::qty(needed)}{?s}.",
">" = "Increase {.arg perc_col_na} so that at least {needed} of {p} columns can carry NAs (currently {n_cols_na}), or decrease {.arg perc_total_na}."
)
)
}
# correlated columns: var = rho + (1 - rho) = 1, cor = rho
if (rho > 0) {
common <- stats::rnorm(n) * sqrt(rho)
Z <- matrix(stats::rnorm(n * p), n, p) * sqrt(1 - rho)
d <- common %*% matrix(1, 1, p) + Z
} else {
d <- matrix(stats::rnorm(n * p), n, p)
}
# re scale
if (beta) {
mm <- col_min_max(d)
mins <- mm[1, ]
ranges <- mm[2, ] - mins
ranges[ranges == 0] <- 1
d <- sweep(d, 2, mins, "-")
d <- sweep(d, 2, ranges, "/")
}
# metadata
col_groups <- sample(
paste0("group", seq_len(n_col_groups)),
size = p,
replace = (n_col_groups < p)
)
col_group <- data.frame(
feature = paste0("feature", seq_len(p)),
group = col_groups
)
colnames(d) <- col_group$feature
rownames(d) <- paste0("sample", seq_len(n))
row_groups <- sample(
paste0("group", seq_len(n_row_groups)),
size = n,
replace = (n_row_groups < n)
)
row_group <- data.frame(
sample = rownames(d),
group = row_groups
)
# inject NAs: evenly divide across selected columns, cap at max_per_col
if (n_total_na > 0) {
cols_na <- sample.int(p, n_cols_na)
per_col <- floor(n_total_na / n_cols_na)
remainder <- n_total_na - per_col * n_cols_na
counts <- rep(per_col, n_cols_na) + c(rep(1L, remainder), rep(0L, n_cols_na - remainder))
row_idx <- unlist(lapply(counts, function(k) sample.int(n, k)))
col_idx <- rep(cols_na, times = counts)
d[cbind(row_idx, col_idx)] <- NA
}
structure(
list(
input = d,
col_group = col_group,
row_group = row_group,
n_col_groups = n_col_groups,
n_row_groups = n_row_groups
),
class = "slideimp_sim"
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.