# helpers for dataset-generation
# load needed packages
library(tibble)
library(rlang)
library(purrr)
library(dplyr)
library(tidyr)
# Generate an index table
#
# Parameters:
#
# + nrow: The length of the signals to generate (number of rows the final
# signal-table should have).
# + param: ncol Number of signals to generate.
# + max_subsig: The maximum number of subsignals the generate signals may
# contain (from 1 to `max_subsig`).
#
# Return: A tibble with the following structure (all integer columns):
#
# + col: The number of the generated signal.
# + signal_nr: The number of the subsignal.
# + start: The starting indices of the corresponding subsignal.
# + end: The ending indices of the corresponding subsignal.
#
# NOTE: every row is a subsignal
gen_sim_ind <- function(nrow, ncol, max_subsig) {
stopifnot(is.integer(nrow), is.integer(ncol), is.integer(max_subsig))
# to choose number of subsignals in signal randomly
choices_n_subsig <- seq_len(max_subsig)
# subsignal positions (both start and end) to sample from
choices_subsig_ind <- seq_len(nrow)
map_dfr(seq_len(ncol - 1), ~{
# sample number of subsignals
n_subsig <- sample(choices_n_subsig, 1)
# odd positions of `indices` represent starting indices, even ones stand
# for ending indices of the subsignal; keep sampling until all differences
# of the indices are greater than 1 (ensures subsignals have at least one 0
# between them)
repeat({
indices <- sort(sample(choices_subsig_ind, 2 * n_subsig, replace = FALSE))
if (all(diff(indices) > 1)) {
break
}
})
is_start_ind <- as.logical((seq_along(indices) %% 2) == 1)
tibble(
col = .x, signal_nr = seq_len(n_subsig), start = indices[is_start_ind],
end = indices[!is_start_ind]
)
}) %>%
# ensure the range of [1, nrow] (otherwise it's not guaranteed that 1 is
# the minimum of starting indices and `nrow` likewise not the maximum)
add_row(col = ncol, signal_nr = 1L, start = 1L, end = nrow)
}
# Generate simulated signals
#
# Use the output of gen_sim_ind() and add the signal-column (list column with
# `time` and `value`-columns).
#
# Parameters:
# + ind_tbl: The output of a call to gen_sim_ind() (index-table).
# + `...`: The elements of the ellipsis are used as quosures, which generate
# the values of the subsignals (=> generators-quosures).
# + pp_fun: Function for post-processing (is applied to the unaltered generated
# subsignal).
# + weights: A numeric vector for sampling probabilities (for the
# generator-quosures).
#
# The following symbols are masked (for generator-quosures):
#
# + n: The length of the subsignal to generate.
# + x: Equivalent to `seq_len(n)` - can be seen as a simulated "time".
# + c_x: `x` centered around `0`.
# + coefficient functions: r_coef() and rs_coef() - are used to generate
# randomized coefficients from a specified interval. These functions are
# defined below.
#
# Return: The signal-column gets added to the index-table.
gen_sim_sig <- function(ind_tbl, ..., pp_fun = function(x) x, weights = NULL) {
stopifnot(is_tibble(ind_tbl))
generator_quos <- enquos(...)
stopifnot(length(generator_quos) != 0)
# to choose a random generator
choices_gen <- seq_along(generator_quos)
# masking environment for tidy evalutation
mask <- new_environment(coef_funs)
if (missing(weights)) {
# assume the user wanted equal probabilities for a generator to be chosen
weights <- rep(1, length(generator_quos))
}
stopifnot(is.numeric(weights), length(weights) == length(generator_quos))
# check integrity of start and end-column
stopifnot(is.integer(ind_tbl$start))
stopifnot(is.integer(ind_tbl$end))
# add signal-column (list column of tibbles)
mutate(ind_tbl, signal = map2(start, end, ~{
time <- .x:.y # time column in resulting signal-tibble
subsig_len <- length(time) # length of the subsignal
mask$n <- subsig_len
mask$x <- seq_along(time)
mask$c_x <- mask$x - round(subsig_len / 2, 0) # center x around 0
# choose random generator-quosure
generator_quo <- generator_quos[[sample(choices_gen, 1, prob = weights)]]
# generate signal-values
sig_val <- eval_tidy(generator_quo, data = mask)
stopifnot(is.numeric(sig_val))
stopifnot(length(sig_val) == length(time))
# apply post-processing function to generated signal-values
sig_val <- pp_fun(sig_val)
# make sure no elements are near 0 (add 1e-4 to it); gets done to ensure,
# that a subsignal doesen't get split if elements too close to 0 are
# contained (e.g. when processed by mbte_extract_subsignals())
repeat({
near_0 <- near(sig_val, 0, tol = 1e-5)
if (!any(near_0)) {
break
}
sig_val[near_0] <- sig_val[near_0] + 1e-4
})
# return signal-tibble
tibble(time = time, value = sig_val)
}))
}
# A wrapper around gen_sim_sig() with predefined trends (linear, exponential,
# logarithmic, sigmoid, noise)
gen_sim_sig_default <- function(ind_tbl) {
gen_sim_sig(
ind_tbl,
rs_coef(2) * x,
rs_coef(3.5) * exp(r_coef(0.1, 0.4) * x) + r_coef(0, 1.5),
rs_coef(3) * log(1 + x),
r_coef(1, 4) / (1 + exp(-rs_coef(2) * c_x)), # sigmoid
r_coef(2, 4) * rnorm(n), # noise with high intensity
weights = c(2, 3, 1, 3, 3),
pp_fun = function(x) {
# add variable amount of noise to generated subsignal
noise_amount <- sample(seq(0.1, 0.8, 0.1), 1)
x <- x + noise_amount * rnorm(length(x), sd = sd(x))
# ensure that all elements of `x` are positive
if (any(x <= 0)) {
x - min(x) + 0.1
} else {
x
}
}
)
}
# Generate actual raw dataset
#
# Use input from gen_sim_sig() or gen_sim_sig_default() as input.
#
# Parameters:
#
# + sig_tbl: The output of gen_sim_sig().
# + long: A logical indicating if the resulting table should be returned
# in long- (if set to `TRUE`) or wide format (if set to `FALSE`).
# + mv_prefix: The prefix used for a simulated measurement variable/parameter
# (assuming the resulting dataset should miminc the data obtained from
# measurements).
gen_sim_raw <- function(sig_tbl, long = TRUE, mv_prefix = "mv") {
stopifnot(
is_tibble(sig_tbl),
is_scalar_logical(long),
is_scalar_character(mv_prefix) && nzchar(mv_prefix) != 0
)
# assert integrity of needed columns
stopifnot(is.integer(sig_tbl$col))
stopifnot(is.list(sig_tbl$signal))
# the correct numerical order (to avoid lexical column-ordering like
# "mv1", "mv10", "mv2")
correct_col_order <- paste0(mv_prefix, sort(sig_tbl$col))
# NOTE: the following dplyr::spread()-call is needed in any case, even if
# tidyr::gather() may be called later (if long is `TRUE`). The spread()-call
# adds the desired 0-padding to subsignals.
raw_tbl <- sig_tbl %>%
unnest(signal) %>%
# only keep relevant variables
select(time, col, value) %>%
# add prefix to column number
mutate(col = paste0(mv_prefix, col)) %>%
spread(col, value, fill = 0) %>%
# put column in the right order (e.g. | time | mv1 | mv2 | ... | mv10 |)
select(time, !!!correct_col_order)
if (long) {
gather(raw_tbl, !!mv_prefix, "value", !!!correct_col_order)
} else {
raw_tbl
}
}
#### coefficient-generation functions ####
# Helper-functions for gen_sim_sig() to generate randomized coefficients.
# Generate a random coefficient withing the range [min, max].
# NOTE: the coefficient gets sampled from the speciefied interval with 50
# subdivisions
r_coef <- function(min, max) {
sample(seq(min, max, length.out = 50L), 1L)
}
# Similar to r_coef(), but sample from the symmetric interval
# [-value, value].
rs_coef <- function(value) {
abs_value <- abs(value)
r_coef(-abs_value, abs_value)
}
coef_funs <- list(
r_coef = r_coef,
rs_coef = rs_coef
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.