Nothing
## ----message=FALSE, warning=FALSE, include=FALSE------------------------------
library(simdata)
library(ggplot2)
library(reshape2)
library(GGally)
library(knitr)
library(doParallel)
library(doRNG)
## ----message=FALSE, warning=FALSE---------------------------------------------
correlation_matrix = diag(1, nrow = 5)
sim_design = simdata::simdesign_mvtnorm(relations = correlation_matrix,
transform_initial = data.frame,
prefix_final = "variable")
sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)
knitr::kable(head(sim_data))
p = GGally::ggpairs(sim_data,
upper = list(continuous = "points"),
progress = FALSE) +
theme_bw()
print(p)
## ----message=TRUE, warning=TRUE-----------------------------------------------
correlation_matrix = diag(1, nrow = 5)
transformation = simdata::function_list(
"v1" = function(x) x[, 1],
"v2*2" = function(x) x[, 2] * 2,
"v3^2" = function(x) x[, 3]^2,
"v4+v5" = function(x) x[, 4] + x[, 5],
check.names = FALSE # pass columnnames exactly as specified here
)
sim_design = simdata::simdesign_mvtnorm(
relations = correlation_matrix,
transform_initial = transformation,
prefix_final = NULL
)
sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)
knitr::kable(head(sim_data))
p = GGally::ggpairs(sim_data, upper = list(continuous = "points"),
progress = FALSE) +
theme_bw()
print(p)
## -----------------------------------------------------------------------------
correlation_matrix = simdata::cor_from_upper(
15,
rbind(c(1,2,0.8), c(1,9,0.3),
c(3,5,0.3), c(3,9,-0.5),
c(4,6,-0.5), c(4,7,-0.3),
c(5,6,-0.3), c(5,12,0.5),
c(6,7,0.5), c(6,11,0.5), c(6,14,0.3),
c(7,11,0.3), c(7,14,0.3),
c(8,9,-0.3), c(8,11,0.3),
c(11,14,0.5)))
transformation = simdata::function_list(
v1 = function(z) floor(10 * z[,1] + 55),
v2 = function(z) z[,2] < 0.6,
v3 = function(z) exp(0.4 * z[,3] + 3),
v4 = function(z) z[,4] >= -1.2,
v5 = function(z) z[,4] >= 0.75,
v6 = function(z) exp(0.5 * z[,5] + 1.5),
v7 = function(z) floor(pmax(0, 100 * exp(z[,6]) - 20)),
v8 = function(z) floor(pmax(0, 80 * exp(z[,7]) - 20)),
v9 = function(z) z[,8] < -0.35,
v10 = function(z) (z[,9] >= 0.5) & (z[,9] < 1.5),
v11 = function(z) z[,9] >= 1.5,
v12 = function(z) 0.01*floor(100 * (z[,10] + 4)^2),
v13 = function(z) floor(10 * z[,11] + 55),
v14 = function(z) floor(10 * z[,12] + 55),
v15 = function(z) floor(10 * z[,13] + 55),
v16 = function(z) z[,14] < 0,
v17 = function(z) z[,15] < 0
)
sim_design = simdata::simdesign_mvtnorm(
relations = correlation_matrix,
transform_initial = transformation,
prefix_final = NULL
)
## ----echo=FALSE---------------------------------------------------------------
plotdf = melt(sim_design$cor_initial)
ggplot(plotdf, aes(x = factor(Var1), y = factor(Var2), fill = value)) +
geom_tile() +
geom_text(aes(label = value)) +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev) +
xlab(expr(paste("Initial variable ", z[i]))) +
ylab(expr(paste("Initial variable ", z[i]))) +
coord_equal() +
scale_fill_gradient2(
"Correlation",
limits = c(-1, 1),
low = "#377eb8", mid = "white", high = "#e41a1c") +
theme_bw()
## -----------------------------------------------------------------------------
simdata::plot_cor_network(sim_design, seed = 1)
## -----------------------------------------------------------------------------
sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)
## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
numeric_vars = names(sim_design$types_final)[sim_design$types_final == "numeric"]
plotdf = melt(sim_data[, numeric_vars], measure.vars = numeric_vars)
p = ggplot(plotdf, aes(x = "", y = value)) +
geom_violin() +
geom_dotplot(binaxis = "y", stackdir = "center") +
facet_wrap(~ variable, scales = "free", nrow = 3) +
theme_bw() +
ylab("Value") +
theme(axis.title.x = element_blank())
print(p)
discrete_vars = names(sim_design$types_final)[sim_design$types_final != "numeric"]
plotdf = melt(sim_data[, discrete_vars], measure.vars = discrete_vars)
p = ggplot(plotdf, aes(x = variable, fill = value, color = value)) +
geom_bar(alpha = 0.5, size = 1) +
scale_fill_brewer(palette = "Set1") +
scale_color_brewer(palette = "Set1") +
ylab("Proportion of total samplesize (%)") +
theme_bw() +
theme(axis.title.x = element_blank())
print(p)
## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
plotdf = cor(sim_data, method = "s", use = "p")
plotdf = melt(plotdf)
ggplot(plotdf, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
geom_text(aes(label = round(value, 2)), size = 2) +
scale_fill_gradient2(
"Correlation",
limits = c(-1, 1),
low = "#377eb8", mid = "white", high = "#e41a1c") +
coord_equal() +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev) +
xlab(expr(paste("Final variable ", v[i]))) +
ylab(expr(paste("Final variable ", v[i]))) +
theme_bw() +
theme(text = element_text(size = 10))
## ----message=FALSE, warning=FALSE---------------------------------------------
# draw full network
simdata::plot_estimated_cor_network(sim_design,
cor_cutoff = NULL,
edge_label_function = NULL,
edge_width_function = function(x) x*25,
use_edge_weights = TRUE,
edge.color = "clipped-ramp",
seed = 2321673)
## ----message=FALSE, warning=FALSE---------------------------------------------
# simplify by using cor_cutoff
simdata::plot_estimated_cor_network(sim_design,
seed = 2)
# set correlation type
simdata::plot_estimated_cor_network(sim_design,
cor_type = "spearman",
seed = 2321673)
# set various parameters
simdata::plot_estimated_cor_network(sim_design, seed = 2321673,
edge.color = "red-blue",
axes = TRUE, cor_type = "s",
edge_width_function = function(x) var(x)*200,
show_categorical = FALSE,
mar = c(2, 2, 0, 0))
## -----------------------------------------------------------------------------
sim_design$process_final = list(
process_truncate_by_iqr = list(
truncate_multipliers = c(v6 = 2, v7 = 2, v8 = 2)
)
)
sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)
## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
numeric_vars = names(sim_design$types_final)[sim_design$types_final == "numeric"]
plotdf = melt(sim_data[, numeric_vars])
p = ggplot(plotdf, aes(x = "", y = value)) +
geom_violin() +
geom_dotplot(binaxis = "y", stackdir = "center") +
facet_wrap(~ variable, scales = "free", nrow = 3) +
theme_bw() +
ylab("Value") +
theme(axis.title.x = element_blank())
print(p)
## -----------------------------------------------------------------------------
sim_design$process_final = list(
process_truncate_by_threshold = list(
truncate_upper = c(v8 = 200, v7 = 300),
truncate_lower = c(v6 = 2)
)
)
sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)
## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
numeric_vars = names(sim_design$types_final)[sim_design$types_final == "numeric"]
plotdf = melt(sim_data[, numeric_vars])
ggplot(plotdf, aes(x = "", y = value)) +
geom_violin() +
geom_dotplot(binaxis = "y", stackdir = "center") +
facet_wrap(~ variable, scales = "free", nrow = 3) +
theme_bw() +
ylab("Value") +
theme(axis.title.x = element_blank())
## -----------------------------------------------------------------------------
sim_design$process_final = list(
process_truncate_by_iqr = list(
truncate_multipliers = c(v6 = 2, v7 = 2, v8 = 2)
),
scale = list(),
data.frame = list()
)
sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)
## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
numeric_vars = names(sim_design$types_final)[sim_design$types_final == "numeric"]
plotdf = melt(sim_data[, numeric_vars])
ggplot(plotdf, aes(x = "", y = value)) +
geom_violin() +
geom_dotplot(binaxis = "y", stackdir = "center") +
facet_wrap(~ variable, scales = "free", nrow = 3) +
theme_bw() +
ylab("Value") +
theme(axis.title.x = element_blank())
## -----------------------------------------------------------------------------
correlation_matrix = diag(1, nrow = 2)
transformation = simdata::function_list("v1" = function(x) x[, 1],
"v1*2" = function(x) x[, 1] * 2,
check.names = FALSE)
sim_design = simdata::simdesign_mvtnorm(
relations = correlation_matrix,
transform_initial = transformation,
prefix_final = NULL
)
# ignoring the collinearity
sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 2)
knitr::kable(cor(sim_data))
# rejecting collinear matrices
sim_data = simdata::simulate_data_conditional(sim_design,
n_obs = 100, seed = 2,
reject = is_collinear,
reject_max_iter = 3,
return_tries = TRUE)
sim_data
## -----------------------------------------------------------------------------
correlation_matrix = diag(1, nrow = 3)
transformation = simdata::function_list(
"v1" = function(x) x[, 1],
"might_be_collinear" = function(x) {
if (rbinom(1, 1, 0.5)) {
return(x[, 1] * 2)
} else return(x[, 2])
},
"might_be_constant" = function(x) {
if (rbinom(1, 1, 0.5)) {
return(0)
} else return(x[, 3])
},
check.names = FALSE)
sim_design = simdata::simdesign_mvtnorm(
relations = correlation_matrix,
transform_initial = transformation,
prefix_final = NULL
)
sim_data = simdata::simulate_data_conditional(
sim_design,
n_obs = 100, seed = 3,
reject = simdata::function_list(is_collinear,
contains_constant),
reject_max_iter = 3,
return_tries = TRUE)
sprintf("Number of tries: %d", sim_data[[2]])
knitr::kable(head(sim_data[[1]]))
## -----------------------------------------------------------------------------
binomial_simdesign <- function(size = 1, prob = 0.5, ...) {
# define generator function
# make sure it returns a two-dimensional array
generator = function(n) matrix(rbinom(n, size = size, prob = prob), ncol = 1)
# setup simdesign object
# make sure to pass generator function and ...
# all other information passed is optional
dsgn = simdata::simdesign(
generator = generator,
size = size,
prob = prob,
...
)
# extend the class attribute
class(dsgn) = c("binomial_simdesign", class(dsgn))
# return the object
dsgn
}
## -----------------------------------------------------------------------------
sim_design = binomial_simdesign(size = 1, prob = 0.7)
sim_data = simdata::simulate_data(sim_design, 100, seed = 1)
knitr::kable(table(sim_data))
## -----------------------------------------------------------------------------
realdata_simdesign <- function(dataset, ...) {
# define generator function
# make sure it returns a two-dimensional array
generator = function(n) dataset[sample(1:nrow(dataset), n, replace = TRUE), ,
drop = FALSE]
# setup simdesign object
# make sure to pass generator function and ...
# all other information passed is optional
dsgn = simdata::simdesign(
generator = generator,
dataset = dataset,
...
)
# extend the class attribute
class(dsgn) = c("realdata_simdesign", class(dsgn))
# return the object
dsgn
}
## -----------------------------------------------------------------------------
data(iris)
sim_design = realdata_simdesign(iris, prefix_final = NULL)
sim_data = simdata::simulate_data(sim_design, 100, seed = 1)
knitr::kable(head(sim_data))
## -----------------------------------------------------------------------------
correlation_matrix = simdata::cor_from_upper(
100,
entries = rbind(
expand.grid(1:30, 1:30, 0.5),
expand.grid(31:50, 31:50, 0.2))
)
# create list of transformation functions programmatically
# For the first 60 variables:
# odd varibles will be translated
# even variables will be scaled
transformation = list()
for (i in 1:60) {
if (i %% 2) {
transformation[[i]] = substitute(function(x) x[, i] * 5, list(i = i))
} else transformation[[i]] = substitute(function(x) x[, i] - 10, list(i = i))
}
# the remaining are returned as they are
transformation[[61]] = function(x) x[, 61:100]
# construct single transformation function from the list
transformation = simdata::as_function_list(transformation)
# create simulation design
sim_design = simdata::simdesign_mvtnorm(
relations = correlation_matrix,
transform_initial = transformation
)
## ----echo=FALSE---------------------------------------------------------------
plotdf = melt(sim_design$cor_initial)
ggplot(plotdf, aes(x = factor(Var1), y = factor(Var2), fill = value)) +
geom_tile() +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev) +
xlab(expr(paste("Initial variable ", z[i]))) +
ylab(expr(paste("Initial variable ", z[i]))) +
coord_equal() +
scale_fill_gradient2(
"Correlation",
limits = c(-1, 1),
low = "#377eb8", mid = "white", high = "#e41a1c") +
theme_bw() +
theme(axis.text = element_blank())
## -----------------------------------------------------------------------------
simdata::plot_cor_network(sim_design, seed = 2,
vertex_labels = NA,
edge_label_function = NULL,
edge_width_function = function(x) 0.01,
edge_weight_function = function(x) 0.25 * x,
use_edge_weights = TRUE,
edge.color = "clipped-ramp",
vertex.size = 3)
## -----------------------------------------------------------------------------
sim_data = simdata::simulate_data(sim_design, n_obs = 50, seed = 5)
## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
ggplot(melt(sim_data), aes(x = value, color = variable)) +
geom_density() +
scale_color_viridis_d(option = "plasma") +
guides(color = "none") +
xlab("Value") + ylab("Density") +
theme_bw()
## ----eval=FALSE, include=TRUE-------------------------------------------------
# correlation_matrix = diag(1, nrow = 5)
# transformation = simdata::function_list(
# "v1" = function(x) x[, 1],
# "v2*2" = function(x) x[, 2] * 2,
# "v3^2" = function(x) x[, 3]^2,
# "v4+v5" = function(x) x[, 4] + x[, 5],
# check.names = FALSE # pass columnnames exactly as specified here
# )
#
# sim_design = simdata::simdesign_mvtnorm(
# relations = correlation_matrix,
# transform_initial = transformation,
# prefix_final = NULL
# )
#
# # parallelisation
# cl = parallel::makeCluster(1)
# doParallel::registerDoParallel(cl)
# res = foreach(
# i = 1:10,
# .packages = c("simdata"),
# .options.RNG = 1 # note that the seed is passed here
# ) %dorng% {
# simulate_data(sim_design, n_obs = 100)
#
# # do some task with the simulated data
# }
# parallel::stopCluster(cl)
#
# knitr::kable(purrr::map(res[1:2], summary))
## -----------------------------------------------------------------------------
# parameter
mult = 2
transformation = simdata::function_list(
"v1" = function(x) x[, 1] * mult # dangerous, depends on global variable!
)
sim_design = simdata::simdesign_mvtnorm(
relations = diag(1),
transform_initial = transformation
)
sample1 = simdata::simulate_data(sim_design, n_obs = 5, seed = 25897165)
# change value of global variable
mult = 4
sample2 = simdata::simulate_data(sim_design, n_obs = 5, seed = 25897165)
# note the different values in both columns
knitr::kable(cbind(sample1, sample2))
## -----------------------------------------------------------------------------
# parameter
mult = 2
transformation = simdata::function_list(
# specify function as partial
"v1" = simdata::partial(function(x, mult) x[, 1] * mult, mult = mult)
)
sim_design = simdata::simdesign_mvtnorm(
relations = diag(1),
transform_initial = transformation
)
sample1 = simdata::simulate_data(sim_design, n_obs = 5, seed = 25897165)
# change value of global variable
mult = 4
sample2 = simdata::simulate_data(sim_design, n_obs = 5, seed = 25897165)
# note both columns are equal now, as expected
knitr::kable(cbind(sample1, sample2))
## ----echo=FALSE---------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.