Nothing
## ----message=FALSE, warning=FALSE, include=FALSE------------------------------
library(simdata)
library(fitdistrplus)
library(dplyr)
library(ggplot2)
library(patchwork)
library(ggcorrplot)
knitr::opts_chunk$set(
error = TRUE,
eval = requireNamespace("NHANES", quietly = TRUE)
)
## ----eval=TRUE, echo=TRUE, message=FALSE, warning=FALSE-----------------------
if (requireNamespace("NHANES", quietly = TRUE)) {
df = NHANES::NHANES %>%
filter(SurveyYr == "2011_12") %>%
select(Gender, Age, Race = Race1, Weight,
BMI, BPsys = BPSys1, BPdia = BPDia1) %>%
filter(complete.cases(.)) %>%
filter(Age > 18) %>%
mutate(Gender = if_else(Gender == "male", 1, 2),
Race = as.numeric(Race))
print(head(df))
} else {
message("Package 'NHANES' not available.")
}
## ----echo=TRUE----------------------------------------------------------------
cor_target = cor(df)
ggcorrplot::ggcorrplot(cor_target, lab = TRUE)
## ----echo=TRUE, results='hide', warning=FALSE---------------------------------
dist_auto = quantile_functions_from_data(df, n_small = 15)
## ----echo=TRUE, results='hide', warning=FALSE---------------------------------
dist = list()
# gender
dist[["Gender"]] = function(x) qbinom(x, size = 1, prob = 0.5)
# age
dens = stats::density(df$Age, cut = 1) # cut defines how to deal with boundaries
# integrate
int_dens = cbind(Age = dens$x, cdf = cumsum(dens$y))
# normalize to obtain cumulative distribution function
int_dens[, "cdf"] = int_dens[, "cdf"] / max(int_dens[, "cdf"])
# derive quantile function
# outside the defined domain retun minimum and maximum age, respectively
dist[["Age"]] = stats::approxfun(int_dens[, "cdf"], int_dens[, "Age"],
yleft = min(int_dens[, "Age"]),
yright = max(int_dens[, "Age"]))
# race
dist[["Race"]] = function(x)
cut(x, breaks = c(0, 0.112, 0.177, 0.253, 0.919, 1),
labels = 1:5)
# weight
fit = fitdistrplus::fitdist(as.numeric(df$Weight), "gamma")
summary(fit)
dist[["Weight"]] = function(x) qgamma(x, shape = 16.5031110, rate = 0.2015375)
# bmi
fit = fitdistrplus::fitdist(as.numeric(df$BMI), "lnorm")
summary(fit)
dist[["BMI"]] = function(x) qlnorm(x, meanlog = 3.3283118, sdlog = 0.2153347)
# systolic blood pressure
fit = fitdistrplus::fitdist(as.numeric(df$BPsys), "lnorm")
summary(fit)
dist[["BPsys"]] = function(x) qlnorm(x, meanlog = 4.796213, sdlog = 0.135271)
# diastolic blood pressure
fit = fitdistrplus::fitdist(as.numeric(df %>%
filter(BPdia > 0) %>%
pull(BPdia)), "norm")
summary(fit)
dist[["BPdia"]] = function(x) qnorm(x, mean = 71.75758, sd = 11.36352)
## -----------------------------------------------------------------------------
# use automated specification
dsgn_auto = simdata::simdesign_norta(cor_target_final = cor_target,
dist = dist_auto,
transform_initial = data.frame,
names_final = names(dist),
seed_initial = 1)
simdf_auto = simdata::simulate_data(dsgn_auto, nrow(df), seed = 2)
# use manual specification
dsgn = simdata::simdesign_norta(cor_target_final = cor_target,
dist = dist,
transform_initial = data.frame,
names_final = names(dist),
seed_initial = 1)
simdf = simdata::simulate_data(dsgn, nrow(df), seed = 2)
## ----message=FALSE, warning=FALSE---------------------------------------------
summary(df)
summary(simdf_auto)
summary(simdf)
## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
ggcorrplot::ggcorrplot(cor(df), title = "Original", lab = TRUE)
ggcorrplot::ggcorrplot(cor(simdf_auto), title = "Simulated (automated)", lab = TRUE)
ggcorrplot::ggcorrplot(cor(simdf), title = "Simulated (manual)", lab = TRUE)
## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
vars = c("Age", "Weight", "BMI", "BPsys", "BPdia")
limits = list(
Age = c(10, 90),
Weight = c(0, 225),
BMI = c(10, 80),
BPsys = c(65, 240),
BPdia = c(0, 140)
)
plist = list()
for (i in seq_along(vars)) for (j in seq_along(vars)) {
vari = vars[i]
varj = vars[j]
if (i == j) {
p = ggplot(df, aes_string(x = vari)) +
geom_density() +
geom_density(data = simdf, color = "red") +
coord_cartesian(xlim = limits[[vari]])
} else if (i < j) {
p = ggplot(df[1:1000, ], aes_string(x = vari, y = varj)) +
geom_point(alpha = 0.04) +
coord_cartesian(xlim = limits[[vari]], ylim = limits[[varj]])
} else {
p = ggplot(simdf_auto[1:1000, ], aes_string(x = varj, y = vari)) +
geom_point(color = "red", alpha = 0.04) +
coord_cartesian(xlim = limits[[varj]], ylim = limits[[vari]])
}
plist = append(plist, list(p + theme_bw(base_size = 7)))
}
p = patchwork::wrap_plots(plist) +
patchwork::plot_annotation(title = "Simulated (automated)")
print(p)
plist = list()
for (i in seq_along(vars)) for (j in seq_along(vars)) {
vari = vars[i]
varj = vars[j]
if (i == j) {
p = ggplot(df, aes_string(x = vari)) +
geom_density() +
geom_density(data = simdf, color = "red") +
coord_cartesian(xlim = limits[[vari]])
} else if (i < j) {
p = ggplot(df[1:1000, ], aes_string(x = vari, y = varj)) +
geom_point(alpha = 0.04) +
coord_cartesian(xlim = limits[[vari]], ylim = limits[[varj]])
} else {
p = ggplot(simdf[1:1000, ], aes_string(x = varj, y = vari)) +
geom_point(color = "red", alpha = 0.04) +
coord_cartesian(xlim = limits[[varj]], ylim = limits[[vari]])
}
plist = append(plist, list(p + theme_bw(base_size = 7)))
}
p = patchwork::wrap_plots(plist) +
patchwork::plot_annotation(title = "Simulated (manual)")
print(p)
## ----echo=FALSE---------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.