#' @title Methods of the class [`xy_sim`][Xy()]
#' @details With the help of these methods you can further manipulate a cooked
#' simulation.
#' * `print.xy_sim()`: Gives you an overview of the simulation
#' * `coef.xy_sim()`: Extracts the beta coeffecients of the simulation from
#' \eqn{y = X\beta + e}
#' * `plot.xy_sim()`: Will plot the true effects of the simulation, e.g. X vs y
#' * `transform.xy_sim()`: Will return the adjusted simulated data, i.e. it will
#' apply all nonlinear transformations to the raw simulated
#' effects and multiply the X by its beta coefficient.
#' This function is mostly used internally, however,
#' exposed to the user as it could be needed in edge
#' cases.
#' * `formula.xy_sim()`: Will return a formula object which can be forwarded
#' to the machine learning algorithm. Note: Uninformative
#' features are added as well.
#' @rdname xy_sim
#' @param x an object of class [`xy_sim`][Xy()]
#' @param object an object of class [`xy_sim`][Xy()]
#' @param _data an object of class [`xy_sim`][Xy()]
#' @param ... additional parameters
#' @import ggplot2 dplyr tibble
#' @importFrom crayon bold red green yellow italic
#' @importFrom stringr str_pad
#' @importFrom glue glue
#' @importFrom tidyr gather
#' @importFrom purrr map2
#' @importFrom rlang abort
#'
#' @examples
#' # create a simulation
#' linear_sim <- Xy() %>%
#' add_linear(p = 5) %>%
#' simulate(n = 100)
#'
#' # print the simulation
#' simulation_info <- linear_sim %>% print()
#'
#' # get the coefficients of the features
#' simulation_coefs <- linear_sim %>% coef()
#'
#' # plot the underlying true effect of X
#' simulation_plot <- linear_sim %>% plot()
#'
#' # transform the data of the simulation such that the features are transformed
#' # e.g. nonlinear features are scaled by their functions.
#' transformed_simulation <- linear_sim %>% transform()
#'
#' # fetch the formula
#' eqn <- linear_sim %>% formula()
#' @name xy_sim
#'
NULL
# PRINT -------------------------------------------------------------------
#' @export
#' @name xy_sim
print.xy_sim <- function(x, ...) {
# fetch the simulation recipe
book <- x$book
# define variable style
var_style <- bold$yellow
# check screen width
screen_width <- as.integer(getOption("width") / 3)
# extract quantity
n_vars <- book %>%
group_by(type) %>%
summarize(n = n(), .groups = "keep")
# extract noise
book_e <- book %>% filter(type == "noise")
if (nrow(book_e) == 0) {
print_e <- "e ~ ?(?)"
} else {
params <- book_e %>% pull(params)
fam_name <- book_e %>% pull(family)
print_e <- paste0(
"e ~ ", fam_name, "(",
params[[1]] %>%
paste0(names(.),
" = ",
.,
collapse = ", "
),
")"
)
}
# prepare target generating process
# generate the equation for the target generating process
tgp_main_effects <- function(x) {
# fetch data
data <- x$psi
# early exit if there are no main effects
if (sum(diag(x$psi)) == 0) {
return(character())
}
idx <- str_detect(colnames(data), "random|^y$|^e$")
col_names <- colnames(data)[!idx]
raw_weights <- diag(data)[!idx]
col_names[which(col_names == "intercept")] <- ""
weights <- ifelse(raw_weights < 0,
paste("{red(\"-\")}", paste("{red(", prettyNum(signif(abs(raw_weights), 2)), ")}")),
paste("+", prettyNum(signif(raw_weights, 2)))
)
out <- paste(weights, paste0("{\"", col_names, "\"}"))
out[seq(1, length(out), 3)] <- paste0(out[seq(1, length(out), 3)], "\n")
out <- paste0(out, collapse = "\t\t")
return(out)
}
tgp_interactions <- function(x) {
# fetch the interaction
data <- x$psi
# early exit if there are no interactions
if (!x$interactions) {
return(character())
}
# overwrite main effects
diag(data) <- 0
# fetch names
col_names <- colnames(data)
# fetch interactions
tgp_single_interaction <- function(i, m, k) {
# find interactions
interactions <- which(m[, i] != 0)
# exit if there are no interactions
if (length(interactions) == 0) {
return(character())
}
# prettify the weights
pretty_weights <- ifelse(m[interactions, i] < 0,
paste0("red(- ", signif(abs(m[interactions, i]), 2), ")"),
paste0("+ ", signif(m[interactions, i], 2))
)
# paste together weights (feature_1 * feature_2)
out <- paste0(pretty_weights,
"(",
"bold(, ", k[i], ") * bold(",
k[interactions],
"))",
sep = ""
) %>%
paste0(., collapse = " ")
return(out)
}
out <- sapply(seq_len(ncol(data)),
FUN = tgp_single_interaction,
m = data,
k = col_names
)
out[seq(1, length(out), 3)] <- paste0(out[seq(1, length(out), 3)], "\n")
out <- paste0(out, collapse = " ")
return(out)
}
# filter out random vars, noise and target
# the main effects
tgp_main <- tgp_main_effects(x = x)
# the interaction effects
tgp_interact <- tgp_interactions(x = x)
tgp_error <- paste0(" + {bold(print_e)}")
tgp_start <- "{bold(\"y\")} = "
tgp <- paste(tgp_start, tgp_main, tgp_interact, "\n", tgp_error)
# summary ----
cat(bold(paste0("Xy Simulation \n")))
cat(paste0("\n"))
cat(bold("recipe\n"))
cat(paste0(rep("\u25AC", screen_width), collapse = ""), "\n")
cat(paste0("\u2023 task ", var_style(x$task), "\n"))
interactions <- ifelse(x$interactions, green("\u2714"), red("\u2718"))
cat(paste(interactions, "interactions", "\n"))
# intercept
pre_int <- ifelse("intercept" %in% (x$book %>% pull(name)), green("\u2714"), red("\u2718"))
# print intercept
cat(paste(pre_int, "intercept", "\n"))
# effects -----
# extract maximum characters for console output
nchar_max_var <- nchar("uninformative")
# linear
lin <- n_vars %>%
filter(type == "linear") %>%
pull(n)
pre_lin <- ifelse(lin == 0, red("\u2718"), green("\u2714"))
# print lin
cat(paste(pre_lin, str_pad("linear", nchar_max_var, side = "right"), var_style(lin), "\n"))
# non linear
nonlin <- n_vars %>%
filter(type == "nonlinear") %>%
pull(n)
pre_nonlin <- ifelse(nonlin == 0, red("\u2718"), green("\u2714"))
# print nonlinear
cat(paste(pre_lin, str_pad("nonlinear", nchar_max_var, side = "right"), var_style(nonlin), "\n"))
# discrete
discr <- n_vars %>%
filter(type == "discrete") %>%
pull(n)
pre_discr <- ifelse(nonlin == 0, red("\u2718"), green("\u2714"))
# print discrete
cat(paste(pre_discr, str_pad("discrete", nchar_max_var, side = "right"), var_style(discr), "\n"))
# uninformative
rand_vars <- n_vars %>%
filter(type == "random") %>%
pull(n)
pre_rand_vars <- ifelse(rand_vars == 0, red("\u2718"), green("\u2714"))
# print uninformative
cat(paste(pre_rand_vars, "uninformative", var_style(rand_vars), "\n"))
# print noise
cat(paste(green("\u2714"), str_pad("noise", nchar_max_var, side = "right"), var_style(print_e), "\n\n"))
# simulation
max_sim_chars <- nchar("correlation interval")
cat(bold("simulation\n"))
cat(paste(rep("\u25AC", screen_width), collapse = ""), "\n")
cat(paste("\u2023", str_pad("n", max_sim_chars, side = "right"), var_style(nrow(x$data)), "\n"))
cat(paste("\u2023", str_pad("r-squared", max_sim_chars, side = "right"), var_style(x$r_sq), "\n"))
cat(paste("\u2023 correlation interval", var_style(paste0("[", min(x$cor), ", ", max(x$cor), "]")), "\n"))
cat(paste("\u2023", str_pad("noise", max_sim_chars, side = "right"), var_style(print_e), "\n"))
# data generating process ----
cat("\n")
cat(bold("Target generating process:"), "\n")
# check if output is printable
# do not print if there are more than 50 variables
if (x$book %>% nrow() > 50) {
cat(italic("Too many variables output omitted. (use object %>% coef())"))
} else {
cat(glue(tgp))
}
cat("\n")
return(invisible())
}
# COEF --------------------------------------------------------------------
#' @export
#' @name xy_sim
coef.xy_sim <- function(object, ...) {
effects <- !grepl("^y|^e", colnames(object$psi))
out <- diag(object$psi)[effects]
names(out) <- colnames(object$psi)[effects]
return(out)
}
# PLOT --------------------------------------------------------------------
#' @name xy_sim
#' @export
plot.xy_sim <- function(x, ...) {
# get data
plt_df <- x %>%
pull_xy() %>%
select(-matches("intercept|xd|random")) %>%
tidyr::gather(key = "effect", value = "value", -y)
effects_plt <- ggplot(plt_df, aes_string(x = "value", y = "y"))
if (nrow(x$data) > 1000) {
effects_plt <- effects_plt + geom_hex(colour = "#13235B")
} else {
effects_plt <- effects_plt + geom_point(colour = "#13235B")
}
effects_plt <- effects_plt +
facet_wrap(formula(paste0("~ effect")), scales = "free") +
theme_minimal(base_size = 14) +
xlab("") +
scale_y_continuous(labels = function(x) format(x, scientific = TRUE)) +
ggtitle("True effects X vs y")
if (nrow(x$data) <= 1000) {
effects_plt <- effects_plt +
geom_smooth(formula = y ~ s(x, bs = "tp"), method = "gam", colour = "#00C792")
}
print(effects_plt)
return(effects_plt)
}
# TRANSFORM ---------------------------------------------------------------
#' @export
#' @name xy_sim
transform.xy_sim <- function(`_data`, ...) {
# get data
sim_mat <- `_data` %>% pull_xye()
# fetch nonlinear functions
nlfuns <- `_data`$book %>%
pull(nlfun)
# transform nonlinear
nl_mat <- sim_mat %>%
purrr::map2(.x = ., .y = nlfuns, .f = ~ .y(.x)) %>%
as.data.frame() %>%
as.matrix()
# add weights
out <- nl_mat %*% `_data`$psi
return(out)
}
# EQUATION ----------------------------------------------------------------
#' @export
#' @name xy_sim
formula.xy_sim <- function(x, ...) {
# fetch the data
data <- x$data
# create a formula object
features <- colnames(data)
# omit target, noise and intercept
features <- features[!stringr::str_detect("^y$|intercept|^e$", features)]
# handle nonlinear features
features <- stringr::str_replace(features, "(^f.*)", "`\\1`")
# fix intercept term
features <- c(ifelse("intercept" %in% colnames(data), "1", "-1"), features)
# build equation
out <- formula(paste0("y ~ ", paste0(features, collapse = " + ")))
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.