## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.align = "center",
  fig.width = 7,
  fig.height = 6,
  cache = TRUE
)
## -----------------------------------------------------------------------------
library(catalytic)
set.seed(1)
n <- 20 # Number of observations
p <- 5 # Number of predictors
obs_x <- matrix(rnorm(n * (p - 1)), ncol = (p - 1)) # Observation covariates
true_coefs <- rnorm(p) # True coefficients (intercept and slopes)
noise <- rnorm(n) # Gaussian noise added to the response
obs_y <- true_coefs[1] + obs_x %*% true_coefs[-1] + noise # Observation response
obs_data <- as.data.frame(cbind(obs_x, obs_y))
names(obs_data) <- c(paste0("X", 1:(p - 1)), "Y")
# Separate the observation data into training and test sets
train_idx <- sample(n, 10)
train_data <- obs_data[train_idx, ]
test_data <- obs_data[-train_idx, ]
print(dim(train_data))
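## -----------------------------------------------------------------------------
# (Added check, using only objects defined above) The held-out rows form the
# test set, so the two subsets partition the 20 simulated observations.
print(dim(test_data))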
## -----------------------------------------------------------------------------
# Fit a linear regression model (GLM with gaussian family)
glm_model <- stats::glm(
  formula = Y ~ .,
  family = gaussian,
  data = train_data
)

predicted_y <- predict(
  glm_model,
  newdata = test_data
)

cat(
  "MLE GLM gaussian Model - Mean Square Error (Data):",
  mean((predicted_y - test_data$Y)^2)
)
cat(
  "\nMLE GLM gaussian Model - Sum Square Error (Coefficients):",
  sum((coef(glm_model) - true_coefs)^2)
)
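## -----------------------------------------------------------------------------
# (Added baseline, using only base R and objects defined above) Predicting every
# test response with the training-set mean of Y gives a naive reference point
# for the test MSE reported above; with only 10 training rows and 5 coefficients,
# the MLE fit can struggle to beat it.
baseline_predicted_y <- rep(mean(train_data$Y), nrow(test_data))
cat(
  "Intercept-only baseline - Mean Square Error (Data):",
  mean((baseline_predicted_y - test_data$Y)^2)
)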
## -----------------------------------------------------------------------------
plot(test_data$Y,
  predicted_y,
  main = "Scatter Plot of True Y vs Predicted Y",
  xlab = "True Y",
  ylab = "Predicted Y",
  pch = 19,
  col = "blue"
)
# Add a 45-degree line for reference
abline(a = 0, b = 1, col = "red", lwd = 2)
## -----------------------------------------------------------------------------
cat_init <- cat_glm_initialization(
  formula = Y ~ 1,
  family = gaussian,
  data = train_data,
  syn_size = 50,
  custom_variance = NULL,
  gaussian_known_variance = FALSE,
  x_degree = NULL,
  resample_only = FALSE,
  na_replace = stats::na.omit
)
cat_init
## -----------------------------------------------------------------------------
names(cat_init)
## -----------------------------------------------------------------------------
# The number of observations (rows) in the original dataset (`obs_data`)
cat_init$obs_size
# Information describing how the synthetic covariates were resampled
cat_init$syn_x_resample_inform
## -----------------------------------------------------------------------------
cat_glm_model <- cat_glm(
  formula = Y ~ ., # Same as `~ .`
  cat_init = cat_init, # Output object from `cat_glm_initialization`
  tau = 10 # Defaults to the number of predictors / 4
)
cat_glm_model
## -----------------------------------------------------------------------------
cat_glm_predicted_y <- predict(
  cat_glm_model,
  newdata = test_data
)

cat(
  "Catalytic `cat_glm` - Mean Square Error (Data):",
  mean((cat_glm_predicted_y - test_data$Y)^2)
)
cat(
  "\nCatalytic `cat_glm` - Sum Square Error (Coefficients):",
  sum((coef(cat_glm_model) - true_coefs)^2)
)
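## -----------------------------------------------------------------------------
# (Added comparison, using predictions computed above) Put the plain MLE fit and
# the catalytic fit side by side on the held-out data; lower test MSE is better
# on this particular train/test split.
data.frame(
  model = c("MLE GLM", "cat_glm (tau = 10)"),
  test_mse = c(
    mean((predicted_y - test_data$Y)^2),
    mean((cat_glm_predicted_y - test_data$Y)^2)
  )
)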
## -----------------------------------------------------------------------------
plot(test_data$Y,
  cat_glm_predicted_y,
  main = "Scatter Plot of True Y vs Predicted Y (cat_glm)",
  xlab = "True Y",
  ylab = "Predicted Y (cat_glm)",
  pch = 19,
  col = "blue"
)
# Add a 45-degree line for reference
abline(a = 0, b = 1, col = "red", lwd = 2)
## -----------------------------------------------------------------------------
names(cat_glm_model)
## -----------------------------------------------------------------------------
# The formula used for modeling
cat_glm_model$formula
# The estimated coefficients from the GLM fitted via `stats::glm` with `tau`
cat_glm_model$coefficients
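## -----------------------------------------------------------------------------
# (Added comparison, using accessors already shown in this vignette) Line up the
# true coefficients with the MLE and catalytic estimates; with such a small
# training set the catalytic estimates are pulled towards the synthetic-data fit.
round(cbind(
  true = true_coefs,
  mle_glm = coef(glm_model),
  cat_glm = coef(cat_glm_model)
), 3)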
## -----------------------------------------------------------------------------
cat_glm_tune_cv <- cat_glm_tune(
  formula = Y ~ ., # Same as `~ .`
  cat_init = cat_init,
  risk_estimate_method = "cross_validation", # Default: auto-selected based on the data size
  discrepancy_method = "mean_square_error", # Default: auto-selected based on the family
  tau_seq = seq(0, 5, 0.5), # Default: a numeric sequence around the number of predictors / 4
  cross_validation_fold_num = 2 # Default: 5
)
cat_glm_tune_cv
## -----------------------------------------------------------------------------
plot(cat_glm_tune_cv, legend_pos = "topright", text_pos = 3)
## -----------------------------------------------------------------------------
cat_glm_tune_boots <- cat_glm_tune(
  formula = ~., # Same as `Y ~ .`
  cat_init = cat_init,
  risk_estimate_method = "parametric_bootstrap", # Default: auto-selected based on the data size
  discrepancy_method = "mean_square_error", # Default: auto-selected based on the family
  tau_0 = 2, # Default: 1
  parametric_bootstrap_iteration_times = 5 # Default: 100
)
cat_glm_tune_boots
## -----------------------------------------------------------------------------
cat_glm_tune_mallowian <- cat_glm_tune(
  formula = ~., # Same as `Y ~ .`
  cat_init = cat_init,
  risk_estimate_method = "mallowian_estimate", # Default: auto-selected based on the data size
  discrepancy_method = "mean_square_error" # Default: auto-selected based on the family
)
cat_glm_tune_mallowian
## -----------------------------------------------------------------------------
cat_glm_tune_auto <- cat_glm_tune(
  formula = ~., # Same as `Y ~ .`
  cat_init = cat_init
)
cat_glm_tune_auto
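## -----------------------------------------------------------------------------
# (Added comparison, assuming every `cat_glm_tune` result exposes the `tau` and
# `risk_estimate_method` components illustrated further below) Collect the tau
# selected by each tuning strategy on this data set.
data.frame(
  risk_estimate_method = c(
    cat_glm_tune_cv$risk_estimate_method,
    cat_glm_tune_boots$risk_estimate_method,
    cat_glm_tune_mallowian$risk_estimate_method,
    cat_glm_tune_auto$risk_estimate_method
  ),
  selected_tau = c(
    cat_glm_tune_cv$tau,
    cat_glm_tune_boots$tau,
    cat_glm_tune_mallowian$tau,
    cat_glm_tune_auto$tau
  )
)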
## ----eval=FALSE---------------------------------------------------------------
# cat_glm_tune_predicted_y <- predict(
#   cat_glm_tune_auto,
#   newdata = test_data
# )
#
# cat(
#   "Catalytic `cat_glm_tune` - Mean Square Error (Data):",
#   mean((cat_glm_tune_predicted_y - test_data$Y)^2)
# )
#
# cat(
#   "\nCatalytic `cat_glm_tune` - Sum Square Error (Coefficients):",
#   sum((coef(cat_glm_tune_auto) - true_coefs)^2)
# )
## ----eval=FALSE---------------------------------------------------------------
# plot(test_data$Y, cat_glm_tune_predicted_y,
#   main = "Scatter Plot of True Y vs Predicted Y (cat_glm_tune)",
#   xlab = "True Y",
#   ylab = "Predicted Y (cat_glm_tune)",
#   pch = 19,
#   col = "blue"
# )
#
# # Add a 45-degree line for reference
# abline(a = 0, b = 1, col = "red", lwd = 2)
## ----eval=FALSE---------------------------------------------------------------
# names(cat_glm_tune_auto)
## ----eval=FALSE---------------------------------------------------------------
# # The method used for risk estimation
# cat_glm_tune_auto$risk_estimate_method
#
# # Selected optimal tau value from `tau_seq` that minimizes discrepancy error
# cat_glm_tune_auto$tau
## ----eval = FALSE-------------------------------------------------------------
# cat_glm_bayes_model <- cat_glm_bayes(
#   formula = Y ~ ., # Same as `~ .`
#   cat_init = cat_init,
#   tau = 50, # Default: number of predictors / 4
#   chains = 1, # Default: 4
#   iter = 100, # Default: 2000
#   warmup = 50, # Default: 1000
#   algorithm = "NUTS", # Default: NUTS
#   gaussian_variance_alpha = 1, # Default: number of predictors
#   gaussian_variance_beta = 1 # Default: number of predictors times the variance of the observed response
# )
#
# cat_glm_bayes_model
## ----eval = FALSE-------------------------------------------------------------
# traceplot(cat_glm_bayes_model)
## ----eval = FALSE-------------------------------------------------------------
# traceplot(cat_glm_bayes_model, inc_warmup = TRUE)
## ----eval = FALSE-------------------------------------------------------------
# cat_glm_bayes_predicted_y <- predict(
#   cat_glm_bayes_model,
#   newdata = test_data
# )
#
# cat(
#   "Catalytic `cat_glm_bayes` - Mean Square Error (Data):",
#   mean((cat_glm_bayes_predicted_y - test_data$Y)^2)
# )
#
# cat(
#   "\nCatalytic `cat_glm_bayes` - Sum Square Error (Coefficients):",
#   sum((coef(cat_glm_bayes_model) - true_coefs)^2)
# )
## ----eval = FALSE-------------------------------------------------------------
# plot(test_data$Y, cat_glm_bayes_predicted_y,
#   main = "Scatter Plot of True Y vs Predicted Y (cat_glm_bayes)",
#   xlab = "True Y",
#   ylab = "Predicted Y (cat_glm_bayes)",
#   pch = 19,
#   col = "blue"
# )
#
# # Add a 45-degree line for reference
# abline(a = 0, b = 1, col = "red", lwd = 2)
## ----eval = FALSE-------------------------------------------------------------
# names(cat_glm_bayes_model)
## ----eval = FALSE-------------------------------------------------------------
# # The number of Markov chains used during MCMC sampling in `rstan`.
# cat_glm_bayes_model$chain
#
# # The mean estimated coefficients from the Bayesian GLM model
# cat_glm_bayes_model$coefficients
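## ----eval = FALSE-------------------------------------------------------------
# # (Added comparison; assumes `coefficients` stores the intercept and slopes in
# # the same order as `coef(cat_glm_model)`) Contrast the posterior-mean
# # coefficients with the fixed-tau catalytic estimates obtained earlier.
# round(cbind(
#   cat_glm = coef(cat_glm_model),
#   cat_glm_bayes = cat_glm_bayes_model$coefficients
# ), 3)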
## ----eval = FALSE-------------------------------------------------------------
# cat_glm_bayes_joint_model <- cat_glm_bayes_joint(
#   formula = Y ~ ., # Same as `~ .`
#   cat_init = cat_init,
#   chains = 1, # Default: 4
#   iter = 100, # Default: 2000
#   warmup = 50, # Default: 1000
#   algorithm = "NUTS", # Default: NUTS
#   tau_alpha = 2, # Default: 2
#   tau_gamma = 1 # Default: 1
# )
#
# cat_glm_bayes_joint_model
## ----eval = FALSE-------------------------------------------------------------
# traceplot(cat_glm_bayes_joint_model)
## ----eval = FALSE-------------------------------------------------------------
# traceplot(cat_glm_bayes_joint_model, inc_warmup = TRUE)
## ----eval = FALSE-------------------------------------------------------------
# cat_glm_bayes_joint_predicted_y <- predict(
#   cat_glm_bayes_joint_model,
#   newdata = test_data
# )
#
# cat(
#   "Catalytic `cat_glm_bayes_joint` - Mean Square Error (Data):",
#   mean((cat_glm_bayes_joint_predicted_y - test_data$Y)^2)
# )
#
# cat(
#   "\nCatalytic `cat_glm_bayes_joint` - Sum Square Error (Coefficients):",
#   sum((coef(cat_glm_bayes_joint_model) - true_coefs)^2)
# )
## ----eval = FALSE-------------------------------------------------------------
# plot(test_data$Y,
#   cat_glm_bayes_joint_predicted_y,
#   main = "Scatter Plot of True Y vs Predicted Y (cat_glm_bayes_joint)",
#   xlab = "True Y",
#   ylab = "Predicted Y (cat_glm_bayes_joint)",
#   pch = 19,
#   col = "blue"
# )
#
# # Add a 45-degree line for reference
# abline(a = 0, b = 1, col = "red", lwd = 2)
## ----eval = FALSE-------------------------------------------------------------
# names(cat_glm_bayes_joint_model)
## ----eval = FALSE-------------------------------------------------------------
# # The estimated tau parameter from MCMC sampling via `rstan::sampling`
# cat_glm_bayes_joint_model$tau
#
# # The mean estimated coefficients from the Bayesian GLM model
# cat_glm_bayes_joint_model$coefficients
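## ----eval = FALSE-------------------------------------------------------------
# # (Added comparison, using the `tau` component shown above) Contrast the fixed
# # tau supplied to `cat_glm_bayes` (50) with the tau estimated jointly by
# # `cat_glm_bayes_joint`; a much smaller estimate suggests the fixed value
# # down-weighted the observed data more than the data support.
# cat(
#   "Fixed tau (cat_glm_bayes):", 50,
#   "\nEstimated tau (cat_glm_bayes_joint):", cat_glm_bayes_joint_model$tau
# )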