# data processing for conformal_pdf analysis
### POTENTIAL OPTIONS
# a. vary between: local vs global conformal # more a worry with cde approach - let's focus on doing 'mass' non-conformal score
# b. data options: lei-wasserman: perfect fit vs "y" poor fit
# c. conformal score: cde vs mass (maybe sim based -or- not?) ## let's not focus on sim vs not currently
# d. scaling attempted during optimization
# Libraries ----------------------------------
library(shiny)
library(flexdashboard)
library(tidyverse)
library(ks)
library(latex2exp)
library(shinyWidgets)
library(tidyverse)
library(pracma)
library(gridExtra)
library(latex2exp)
library(flexmix)
devtools::load_all(path = ".") # simulationBands
### Defaults
theme_set(theme_minimal() +
theme(text=element_text(size=16),
aspect.ratio = 1))
n <- 2000
num_sim_y <- 200
vis_bool <- FALSE
for (data_idx in 1:3){
data_name_full <- c("Lei & Wasserman (2014) - perfect fit",
"Y data - bad fit", "Y2 data - really bad fit")[data_idx]
data_name <- c("lw","y", "y2")[data_idx]
for (conformal_approach in c("cde", "mass")) {
for (conformal_mass_perfect in c(T, F)){
if(conformal_mass_perfect & conformal_approach == "cde"){
} else {
for (log10_lambda in c(-Inf,-5,-4,-3, -2,-1,0,1)) {
for (scale_optimization in c(T,F)){
for (prime in c(F, T)){
for (conditional in c(F, T)){
# Data generation ----------------------------
if (data_name == "lw"){
data_all <- lei_wasserman_data(n = n)
if (conditional){
Xx <- runif(n = n, min = .65, max = .85)
data_calibrate_list <- lei_wasserman_data_conditional_simulate(x = Xx,
n = 1,
verbose = F)
data_calibrate <- do.call(rbind, data_calibrate_list) %>%
rename("y"= "sim")
} else {
data_calibrate <- lei_wasserman_data(n = n)
}
} else { #if (data_name == "y" or "y2") {
data_all <- generate_split_mixture(n = n)
if (conditional){
Xx <- runif(n = n, min = 9, max = 10)
Yy <- gen_mix_reg(x = Xx)
data_calibrate <- data.frame(x = Xx, y = Yy)
} else {
data_calibrate <- generate_split_mixture(n = n)
}
}
}
if (vis_bool){
ggplot(data_all) +
geom_point(aes(x = x, y = y)) +
labs(title = "True distribution (training data)",
subtitle = data_name_full)
}
# Fit creation ------------------------------
if (data_name == "y") {
model <- flexmix(y~x, k = 2, data = data_all)
beta <- sapply(model@components, function(item) item[[1]]@parameters$coef[2])
intercept <- sapply(model@components, function(item) item[[1]]@parameters$coef[1])
sigma <- sapply(model@components, function(item) item[[1]]@parameters$sigma)
proportion <- model@prior
# get conformal scores from simulations
model_params <- list(beta = beta, intercept = intercept,
sd = sigma, prop = proportion)
} else if (data_name == "y2") {
beta <- c(1,1)
intercept <- c(-5,-5)
sigma <- c(1,1)
proportion <- c(.5,.5)
model_params <- list(beta = beta, intercept = intercept,
sd = sigma, prop = proportion)
}
# Calibration set's conformal score ---------------------------
if (data_name %in% c("y", "y2")){
conformal_score_info <- data_calibrate %>%
dplyr::mutate(id = 1:nrow(data_calibrate)) %>%
dplyr::group_by(id) %>%
tidyr::nest() %>%
dplyr::mutate(sim_cs = purrr::map(data, function(df) {
simulation_conformal_scores_mix_reg(df$x, df$y,
model_params, sim = num_sim_y)}),
conformal_score_true_density = purrr::map(data, function(df) {
multimode_density_f(df$y, df$x,
beta = beta,
intercept = intercept,
sd = sigma,
proportions = proportion)}
)) %>%
dplyr::select(-data) %>%
unnest(col = c(sim_cs, conformal_score_true_density))
conformal_calibration_dist_cde <- conformal_score_info$empirical_cde
conformal_calibration_dist_mass <- conformal_score_info$non_conformal_mass_ecde_and_sim_based
} else { #data_name == "lw"
conformal_score_info <- data_calibrate %>%
dplyr::mutate(id = 1:nrow(data_calibrate)) %>%
dplyr::group_by(id) %>%
tidyr::nest() %>%
dplyr::mutate(data = purrr::map(data, function(df) {
conformal_scores_lei_wasserman(df$x, df$y)
})) %>%
tidyr::unnest(data)
conformal_calibration_dist_cde <- conformal_score_info$cde
conformal_calibration_dist_mass <- conformal_score_info$mass
}
if (conformal_mass_perfect){
n_conformal <- length(conformal_calibration_dist_mass)
conformal_calibration_dist_mass <- 1:n_conformal/n_conformal
}
# true x selection ----------------------------
if (data_name %in% c("y", "y2")){
x <- 9.5
} else { #data_name == "lw"
x <- .75
}
# building prediction region information ----------
if (data_name %in% c("y", "y2")) {
delta_y <- .001 # delta_y needs to be passed through
yy <- seq(-15, 15, by = delta_y)
first_df <- data.frame(yy = yy,
true_density = multimodel_density_split_mixture(yy, x),
fit_density = multimode_density_f(yy, x,
beta = model_params$beta,
intercept = model_params$intercept,
sd = model_params$sd,
proportions = model_params$prop))
} else { # data_name == "lw"
delta_y <- .001
if (conformal_mass_perfect & log10_lambda == -Inf){
delta_y <- .0001
}
yy <- seq(-8, 8, by = delta_y)
first_df <- data.frame(yy = yy,
true_density = cde_lei_wassserman(x)(yy)) %>%
mutate(fit_density = true_density)
}
if (conformal_approach == "cde") {
first_df <- first_df %>% arrange(yy) %>%
conformal_pdf_breaks(unique(conformal_calibration_dist_cde),
conformal_score = "cde") %>%
arrange(yy)
} else{ #conformal_approach == "mass"
first_df <- first_df %>% arrange(yy) %>%
conformal_pdf_breaks(unique(conformal_calibration_dist_mass),
conformal_score = "mass") %>%
arrange(yy)
}
second_df <- first_df %>%
cumulative_comparisons(group_columns = c(g_id, g_id2, g_id3,
cut_off_lower, cut_off_upper))
# conformal_pdf ---------------------------
prob_cde <- second_df$fit_prop#second_df$fit_prop_lag[-1]
n_conformal_pdf <- length(prob_cde) - 1
try({solution_qp_l2_lam <- stepwise_conformal_cde_update(n_conformal_pdf, prob_cde,
lambda = ifelse(!is.infinite(log10_lambda),
10^(log10_lambda),
-1),
prime = prime,
alpha = 1,
scaling_constraint = scale_optimization)
if (vis_bool){
vis_stepwise_scaling_only(solution_qp_l2_lam)
vis_stepwise_scaling_only(solution_qp_l2_lam,second_df$cut_off_lower)
ggplot(second_df) +
geom_histogram(aes(x = cut_off_lower, y = ..density..)) +
geom_density(aes(x = cut_off_lower))
vis_all_densities(stepwise_scaling = solution_qp_l2_lam,
individual_df_rich = first_df,
cumulative_comparisons_df = second_df,
scale = TRUE)
}
# saving ----------------------------------------------
save_string_name <- sprintf(
paste0("dev/conformal_pdf_data/conformal_pdf_%s_%s_log10_lambda-%s",
"_scale_optimized-%s_perfect_conformal_mass-%s_prime-%s",
"_conditional-%s.Rdata"),
data_name, conformal_approach,
log10_lambda, scale_optimization,
conformal_mass_perfect, prime,conditional)
print(save_string_name)
save(solution_qp_l2_lam, first_df, second_df, file = save_string_name)
})
}
}
}
}
}
}
}
for (data_name in c("lw", "y", "y2")){
if (data_name == "y"){
x <- 9.5
} else { #data_name == "lw"
x <- .75
}
if (data_name == "lw"){
data_all <- lei_wasserman_data(n = n)
data_fit <- lei_wasserman_data(n = n)
} else { # data_name == "y"
data_all <- generate_split_mixture(n = n)
}
if (data_name == "y") {
model <- flexmix(y~x, k = 2, data = data_all)
beta <- sapply(model@components, function(item) item[[1]]@parameters$coef[2])
intercept <- sapply(model@components, function(item) item[[1]]@parameters$coef[1])
sigma <- sapply(model@components, function(item) item[[1]]@parameters$sigma)
proportion <- model@prior
data_fit <- generate_split_mixture(n = n,
right_beta = beta,
right_intercept = intercept,
right_sd = sigma,
right_proportions = proportion)
} else if (data_name == "y2"){
beta <- c(1,1)
intercept <- c(-5,-5)
sigma <- c(1,1)
proportion <- c(.5,.5)
data_fit <- generate_split_mixture(n = n,
right_beta = beta,
right_intercept = intercept,
right_sd = sigma,
right_proportions = proportion)
}
data_both <- rbind(data_all %>% mutate(color = "true distribution"),
data_fit %>% mutate(color = "fit distribution"))
save(data_both, x, file = paste0("dev/conformal_pdf_data/", data_name, ".Rdata"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.