# document hyper parameters
confidence_level <- .6
confidence_level_string <- "60%"
confidence_level_image_string <- "60percent"
n_simulations <- 1000
delta_prop <- .8
delta_prop_string <- "80%"
delta_prop_image_string <- "80percent"
n_sims_containment <- 300
verbose <- T
save_fig <- F
save_image <- save_fig
rerun <- F # to rerun all analysis even if already saved...
# Libraries -----------------
library(tidyverse)
library(devtools)
library(ggrepel)
library(gridExtra)
library(tikzDevice)
load_all("../")
load_all("../../EpiCompare/") # tidy_dist_mat
# Calibration and Test set -------------------
set.seed(1)
df <- tibble(x = runif(1000, min = -1.5, max = 1.5))
df$y <- sapply(df$x, function(x) {simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
calibration_set <- tibble(x = runif(1000, min = -1.5, max = 1.5))
calibration_set$y <- sapply(calibration_set$x, function(x) {simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
test_set <- tibble(x = runif(1000, min = -1.5, max = 1.5))
test_set$y <- sapply(test_set$x, function(x) {simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
test_set_discrete <- tibble(x = seq(-1.5,1.5, by = .01))
test_set_discrete$y <- sapply(test_set_discrete$x, function(x) {simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
vis_raw <- ggplot(test_set) +
geom_point(aes(x = x, y = y)) +
theme_minimal()
vis_raw
if (save_fig){
ggplot2::ggsave(vis_raw, filename = "quick_images/simulation_distribution_1d.png")
tikz(file = "quick_images/simulation_distribution_1d.tex")
print(vis_raw)
dev.off()
}
# Functions ------------------
#' #' Simulation based conformal score processor
#' #'
#' #' This is a basic one (below is a more complex one...)
#' #'
#' #' @param truth_df data frame of a *single* new observation
#' #' @param simulations_grouped_df grouped data frame of simulated points
#' #' @param data_column_names character vector of column names that define
#' #' locational information of points
#' #'
#' #' @return list of information including the conformal score of the observation
#' #' and internal information about the creation of the prediction regions
#' #'
#' #' @export
#' simulation_based_conformal_1d <- function(truth_df, simulations_grouped_df,
#' data_column_names = c("y"),
#' delta_prop = .8){
#'
#'
#' assertthat::assert_that(inherits(simulations_grouped_df, "grouped_df"))
#' group_names <- names(group_keys(simulations_grouped_df))
#'
#' truth_df_inner <- truth_df %>%
#' dplyr::select(dplyr::one_of(data_column_names))
#'
#' simulations_group_df_inner <- simulations_grouped_df %>%
#' dplyr::select(dplyr::one_of(c(group_names, data_column_names)))
#'
#' group_info <- simulations_group_df_inner %>% group_keys()
#'
#' dist_mat <- simulations_group_df_inner %>% group_split() %>%
#' do.call(rbind, .) %>% # match group_info ordering
#' select(one_of(data_column_names)) %>%
#' dist() %>% as.matrix()
#'
#' tdm_sims <- EpiCompare::tidy_dist_mat(dist_mat, group_info, group_info)
#'
#' # sigma selection
#'
#' sigma_size <- c("20%" = .2, "25%" = .25, "30%" = .3,
#' "35%" = .35, "40%" = .4, "45%" = .45)
#'
#' percentage <- names(sigma_size)[stats::quantile(as.matrix(tdm_sims), sigma_size) > 0][1]
#'
#'
#' # rank_df
#' pseudo_density_df <- EpiCompare::distance_psuedo_density_function(
#' tdm_sims,
#' sigma = percentage, df_out = T) %>%
#' mutate(ranking = rank(psuedo_density,ties.method = "min")) #spelling error... :(
#'
#' assertthat::assert_that(all(!is.na(pseudo_density_df$psuedo_density)),
#' msg = paste("internal error in",
#' "distance_psuedo_density_function",
#' "function's sigma selection."))
#'
#' mm_df <- maxmin_distance_vector(truth_df = truth_df_inner,
#' simulations_grouped_df = simulations_group_df_inner,
#' data_column_names = c("y"),
#' .all_info = F)
#'
#' proportion_points_not_included <- 1 - delta_prop
#'
#' top_points <- simulations_group_df_inner %>%
#' left_join(pseudo_density_df, by = group_names) %>%
#' mutate(keep = ranking > ceiling(proportion_points_not_included*nrow(simulations_group_df_inner))) %>%
#' ungroup() %>% filter(keep) %>%
#' select(one_of(data_column_names))
#'
#'
#' mm_delta <- get_delta_nn(top_points)
#'
#' containment_df <- pseudo_density_df %>%
#' left_join(mm_df, by = group_names) %>%
#' mutate(delta_close = maxmin_dist < mm_delta)
#'
#'
#' conformal_score <- max(c(containment_df$ranking[containment_df$delta_close],
#' 0))
#'
#' return(list(conformal_score = conformal_score, containment_df = containment_df,
#' mm_delta = mm_delta,
#' truth_df_inner = truth_df_inner,
#' simulations_group_df_inner = simulations_group_df_inner,
#' parameters = c("mm_delta_prop" = proportion_points_not_included,
#' "sigma_percentage" = percentage)))
#' }
#'
#' Mode clustering for euclidean data (distance based)
#'
#' Walks up a guassian kernel density estimate to find modes
#'
#' Similar to more complex code developed by Yen-Chi Chen and Chris Genovese
#'
#' @param X_mat matrix of data points (in rows), that define the kernel density
#' @param G_mat matrix of data points (in rows) that need to be "walked" to
#' their modes
#' @param sigma density scalar / sigma value
#' @param maxT int, maximum number of iterations
#' @param eps float, difference between 2 sequential points for which to stop
#' walking toward the mode for a give point's walk
#' @param verbose boolean, if this progression should be verbose
#' @param list_out if we should return every step of the process (for fun
#' visualization purposes and visual checks only)
#'
#' @return list of :
#' - matrix of \code{G_mat} points walked up all the way
#' - if \code{list_out} is \code{TRUE} then list of matrices for each step,
#' else \code{NULL}
#' @export
psuedo_density_mode_cluster_1d <- function(X_mat, G_mat = X_mat,
sigma,
maxT = 30,
eps = 1e-05,
verbose = TRUE,
list_out = FALSE){
n_X <- nrow(X_mat)
n_G <- nrow(G_mat)
G_mat_out <- G_mat # just a copy
if (list_out){
G_mat_list <- list()
G_mat_list[[1]] <- G_mat_out
}
error <- rep(1e08, n_G) # initial error = massive error
max_error <- 1e08
t <- 0
if (verbose){
pb <- progress::progress_bar$new(
format = "mode clustering [:bar] :percent eta: :eta",
total = maxT, clear = FALSE, width = 60)
}
while (max_error > eps && t < maxT){
for (g_idx in 1:n_G) {
current_vs_X_dist_vals <- apply(
sweep(X_mat, 2, G_mat_out[g_idx,], "-")^2,
1,
function(row_vals) sqrt(sum(row_vals)))
pseudo_density <- exp(-current_vs_X_dist_vals^2/sigma)
denominator <- sum(pseudo_density)
if (denominator == 0) {
stop("Error: no distance between a observation in G_mat and all observations in X_mat > 0.");
}
scaled_props <- pseudo_density / denominator
new_point <- apply(
sweep(X_mat, 1, scaled_props, "*"),
2, sum)
# get error and update
error[g_idx] <- sqrt(sum((new_point-G_mat_out[g_idx,])^2))
G_mat_out[g_idx, ] <- new_point
}
if (list_out){
G_mat_list[[t+2]] <- G_mat_out
}
t <- t + 1
max_error <- max(error)
if (verbose){
pb$tick()
}
}
if (!list_out){
G_mat_list <- list()
}
return(list(G_mat_out, G_mat_list))
}
testthat::test_that("test psuedo_density_mode_cluster_1d", {
set.seed(1)
data <- matrix(c(rnorm(mean = 2,n = 100),
rnorm(mean = -2, n = 100)),
byrow = T, nrow = 100)
sigma <- quantile(as.matrix(dist(data)), .2)
out <- psuedo_density_mode_cluster_1d(data, sigma = sigma,
maxT = 60, eps = 1e-05,
verbose = FALSE, list_out = TRUE)
df_out <- do.call(rbind, lapply(1:length(out[[2]]),
function(idx) data.frame(out[[2]][idx]) %>%
mutate(idx = idx) %>%
rownames_to_column()))
if (FALSE){
plotly::ggplotly(df_out %>% ggplot() +
geom_point(aes(x=X1, y =X2, frame = idx)) +
geom_path(aes(x=X1, y=X2, group = rowname))
)
}
dist_mat <- as.matrix(dist(out[[1]]))
adjmatrix <- dist_mat <= .01
ig <- igraph::graph_from_adjacency_matrix(adjmatrix, mode = "undirected")
groupings <- igraph::components(ig, mode = "strong")$membership
# 2 completely disjoint groups 1:50 and 51:100
testthat::expect_equal(length(unique(groupings[1:50])), 1)
testthat::expect_equal(length(unique(groupings[51:100])), 1)
testthat::expect_equal(length(unique(groupings)), 2)
})
#' Find mode clusters of euclidean objects based on distance - densities
#'
#' @param df_info data frame with rows of observations to mode cluster
#' @param position character vector of columns that define the location of the
#' points
#' @param naming_info a data frame with information about the data points (same
#' number of rows as \code{df_info})
#' @param sigma density scalar / sigma value
#' @param maxT int, maximum number of iterations
#' @param eps float, difference between 2 sequential points for which to stop
#' walking toward the mode for a give point's walk
#' @param diff_eps float, if the final step of each of the points is within
#' this distance from each-other they will be grouped together.
#' @param verbose boolean, if this progression should be verbose
#'
#' @return data frame with \code{naming_info} and a \code{grouping} column which
#' indicates which mode group each observation is in.
#' @export
mode_clustering_1d <- function(df_info, position, naming_info = NULL, sigma, maxT = 50,
eps = 1e-05, diff_eps = 1e-05, verbose = TRUE){
mat_info_raw <- df_info[,position] %>% as.matrix()
if (!is.null(naming_info)){
g_names = df_info[, naming_info, drop = F]
}
out <- psuedo_density_mode_cluster_1d(mat_info_raw, sigma = sigma,
maxT = maxT, eps = eps,
verbose = verbose, list_out = FALSE)
dist_mat <- as.matrix(dist(out[[1]]))
adjmatrix <- dist_mat <= diff_eps
ig <- igraph::graph_from_adjacency_matrix(adjmatrix, mode = "undirected")
groupings <- igraph::components(ig, mode = "strong")$membership
if (!is.null(naming_info)){
if (inherits(g_names, "data.frame")){
return(cbind(g_names, groupings = groupings))
} else {
return(data.frame(g_names, groupings = groupings))
}
} else {
return(data.frame(groupings))
}
}
testthat::test_that("test mode_clustering_1d", {
set.seed(1)
data <- matrix(c(rnorm(mean = 2,n = 100),
rnorm(mean = -2, n = 100)),
byrow = T, nrow = 100)
sigma <- quantile(as.matrix(dist(data)), .2)
data_df <- data.frame(data) %>% rownames_to_column()
names(data_df)[2:3] <- c("x","y")
out <- mode_clustering_1d(data_df, position = 2:3, naming_info = 1,
sigma = sigma, verbose = F)
testthat::expect_equal(out[,1], data_df$rowname)
# 2 completely disjoint groups 1:50 and 51:100
testthat::expect_equal(length(unique(out$groupings[1:50])), 1)
testthat::expect_equal(length(unique(out$groupings[51:100])), 1)
testthat::expect_equal(length(unique(out$groupings)), 2)
})
#' Calculates the minimum coverage radius for euclidean objects as we increase
#' the number of observations.
#'
#' @param ordered_sim_df data frame of points as rows, ordered by when they
#' should be added in the sequence
#' @param e_cols character vector of column names that define
#' locational information of points
#' @param verbose boolean, if this progression should be verbose
#'
#' @return list of min_cover_vec (minimum at that step) and dist_mat (cumulative
#' max per path at time step t (column))
coverage_down_1d_single <- function(ordered_sim_df, e_cols,
verbose = FALSE){
n_obs <- nrow(ordered_sim_df)
point_dist_mat <- as.matrix(dist(ordered_sim_df[,e_cols]))
dist_mat <- matrix(NA, nrow = n_obs, ncol = n_obs)
dist_mat[1,1] <- 0
min_cover_vec <- rep(NA, n_obs)
min_cover_vec[1] <- 0
if (n_obs > 1){
if (verbose){
pb <- progress::progress_bar$new(
format = "calculating minimum coverage [:bar] :percent eta: :eta",
total = n_obs - 1, clear = FALSE, width = 60)
}
inner_min_dist_cov_mat <- rep(Inf, n_obs)
for (r_idx in 2:n_obs){
#browser()
# check distances to new point
new_distances_with_new_point <- point_dist_mat[1:(r_idx-1),r_idx]
inner_min_dist_cov_mat[r_idx] <- min(new_distances_with_new_point)
# update minimum covereage required for older points
inner_min_dist_cov_mat[1:(r_idx-1)] <- apply(
matrix(c(inner_min_dist_cov_mat[1:(r_idx-1)],
new_distances_with_new_point),
byrow = TRUE, nrow = 2),
2, min)
# calc current min_cover value
min_cover_vec[r_idx] <- max(inner_min_dist_cov_mat[1:r_idx])
# update radi values if necessary
dist_mat[1:(r_idx-1), r_idx] <- sapply(dist_mat[1:(r_idx-1), r_idx-1],
function(x) max(c(x, min_cover_vec[r_idx])))
dist_mat[r_idx, r_idx] <- min_cover_vec[r_idx]
if (verbose) {
pb$tick()
}
}
} else {
message("A mode cluster has only a single element.")
}
return(list(min_cover_vec = min_cover_vec, dist_mat = dist_mat))
}
testthat::test_that("test coverage_down_1d_single", {
ordered_sim_df <- data.frame(x = c(1,2,1.5,4,4.1))
e_cols = c("x")
# useful ordered dist:
# dist(ordered_sim_df)
#1 2 3 4
#2 1.0
#3 0.5 0.5
#4 3.0 2.0 2.5
#5 3.1 2.1 2.6 0.1
out <- coverage_down_1d_single(ordered_sim_df, e_cols)
testthat::expect_equal(out$min_cover_vec, c(0,1,.5,2,.5))
testthat::expect_equal(out$dist_mat, matrix(c(0, 1, 1, 2, 2,
NA, 1, 1, 2, 2,
NA, NA, .5, 2, 2,
NA, NA, NA, 2, 2,
NA, NA, NA, NA, .5),
byrow = T, nrow = 5))
# data fram with single element gets message about it
testthat::expect_message(out_base <- coverage_down_1d_single(ordered_sim_df[1, , drop = F], e_cols))
testthat::expect_equal(out_base$min_cover_vec,0)
testthat::expect_equal(out_base$dist_mat,matrix(0, nrow = 1, ncol = 1))
})
#' Calculates the minimum coverage radius for euclidean objects as we increase
#' the number of observations relative to mode groupings.
#'
#' @param sim_df data frame of points as rows
#' @param e_cols character vector of column names that define
#' locational information of points
#' @param ordering_list list of indices, each vector is related to a single mode
#' cluster and the ordering defines how we build up the region.
#' @param verbose boolean, if this progression should be verbose
#'
#' @return list per mode grouping in \code{ordering_list} with inner lists of
#' min_cover_vec (minimum at that step) and dist_mat (cumulative
#' max per path at time step t (column))
#'
#' @export
coverage_down_1d_mult <- function(sim_df, e_cols,
ordering_list, # list of lists...
verbose = FALSE){
n_lists <- length(ordering_list)
n <- nrow(sim_df)
assertthat::assert_that((sum(sapply(ordering_list, length)) == n) &
(all(sort(unlist(ordering_list)) == 1:n)),
msg = "sim_df length and length of ordering_list info should match")
if (verbose){
pb <- progress::progress_bar$new(
format = "calculating minimum coverage (per mode) [:bar] :percent eta: :eta",
total = n_lists, clear = FALSE, width = 60)
}
coverd_info <- list()
t <- 1
for (order_vec in ordering_list){
if ((ncol(sim_df) == 1) | length(order_vec) == 1){
coverd_info[[t]] <- coverage_down_1d_single(sim_df[order_vec,, drop = F],
e_cols = e_cols,
verbose = FALSE)
} else {
coverd_info[[t]] <- coverage_down_1d_single(sim_df[order_vec,],
e_cols = e_cols,
verbose = FALSE)
}
if (verbose){
pb$tick()
}
t <- t + 1
}
return(coverd_info)
}
testthat::test_that("test coverage_down_1d_mult", {
# standard
sim_df <- data.frame(x = c(1,2,1.5,4,4.1, # first group
2.1, 1.1,4.1 , 1.6, 4.2 ) # second group
)
e_cols = c("x")
ordering_list <- list(c(1:5),
c(2,1,4,3) + 5, # allows for basically the same out as first group...
c(10)) # for a singleton
# this one shouldn't expect message - not actually used...
out <- coverage_down_1d_mult(sim_df[1:9,, drop = F], e_cols, ordering_list[1:2])
testthat::expect_message(out <- coverage_down_1d_mult(sim_df, e_cols, ordering_list))
# first group:
# useful ordered dist
# dist(ordered_sim_df)
#1 2 3 4
#2 1.0
#3 0.5 0.5
#4 3.0 2.0 2.5
#5 3.1 2.1 2.6 0.1
testthat::expect_equal(out[[1]]$min_cover_vec, c(0,1,.5,2,.5))
testthat::expect_equal(out[[1]]$dist_mat, matrix(c(0, 1, 1, 2, 2,
NA, 1, 1, 2, 2,
NA, NA, .5, 2, 2,
NA, NA, NA, 2, 2,
NA, NA, NA, NA, .5),
byrow = T, nrow = 5))
# second group:
testthat::expect_equal(out[[2]]$min_cover_vec, c(0,1,.5,2))
testthat::expect_equal(out[[2]]$dist_mat, matrix(c(0, 1, 1, 2,
NA, 1, 1, 2,
NA, NA, .5, 2,
NA, NA, NA, 2),
byrow = T, nrow = 4))
# singleton group:
testthat::expect_equal(out[[3]]$min_cover_vec, c(0))
testthat::expect_equal(out[[3]]$dist_mat, matrix(c(0),nrow = 1))
})
#' calculate distance between two different df's rows
#'
#' @param df1 data frame (n, p)
#' @param df2 data frame (m, p)
#'
#' @return matrix of distances (n, m)
my_pdist <- function(df1, df2){
nrow1 <- nrow(df1)
nrow2 <- nrow(df2)
out <- matrix(NA, nrow = nrow1, ncol = nrow2)
for (r_idx in 1:nrow1){
for (c_idx in 1:nrow2){
out[r_idx, c_idx] <- sqrt(sum((df1[r_idx,] - df2[c_idx,])^2))
}
}
return(out)
}
testthat::test_that("test my_pdist", {
# fixed 1d
df1 <- data.frame(x = 3:4)
df2 <- data.frame(x = 4:6)
dist_out <- my_pdist(df1, df2)
expected_dist_out <- matrix(c(1,2,3,
0,1,2),
byrow = T, nrow = 2)
testthat::expect_equal(dist_out, expected_dist_out)
# fixed 2d
df1 <- data.frame(x = 3:4,
y = 3:4)
df2 <- data.frame(x = 4:6,
x = 4:6)
dist_out <- my_pdist(df1, df2)
expected_dist_out <- matrix(c(1,2,3,
0,1,2) * sqrt(2),
byrow = T, nrow = 2)
testthat::expect_equal(dist_out, expected_dist_out)
# random square
set.seed(5)
df_r <- data.frame(x = rnorm(5),
y = rnorm(5))
dist_sq <- my_pdist(df_r,df_r)
testthat::expect_equivalent(dist_sq, as.matrix(dist(df_r)))
})
#' Helper function for simulation-based conformal score calculation based
#' potentially based on mode clustering and changing radius values.
#'
#' @param df_row_group data frame with new point information to be assessed
#' @param simulations_group_df data frame with multiplee simulated points
#' @param data_column_names character vector of column names that define
#' locational information of points
#' @param simulation_info_df a dataframe with information of each simulation's
#' psuedo-density estimate, mode clustering, ranking of psuedo-density (within
#' cluster and overall). We expect a \code{psuedo_density}, \code{groupings}
#' \code{ranking} and \code{group_ranking} columns. See
#' \code{EpiCompare::inner_expanding_info} for expected structure.
#' @param list_radius_info list of lists of radius information per each mode.
#' @param list_grouping_id list of vectors of indices (grouped by mode cluster)
#' and ordered by psuedo-density values. These values are relative to the row
#' indices in simulation_info_df
#' @param verbose boolean, if progress should be tracked with a progress bar
#'
#' @return data.frame with a row for new point with a
#' section column \code{containment_val} which is the discrete simulation-based
#' conformal score. If \code{df_row_group} only contained location columns in
#' \code{data_column_names} then a new column named \code{index} is also
#' included, and is related to the row numbers/names of \code{df_row_group}.
#' @export
inner_containment_conformal_score_mode_radius_1d <- function(df_row_group,
simulations_group_df,
data_column_names, # must be string vector (not index)
simulation_info_df,
list_radius_info,
list_grouping_id,
verbose = FALSE){
if (ncol(df_row_group) == length(data_column_names)){
df_row_group <- df_row_group %>% tibble::rownames_to_column(var = "row_index")
}
group_names <- names(df_row_group)[!(names(df_row_group) %in% data_column_names)]
if (length(group_names) == 1){
group_info_df <- df_row_group[,group_names, drop = F]
} else {
group_info_df <- df_row_group[,group_names]
}
n_draws <- nrow(simulation_info_df)
n_groups <- length(list_grouping_id)
n_check <- nrow(df_row_group)
if (verbose){
pb <- progress::progress_bar$new(
format = "Processing [:bar] :percent eta: :eta",
total = n_draws,
clear = FALSE, width = 60)
}
# https://stackoverflow.com/questions/35106567/how-to-calculate-euclidean-distance-between-two-matrices-in-r
if (length(data_column_names) == 1){
between_dist_mat <- my_pdist(df_row_group[, data_column_names, drop = F],
simulations_group_df[, data_column_names, drop = F])
# between_dist_mat <- matrix(NA, nrow = nrow(df_row_group),
# ncol = nrow(simulations_group_df))
#
# for (c_idx in 1:nrow(simulations_group_df)){
# between_dist_mat[,c_idx] <- inner_euclidean_distance_1d(df_row_group[, data_column_names],
# simulations_group_df[c_idx, data_column_names])
# }
} else {
between_dist_mat <- my_pdist(df_row_group[, data_column_names],
simulations_group_df[, data_column_names])
}
overall_info <- list()
for (g_idx in 1:n_groups){
inner_ids <- list_grouping_id[[g_idx]]
inner_rad_dist_mat <- list_radius_info[[g_idx]]$dist_mat
n_sims_inner <- length(inner_ids)
# 1. iteratively build distance across each point, store
# Only examine until covered
# track first time covered
not_covered <- rep(TRUE, n_check)
coverage_number <- rep(n_draws+1, n_check)
s_idx <- 1
while (s_idx <= n_sims_inner & sum(not_covered) > 0){
if (sum(not_covered) == 1 | s_idx == 1){
new_contained <- sweep(between_dist_mat[not_covered, inner_ids[1:s_idx], drop = F],
MARGIN = 2,
STATS = inner_rad_dist_mat[1:s_idx, s_idx],
FUN = "<=") %>%
rowSums() %>% sapply(function(x) x > 0)
} else {
new_contained <- sweep(between_dist_mat[not_covered, inner_ids[1:s_idx]],
MARGIN = 2,
STATS = inner_rad_dist_mat[1:s_idx, s_idx],
FUN = "<=") %>%
rowSums() %>% sapply(function(x) x > 0)
}
if (sum(new_contained) > 0){
# update coverage numbers for those just covered
inner_coverage <- rep(n_draws+1, sum(not_covered))
inner_coverage[new_contained] <- s_idx
coverage_number[not_covered] <- inner_coverage
# update those covered to not look at again
not_covered[not_covered] <- !new_contained
}
s_idx <- s_idx + 1
}
# calculate score per mode relative to overall ranking of points
all_info_df <- group_info_df %>%
mutate(containment_val = coverage_number)
simulation_info_inner <- rbind((simulation_info_df[inner_ids,] %>%
filter(groupings == g_idx) %>%
dplyr::ungroup() %>%
dplyr::select(.data$ranking, .data$group_ranking)),
data.frame(ranking = n_draws+1, group_ranking = n_draws+1))
overall_score <- all_info_df %>%
dplyr::left_join(simulation_info_inner,
by = c("containment_val" = "group_ranking"))
overall_info[[g_idx]] <- overall_score
}
# combining across modes
moverall_info <- do.call(rbind, overall_info) %>%
dplyr::group_by(dplyr::across(tidyselect::one_of(group_names))) %>%
dplyr::summarize(containment_val = n_draws+1 - min(.data$ranking), .groups = "keep") # why subtraction?:
return(moverall_info)
}
testthat::test_that("test inner_containment_conformal_score_mode_radius_1d", {
sim_df <- data.frame(x = c(1,2,1.5,4,4.1, # first group
2.1, 1.1,4.1 , 1.6, 4.2 ) # second group
)
e_cols = c("x")
list_grouping_id <- list(c(1:5),
c(2,1,4,3) + 5, # allows for basically the same out as first group...
c(10))
# natural structure for simulation_info_df
psuedo_density_df <- data.frame(psuedo_density = c(.4,.3,.2,.1,.05,
.31,.41,.11,.21,
.5))
mode_grouping <- data.frame(groupings = c(rep(1,5),
rep(2,4),
3))
suppressMessages(list_radius_info <- coverage_down_1d_mult(sim_df, e_cols = e_cols,
ordering_list = list_grouping_id,
verbose = FALSE))
# could be fed into EpiCompare::inner_expanding_info
# (but doesn't have grouping names...)
simulation_info_df <- psuedo_density_df %>%
cbind(mode_grouping) %>%
dplyr::mutate(ranking = rank(-.data$psuedo_density,
ties.method ="random")) %>% # no impact in example
dplyr::group_by(.data$groupings) %>%
dplyr::mutate(group_ranking = rank(-.data$psuedo_density,
ties.method ="random")) # no impact in example
test_df <- data.frame(x = c(4.1, 4.11, 0, 3))
# 4.1 and 4.11 will be contained with 4.1 in either the 1st or 2nd list
# both of the sim 4.1s are at the bottom of the pile in terms of density,
# with overall rankings 8 and 9 out of 10
# 0 will be contained by 1 (first list after we include 2), so it should have
# ranking associated with 2 (in the first list), which has a ranking of 5
# 3 will also be captured by 2.1 from the second list as it has a higher
# density than the 2 from the first list and same radius, associated with
# a rank of 4
# since the conformal scores are n_sims + 1 - rank, we should expect:
out_expected <- data.frame(row_index = as.character(1:4),
containment_val = 10+1 - c(8, 8,5,4))
out <- inner_containment_conformal_score_mode_radius_1d(df_row_group = test_df,
simulations_group_df = sim_df,
data_column_names = e_cols,
simulation_info_df = simulation_info_df,
list_radius_info = list_radius_info,
list_grouping_id = list_grouping_id,
verbose = FALSE)
out_raw <- out %>% ungroup %>% as.data.frame()
testthat::expect_equal(out_raw, out_expected)
# with their own names!
test_df2 <- data.frame(x = c(4.1,4.11, 0, 3),
names = c("alice", "bill", "chuck", "dave"))
out_expected2 <- data.frame(names = c("alice", "bill", "chuck", "dave"),
containment_val = 10+1 - c(8, 8,5,4))
out2 <- inner_containment_conformal_score_mode_radius_1d(df_row_group = test_df2,
simulations_group_df = sim_df,
data_column_names = e_cols,
simulation_info_df = simulation_info_df,
list_radius_info = list_radius_info,
list_grouping_id = list_grouping_id,
verbose = FALSE)
out2_raw <- out2 %>% ungroup %>% as.data.frame()
testthat::expect_equal(out2_raw, out_expected2)
# 2d:
sim_df_2d <- data.frame(x = c(1,2,1.5,4,4.1, # first group
2.1, 1.1,4.1 , 1.6, 4.2 ), # second group
y = c(1,2,1.5,4,4.1, # first group
2.1, 1.1,4.1 , 1.6, 4.2 ) # second group
)
test_df_2d <- data.frame(x = c(4.1, 4.11, 0, 3),
y = c(4.1, 4.11, 0, 3))
e_cols = c("x", "y")
out_2d <- inner_containment_conformal_score_mode_radius_1d(df_row_group = test_df_2d,
simulations_group_df = sim_df_2d,
data_column_names = e_cols,
simulation_info_df = simulation_info_df,
list_radius_info = list_radius_info,
list_grouping_id = list_grouping_id,
verbose = FALSE)
})
#' global wrapper for simulation-based conformal score calculation that allows
#' for mode clustering and changes of radius.
#'
#' @param truth_df
#' @param simulations_df
#' @param smooth_functions
#' @param data_column_names
#' @param .maxT
#' @param .sigma_string
#' @param .diff_eps
#' @param .eps
#' @param return_min
#' @param verbose
#'
#' @return
#' @export
#'
#' @examples
simulation_based_conformal_1d_complex <- function(truth_df, simulations_df,
smooth_functions = list(), #named list
data_column_names = c("y"),
.maxT = 50,
.sigma_string = "25%",
.diff_eps = 1e-06,
.eps = 1e-06,
return_min = TRUE,
verbose = FALSE){
# calculating sigma value ------
dist_mat <- simulations_df %>% select(one_of(data_column_names)) %>%
dist() %>% as.matrix()
group_names <- names(simulations_df)[!(names(simulations_df) %in% data_column_names)]
if (length(group_names) == 0){
simulations_df <- simulations_df %>% rownames_to_column(var = "row_index")
group_names <- names(simulations_df)[!(names(simulations_df) %in% data_column_names)]
}
# sigma selection
sigma_lower <- EpiCompare:::check_character_percent(.sigma_string, ".sigma_string")
sigma_sizes <- sapply(sigma_lower + .05*(0:5), function(v) min(v, 1))
percentage_inner <- sigma_sizes[stats::quantile(as.matrix(dist_mat), sigma_sizes) > 0][1]
sigma_val <- stats::quantile(as.matrix(dist_mat), percentage_inner)
tdm <- EpiCompare::tidy_dist_mat(dist_mat,
rownames_df = simulations_df[, group_names, drop = F],
colnames_df = simulations_df[, group_names, drop = F])
# rank_df
pseudo_density_df <- EpiCompare::distance_psuedo_density_function(
tdm,
sigma = sigma_val, df_out = T) %>%
dplyr::mutate(ranking = rank(-.data$psuedo_density,ties.method = "random")) #spelling error... :(, no ties! need ordering
assertthat::assert_that(all(!is.na(pseudo_density_df$psuedo_density)),
msg = paste("internal error in",
"distance_psuedo_density_function",
"function's sigma selection."))
top_points <- simulations_df %>%
left_join(pseudo_density_df, by = group_names) %>%
mutate(keep = ranking > ceiling(.2*nrow(simulations_df))) %>%
ungroup() %>% filter(keep) %>%
select(one_of(data_column_names))
mm_delta <- EpiCompare::get_delta_nn(top_points)
# mode clustering ------------
out_groups <- mode_clustering_1d(df_info = simulations_df,
position = c(1:ncol(simulations_df))[names(simulations_df) %in% data_column_names],
naming_info = c(1:ncol(simulations_df))[names(simulations_df) %in% group_names],
sigma, maxT = .maxT,
eps = .eps,
diff_eps = .diff_eps,
verbose = verbose)
simulation_info_out <- EpiCompare::inner_expanding_info(pseudo_density_df, out_groups)
simulation_info_df <- simulation_info_out[[1]]
ordering_list <- simulation_info_out[[2]]
## fixed
tm_radius_fixed <- inner_convert_single_radius_to_structure(mm_delta,
ordering_list)
## vary
tm_radius_vary <- coverage_down_1d_mult(sim_df = simulations_df,
e_cols = data_column_names,
ordering_list = ordering_list,
verbose = FALSE)
# no mode clustering ---------
out_groups_nm <- out_groups %>% dplyr::mutate(groupings = 1)
simulation_info_out_nm <- inner_expanding_info(pseudo_density_df, out_groups_nm)
simulation_info_df_nm <- simulation_info_out_nm[[1]]
ordering_list_nm <- simulation_info_out_nm[[2]]
tm_radius_fixed_nm <- inner_convert_single_radius_to_structure(mm_delta,
ordering_list_nm)
## vary
tm_radius_vary_nm <- coverage_down_1d_mult(sim_df = simulations_df,
e_cols = data_column_names,
ordering_list = ordering_list_nm,
verbose = FALSE)
# smoothing --------
tm_radius_vary_update_list <- list()
tm_radius_vary_nm_update_list <- list()
if (length(smooth_functions) > 0 ){
for (f_name in names(smooth_functions)){
tm_radius_vary_update_list[[f_name]] <- update_tm_smooth(tm_radius_vary,
func = smooth_functions[[f_name]])
tm_radius_vary_nm_update_list[[f_name]] <- update_tm_smooth(tm_radius_vary_nm,
func = smooth_functions[[f_name]])
}
}
conformal_df_fixed <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df, # diff
list_radius_info = tm_radius_fixed, # diff
list_grouping_id = ordering_list, # diff
verbose = verbose)
conformal_df_fixed_nm <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df_nm, # diff
list_radius_info = tm_radius_fixed_nm, # diff
list_grouping_id = ordering_list_nm, # diff
verbose = verbose)
conformal_df_vary <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df, # diff
list_radius_info = tm_radius_vary, # diff
list_grouping_id = ordering_list, # diff
verbose = verbose)
conformal_df_vary_nm <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df_nm, # diff
list_radius_info = tm_radius_vary_nm, # diff
list_grouping_id = ordering_list_nm, # diff
verbose = verbose)
conformal_df_vary_nm_smooth_list <- list()
conformal_df_vary_smooth_list <- list()
if (length(smooth_functions) > 0 ){
for (f_name in names(smooth_functions)){
conformal_df_vary_nm_smooth_list[[f_name]] <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df_nm, # diff
list_radius_info = tm_radius_vary_nm_update_list[[f_name]], # diff
list_grouping_id = ordering_list_nm, # diff
verbose = verbose)
conformal_df_vary_smooth_list[[f_name]] <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df, # diff
list_radius_info = tm_radius_vary_update_list[[f_name]], # diff
list_grouping_id = ordering_list, # diff
verbose = verbose)
}
}
if (return_min){
return(list(conformal_score = list(fixed = conformal_df_fixed,
fixed_nm = conformal_df_fixed_nm,
vary = conformal_df_vary,
vary_nm = conformal_df_vary_nm,
smooth_vary = conformal_df_vary_smooth_list,
smooth_vary_nm = conformal_df_vary_nm_smooth_list),
parameters = c("mm_delta_prop" = .2,
"mm_delta" = mm_delta,
"sigma_percentage" = percentage_inner)))
} else{
return(list(conformal_score = list(fixed = conformal_df_fixed,
fixed_nm = conformal_df_fixed_nm,
vary = conformal_df_vary,
vary_nm = conformal_df_vary_nm,
smooth_vary = conformal_df_vary_smooth_list,
smooth_vary_nm = conformal_df_vary_nm_smooth_list),
containment_df = simulation_info_df,
mm_delta = mm_delta,
tm_radius = list(fixed = tm_radius_fixed,
fixed_nm = tm_radius_fixed_nm,
vary = tm_radius_vary,
vary_nm = tm_radius_vary_nm,
smooth_vary = tm_radius_vary_update_list,
smooth_vary_nm = tm_radius_vary_nm_update_list),
truth_df_inner = truth_df,
simulations_group_df_inner = simulations_group_df,
ordering_list = ordering_list,
parameters = c("mm_delta_prop" = .2,
"sigma_percentage" = percentage_inner)))
}
}
testthat::test_that("test simulation_based_conformal_1d_complex", {
create_smooth_function <- function(df){
eval(parse(text = paste0("smooth_function <- function(x,y){
tryCatch({
inner_ss <- smooth.spline(x,y, df = ",
df,
")
return(predict(inner_ss,x)$y)},
error = function(cond){
message(sprintf(\"returning y: error in size of x (%i < 4)\", length(x)))
return(y)
},
warning = function(cond){
message(sprintf(paste(\"returning y: error in size of x\",
\"relative to size of df (%i < %i)\"),
length(x), ", df, "))
return(y)
}
)
}")))
return(smooth_function)
}
inner_moving <- function(x, y, window = 5, fill_left = T,
fun = min){
n <- length(y)
if (n < window){
message(sprintf(paste("returning y: error in size of x",
"relative to size of window (%i < %i)"),
n, window))
return(y)
}
y_smooth <- rep(NA, n)
for (idx in window:n){
y_smooth[idx] <- fun(y[(idx-window+1):idx])
}
if (fill_left){
y_smooth[1:(window-1)] <- y_smooth[window]
}
return(y_smooth)
}
create_min_smooth_function <- function(window){
eval(parse(text = paste0("out_function <- function(x,y){
inner_moving(x,y, window = ", window,", fill_left = T, fun = min)
}")))
return(out_function)
}
smooth_functions <- list("window10" = create_min_smooth_function(10))
set.seed(50)
sim_df <- data.frame(matrix(c(rnorm(100, mean = -2), rnorm(100, mean = 2)),
byrow = TRUE, ncol = 2)) %>%
rename(x = "X1", y = "X2")
test_df <- data.frame(x = c(2,-1, -2.5),
y = c(2,-3.5, 2.5))
if (FALSE){
sim_df %>% ggplot() +
geom_point(aes(x=x,y=y)) +
geom_point(data = test_df, aes(x=x,y=y),color ="red")
}
# we expect that the first will have a higher conformal score than the second,
# and the third will have a conformal score of 0
out <- simulation_based_conformal_1d_complex(truth_df = test_df,
simulations_df = sim_df,
smooth_functions = smooth_functions, #named list
data_column_names = c("x","y"),
.maxT = 100,
.eps = 1e-07,
.diff_eps = 1e-06, return_min = F)
#out$conformal_score
testthat::expect_true(all(out$conformal_score$vary >=
out$conformal_score$smooth_vary$window10))
testthat::expect_true(all(out$conformal_score$vary_nm >=
out$conformal_score$smooth_vary_nm$window10))
list_cs <- out$conformal_score[c("fixed", "fixed_nm", "vary", "vary_nm")]
list_cs[[5]] <- out$conformal_score$smooth_vary$window10
list_cs[[6]]<- out$conformal_score$smooth_vary_nm$window10
cs_mat_idx <- 1
for (cs_mat in list_cs){
testthat::expect_true(all(diff(cs_mat$containment_val) <= rep(0,2)))
if (!(cs_mat_idx %in% 3:4)){
testthat::expect_equal(cs_mat$containment_val[3], 0)
}
testthat::expect_gt(cs_mat$containment_val[1], 60) # should be higher - pretty central...
cs_mat_idx <- cs_mat_idx + 1
}
})
#' @param conformal_score_cut note this can be an integer, string percentage or
# fraction, but relates to the conformal cut (not the confidence level).
#'
# simulation_based_conformal_1d_region <- function(simulations_grouped_df,
# data_column_names = c("y"),
# conformal_score_cut = .9,
# delta_prop = .8){
#
# if(!EpiCompare::is.wholenumber(conformal_score_cut)){
# if(is.character(conformal_score_cut)){
# inner_percent <- EpiCompare::check_character_percent(conformal_score_cut)
# } else {
# inner_percent <- conformal_score_cut
# }
# conformal_score_cut <- ceiling(inner_percent * nrow(simulations_grouped_df))
# }
#
# assertthat::assert_that(inherits(simulations_grouped_df, "grouped_df"))
# group_names <- names(group_keys(simulations_grouped_df))
#
#
# simulations_group_df_inner <- simulations_grouped_df %>%
# dplyr::select(dplyr::one_of(c(group_names, data_column_names)))
#
# group_info <- simulations_group_df_inner %>% group_keys()
#
# dist_mat <- simulations_group_df_inner %>% group_split() %>%
# do.call(rbind, .) %>% # match group_info ordering
# select(one_of(data_column_names)) %>%
# dist() %>% as.matrix()
#
# tdm_sims <- EpiCompare::tidy_dist_mat(dist_mat, group_info, group_info)
#
# # sigma selection
#
# sigma_size <- c("20%" = .2, "25%" = .25, "30%" = .3,
# "35%" = .35, "40%" = .4, "45%" = .45)
#
# percentage <- names(sigma_size)[stats::quantile(as.matrix(tdm_sims), sigma_size) > 0][1]
#
#
# # rank_df
# pseudo_density_df <- EpiCompare::distance_psuedo_density_function(
# tdm_sims,
# sigma = percentage, df_out = T) %>%
# mutate(ranking = rank(psuedo_density,ties.method = "min")) #spelling error... :(
#
# assertthat::assert_that(all(!is.na(pseudo_density_df$psuedo_density)),
# msg = paste("internal error in",
# "distance_psuedo_density_function",
# "function's sigma selection."))
#
# proportion_points_not_included <- 1 - delta_prop
#
# top_points <- simulations_group_df_inner %>%
# left_join(pseudo_density_df, by = group_names) %>%
# mutate(keep = ranking > ceiling(proportion_points_not_included*nrow(simulations_group_df_inner))) %>%
# ungroup() %>% filter(keep) %>%
# select(one_of(data_column_names))
#
# mm_delta <- get_delta_nn(top_points)
#
# conformal_band_points <- simulations_group_df_inner %>%
# left_join(pseudo_density_df, by = group_names) %>%
# mutate(keep = ranking > conformal_score_cut) %>%
# ungroup() %>% filter(keep) %>%
# select(one_of(data_column_names))
#
#
# return(list(conformal_band_points = conformal_band_points,
# mm_delta = mm_delta))
# }
## Conformal Calibration Step ----------------
#
# in this step, we get the distribution of all conformal scores for the
# calibration set.
#
file_exists <- check_file_exists(sprintf("calibration_information_1d_%s_sims_delta_size_%s.Rdata",
n_simulations, delta_prop_string),
dir = ".")
if (!file_exists | rerun){
library(parallel)
library(foreach)
library(doSNOW)
max_cores <- parallel::detectCores()
cl <- makeCluster(max_cores-4)
registerDoSNOW(cl)
iterations <- nrow(calibration_set)
pb <- txtProgressBar(max = iterations, style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
n_sims <- n_simulations
verbose <- T
number_points <- 50
calibration_conformal_scores <- foreach(calibration_idx = 1:nrow(calibration_set),
#.combine = c,
.options.snow = opts,
.packages = c("dplyr", "tidyr",
"EpiCompare",
"simulationBands")) %dopar%{
x_inner <- calibration_set$x[calibration_idx]
obs_inner <- calibration_set[calibration_idx,]
inner_sim <- data.frame(x = rep(x_inner, n_sims))
inner_sim$y <- sapply(inner_sim$x, function(x) {simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
inner_sim$idx <- paste0("sim",1:n_sims)
inner_sim <- inner_sim %>% group_by(idx)
conformal_score <- simulation_based_conformal_1d(truth_df = obs_inner,
simulations_grouped_df = inner_sim,
data_column_names = c("y"),
delta_prop = delta_prop)
return(conformal_score)
}
conformal_score_vector <- sapply(calibration_conformal_scores, function(x) x$conformal_score)
mm_delta_vector <- sapply(calibration_conformal_scores, function(x) x$mm_delta)
save(calibration_conformal_scores,
file = sprintf("calibration_information_1d_%s_sims_delta_size_%s.Rdata",
n_simulations, delta_prop_string))
parallel::stopCluster(cl)
} else {
load(sprintf("calibration_information_1d_%s_sims_delta_size_%s.Rdata",
n_simulations, delta_prop_string))
conformal_score_vector <- sapply(calibration_conformal_scores, function(x) x$conformal_score)
mm_delta_vector <- sapply(calibration_conformal_scores, function(x) x$mm_delta)
}
df_conformal_calibration_fit_info <- data.frame(
idx = 1:nrow(calibration_set),
cs = conformal_score_vector,
mm_delta = mm_delta_vector)
df_conformal_calibration_ecdf_info <- data.frame(
cs = conformal_score_vector,
ecdf = ecdf(conformal_score_vector)(conformal_score_vector))
vis1_d <- df_conformal_calibration_fit_info %>%
ggplot() +
geom_histogram(aes(x = cs)) +
labs(x = "simulation conformal score",
title = "calibration set")
vis2_d <- df_conformal_calibration_ecdf_info %>%
ggplot() +
geom_line(aes(x = cs, y = ecdf)) +
labs(x = "simulation conformal score",
y = "empirical cumulative distribution function",
title = "calibration set") +
ylim(0,1)
vis3_d <- df_conformal_calibration_fit_info %>%
ggplot() +
geom_point(aes(y = cs, x = mm_delta), alpha = .1) +
labs(y = "simulation conformal score",
x = "delta for delta ball around filament compression points",
title = "calibration set")
vis4_d <- df_conformal_calibration_fit_info %>%
ggplot() +
geom_point(aes(x = idx, y = cs)) +
xlim(0,20)
vis5_d <- df_conformal_calibration_fit_info %>%
ggplot() +
geom_point(aes(x = idx, y = mm_delta)) +
xlim(0,20)
gridExtra::grid.arrange(vis1_d, vis2_d,
vis3_d, vis4_d,
vis5_d,
nrow = 3)
if (save_fig){
ggvis <- gridExtra::arrangeGrob(vis1_d + theme_minimal() +
labs(title = "Distribution of \nsimulation conformal score",
x = "simulation-based conformal score (\"radius\")",
y = "# of calibration set"),
vis2_d + theme_minimal() +
labs(x = "simulation-based conformal score (\"radius\")",
title = "Cumulative distribution of \nsimulation conformal score"),
layout_matrix = matrix(c(1,1,1,1,2,2,
1,1,1,1,2,2,
1,1,1,1,2,2),
nrow = 3, byrow = T))
ggplot2::ggsave(plot = ggvis,
width = 12,
height = 5,
filename = sprintf("quick_images/simulation_bands_1d_delta_size_%s.png",
delta_prop_image_string))
ggvis2 <- gridExtra::arrangeGrob(vis1_d + theme_minimal() +
labs(title = "Distribution of \\\\simulation conformal score",
x = "simulation-based conformal score ('radius')",
y = "number of obs in calibration set"),
vis2_d + theme_minimal() +
labs(x = "simulation-based conformal score ('radius')",
title = "Cumulative distribution of \nsimulation conformal score"),
layout_matrix = matrix(c(1,1,1,1,2,2,
1,1,1,1,2,2,
1,1,1,1,2,2),
nrow = 3, byrow = T))
tikz(file = sprintf("quick_images/simulation_bands_1d_delta_size_%s.tex",
delta_prop_image_string),
width = 12,
height = 5)
plot(ggvis2)
dev.off()
}
## Test set ---------------
# just some info:
conformal_score_vector %>% stats::quantile(1-c(.6,.9)) %>% floor()
ecdf(conformal_score_vector)(99) # .925 ()
file_exists <- check_file_exists(sprintf(
paste0("conformal_test_information_1d_%s",
"_sims_delta_size_%s_confidence_level_%s.Rdata"),
n_simulations, delta_prop_string, confidence_level_string),
dir = ".")
if (!file_exists | rerun){
library(parallel)
library(foreach)
library(doSNOW)
max_cores <- parallel::detectCores()
cl <- makeCluster(max_cores-4)
registerDoSNOW(cl)
iterations <- nrow(test_set)
pb <- txtProgressBar(max = iterations, style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
n_sims <- n_simulations
verbose <- T
number_points <- 50
cut <- conformal_score_vector %>% stats::quantile(1-confidence_level) %>%
floor()
calibration_conformal_scores <- foreach(calibration_idx = 1:nrow(test_set),
#.combine = c,
.options.snow = opts,
.packages = c("dplyr", "tidyr",
"simulationBands",
"EpiCompare")) %dopar%{
x_inner <- test_set$x[calibration_idx]
inner_sim <- data.frame(x = rep(x_inner, n_sims))
inner_sim$y <- sapply(inner_sim$x, function(x) {simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
inner_sim$idx <- paste0("sim",1:n_sims)
inner_sim <- inner_sim %>% group_by(idx)
conformal_score <- simulation_based_conformal_1d_region(
simulations_grouped_df = inner_sim,
data_column_names = c("y"),
conformal_score_cut = cut)
return(conformal_score)
}
save(calibration_conformal_scores,
file = sprintf(
paste0("conformal_test_information_1d_%s",
"_sims_delta_size_%s_confidence_level_%s.Rdata"),
n_simulations, delta_prop_string, confidence_level_string))
parallel::stopCluster(cl)
} else {
load(sprintf(
paste0("conformal_test_information_1d_%s",
"_sims_delta_size_%s_confidence_level_%s.Rdata"),
n_simulations, delta_prop_string, confidence_level_string))
}
file_exists <- check_file_exists(sprintf(
paste0("conformal_test_information_discrete_1d_%s",
"_sims_delta_size_%s.Rdata"), n_simulations, delta_prop_string),
dir = ".")
if (!file_exists | rerun){
library(parallel)
library(foreach)
library(doSNOW)
max_cores <- parallel::detectCores()
cl <- makeCluster(max_cores-4)
registerDoSNOW(cl)
iterations <- nrow(test_set_discrete)
pb <- txtProgressBar(max = iterations, style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
n_sims <- n_simulations
verbose <- T
number_points <- 50
cut <- conformal_score_vector %>% stats::quantile(1 - confidence_level) %>% floor()
calibration_conformal_scores_discrete <- foreach(calibration_idx = 1:nrow(test_set_discrete),
#.combine = c,
.options.snow = opts,
.packages = c("dplyr", "tidyr",
"EpiCompare",
"simulationBands")) %dopar%{
x_inner <- test_set_discrete$x[calibration_idx]
inner_sim <- data.frame(x = rep(x_inner, n_sims))
inner_sim$y <- sapply(inner_sim$x, function(x) {simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
inner_sim$idx <- paste0("sim", 1:n_sims)
inner_sim <- inner_sim %>% group_by(idx)
conformal_score <- simulation_based_conformal_1d_region(
simulations_grouped_df = inner_sim,
data_column_names = c("y"),
conformal_score_cut = cut,
delta_prop = delta_prop)
return(conformal_score)
}
save(calibration_conformal_scores_discrete,
file = sprintf(paste0("conformal_test_information_discrete_1d_%s",
"_sims_delta_size_%s.Rdata"), n_simulations, delta_prop_string))
parallel::stopCluster(cl)
} else {
load(sprintf(paste0("conformal_test_information_discrete_1d_%s",
"_sims_delta_size_%s.Rdata"), n_simulations, delta_prop_string))
}
### visualization of above ------------------
delta_y = .001
yy <- seq(-8, 8, by = delta_y)
containment_vec <- function(points, yy, radius){
if (nrow(points) == 0){
return(rep(FALSE, length(yy)))
}
nn2_vals <- RANN::nn2(data = as.matrix(points),
query = as.matrix(yy),
k = 1,
treetype = "kd")
contained <- nn2_vals$nn.dists < radius
return(contained)
}
if (verbose) {
pb <- progress::progress_bar$new(
format = "Containment calculation [:bar] :percent eta: :eta",
total = nrow(test_set_discrete), clear = FALSE, width = 80)
}
df_grid_all <- data.frame()
df_containment_prop_all <- data.frame()
for (calibration_idx in 1:nrow(test_set_discrete)){
# for visual
x_inner <- test_set_discrete$x[calibration_idx]
points <- calibration_conformal_scores_discrete[[calibration_idx]]$conformal_band_points
radius <- calibration_conformal_scores_discrete[[calibration_idx]]$mm_delta
contained <- containment_vec(points, yy, radius)
df_inner <- data.frame(
calibration_idx = calibration_idx,
x = x_inner,
y = yy,
contained = contained)
df_grid_all <- rbind(df_grid_all, df_inner)
if (verbose) {
pb$tick()
}
# for containment of true observations:
inner_sim_y <- sapply(rep(x_inner, n_sims_containment),
function(x) {
simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
contained_check <- containment_vec(points, inner_sim_y, radius)
df_containment_prop_inner <- data.frame(
x = x_inner,
containment_prop = mean(contained_check))
df_containment_prop_all <- rbind(df_containment_prop_all,
df_containment_prop_inner)
}
vis1 <- df_grid_all %>% filter(contained) %>%
ggplot() +
geom_point(data = test_set, aes(x = x, y = y), color = "blue") +
geom_tile(aes(x = x, y = y), alpha = .5) +
labs(title = sprintf(paste("Our Simulation-based \nConformal Inference",
"Prediction Regions (%s)"),
confidence_level_string)) +
xlim(-1.5, 1.5) + ylim(-8,8) +
theme_minimal()
vis2 <- df_containment_prop_all %>%
ggplot() +
geom_line(aes(x = x, y = containment_prop)) +
ylim(0,1) + xlim(-1.5, 1.5) +
geom_hline(yintercept = confidence_level, color = "blue") +
geom_label(data = data.frame(x = 1.5, y = confidence_level,
label = confidence_level),
aes(x = x, y = y, label = label)) +
labs(y = "containment proportion",
title = sprintf(paste("containment proportion of %s ys\nrandomly",
"sampled per x value"),n_sims_containment)) +
theme_minimal()
gridExtra::grid.arrange(vis1, vis2,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
if (save_image){
ggvis <- gridExtra::arrangeGrob(vis1, vis2,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
ggplot2::ggsave(plot = ggvis, width = 5, height = 7,
filename =
sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_%s",
"_sims_delta_size_%s_confidence_level_%s.png"),
n_simulations, delta_prop_image_string,
confidence_level_image_string))
vis1_tikz <- df_grid_all %>% filter(contained) %>%
ggplot() +
geom_point(data = test_set, aes(x = x, y = y), color = "blue") +
geom_tile(aes(x = x, y = y), alpha = .5) +
labs(title = paste("Simulation-based \\\ Conformal Inference",
"Prediction Regions (60\\%)")) +
xlim(-1.5, 1.5) + ylim(-8,8) +
theme_minimal()
ggvis_tikz <- gridExtra::arrangeGrob(vis1_tikz, vis2,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
tikz(file = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_%s",
"_sims_delta_size_%s_confidence_level_%s.tex"),
n_simulations, delta_prop_image_string,
confidence_level_image_string))
plot(ggvis_tikz)
dev.off()
}
## Discrete CDE based approaches
## "Global" CD approach
The approach here will be similar to what a has done before - in a discrete fashion.
calibration_info <- calibration_set
calibration_info$calibration_cde_values <- sapply(1:nrow(calibration_set),
function(r_idx){
simulationBands::cde_lei_wassserman(calibration_set$x[r_idx])(calibration_set$y[r_idx])
})
cutoff_g_cd <- stats::quantile(calibration_info$calibration_cde_values,
1- confidence_level)
delta_y = .001
yy <- seq(-8, 8, by = delta_y)
if (verbose) {
pb <- progress::progress_bar$new(
format = "Containment calculation [:bar] :percent eta: :eta",
total = nrow(test_set_discrete), clear = FALSE, width = 80)
}
df_grid_all_g_cd <- data.frame()
df_containment_prop_all_g_cd <- data.frame()
for (calibration_idx in 1:nrow(test_set_discrete)){
# for visual
x_inner <- test_set_discrete$x[calibration_idx]
cde_values <- simulationBands::cde_lei_wassserman(x_inner)(yy)
contained <- cde_values > cutoff_g_cd
df_inner <- data.frame(
calibration_idx = calibration_idx,
x = x_inner,
y = yy,
contained = contained,
cde = cde_values)
df_grid_all_g_cd <- rbind(df_grid_all_g_cd, df_inner)
if (verbose) {
pb$tick()
}
# for containment of true observations:
inner_sim_y <- sapply(rep(x_inner, n_sims_containment),
function(x) {
simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
sim_cde_values <- simulationBands::cde_lei_wassserman(x_inner)(inner_sim_y)
contained_check <- sim_cde_values > cutoff_g_cd
df_containment_prop_inner <- data.frame(
x = x_inner,
containment_prop = mean(contained_check))
df_containment_prop_all_g_cd <- rbind(df_containment_prop_all_g_cd,
df_containment_prop_inner)
}
vis1g <- df_grid_all_g_cd %>% filter(contained) %>%
ggplot() +
geom_point(data = test_set, aes(x = x, y = y), color = "blue") +
geom_tile(aes(x = x, y = y), alpha = .5) +
labs(title = sprintf(paste("\"Global\" CD*\nConformal",
"Inference Prediction Regions (%s)"),
confidence_level_string)) +
xlim(-1.5,1.5) + ylim(-8,8) +
theme_minimal()
vis2g <- df_containment_prop_all_g_cd %>%
ggplot() +
geom_line(aes(x = x, y = containment_prop)) +
ylim(0,1) + xlim(-1.5,1.5) +
geom_hline(yintercept = confidence_level, color = "blue") +
geom_label(data = data.frame(x = 1.5, y = confidence_level,
label = confidence_level),
aes(x = x, y = y, label = label)) +
labs(y = "containment proportion",
title = sprintf(paste("containment proportion of",
"%s ys\nrandomly sampled per x value"),
n_sims_containment)) +
theme_minimal()
gridExtra::grid.arrange(vis1g, vis2g,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
if (save_image){
ggvis <- gridExtra::arrangeGrob(vis1g, vis2g,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
ggplot2::ggsave(plot = ggvis,
filename = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_g_cd_%s",
"_sims_delta_size_%s_confidence_level_%s.png"),
n_simulations, delta_prop_image_string,
confidence_level_image_string))
vis1g_tikz <- df_grid_all_g_cd %>% filter(contained) %>%
ggplot() +
geom_point(data = test_set, aes(x = x, y = y), color = "blue") +
geom_tile(aes(x = x, y = y), alpha = .5) +
labs(title = paste('"Global" CD* \\\\\ Conformal',
"Inference Prediction Regions (60\\%)")) +
xlim(-1.5,1.5) + ylim(-8,8) +
theme_minimal()
ggvis_tex <- gridExtra::arrangeGrob(vis1g_tikz, vis2g,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
tikz(file = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_g_cd_%s",
"_sims_delta_size_%s_confidence_level_%s.tex"),
n_simulations, delta_prop_image_string,
confidence_level_image_string))
plot(ggvis_tex)
dev.off()
}
## Lei & Wasserman --------
# We're going to break X in 8 bins (the same number as Figure 4, Lei2014)
cutoff_lei_wasserman <- calibration_info %>%
mutate(x_cut = cut(x, breaks = seq(-1.5-2*.Machine$double.eps,
1.5+2*.Machine$double.eps, length.out = 9)),
x_cut_lower = cut_to_numeric(x_cut,.lower = T),
x_cut_upper = cut_to_numeric(x_cut,.lower = F)) %>%
group_by(x_cut, x_cut_lower, x_cut_upper) %>%
summarize(cutoff_lei_wasserman = quantile(calibration_cde_values,
1 - confidence_level),
.groups = "drop_last")
delta_y = .001
yy <- seq(-8, 8, by = delta_y)
if (verbose) {
pb <- progress::progress_bar$new(
format = "Containment calculation [:bar] :percent eta: :eta",
total = nrow(test_set_discrete), clear = FALSE, width = 80)
}
df_grid_all_lw_cd <- data.frame()
df_containment_prop_all_lw_cd <- data.frame()
for (calibration_idx in 1:nrow(test_set_discrete)){
# for visual
x_inner <- test_set_discrete$x[calibration_idx]
cde_values <- simulationBands::cde_lei_wassserman(x_inner)(yy)
group_id <- cut(x_inner, breaks = seq(-1.5-2*.Machine$double.eps,
1.5+2*.Machine$double.eps, length.out = 9))
inner_cutoff <- cutoff_lei_wasserman$cutoff_lei_wasserman[cutoff_lei_wasserman$x_cut == group_id]
assertthat::assert_that(length(inner_cutoff) == 1)
contained <- cde_values > inner_cutoff
df_inner <- data.frame(
calibration_idx = calibration_idx,
x = x_inner,
y = yy,
contained = contained)
df_grid_all_lw_cd <- rbind(df_grid_all_lw_cd, df_inner)
if (verbose) {
pb$tick()
}
# for containment of true observations:
inner_sim_y <- sapply(rep(x_inner, n_sims_containment),
function(x) {
simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
sim_cde_values <- simulationBands::cde_lei_wassserman(x_inner)(inner_sim_y)
contained_check <- sim_cde_values > inner_cutoff
df_containment_prop_inner <- data.frame(
x = x_inner,
containment_prop = mean(contained_check))
df_containment_prop_all_lw_cd <- rbind(df_containment_prop_all_lw_cd,
df_containment_prop_inner)
}
vis1lw <- df_grid_all_lw_cd %>% filter(contained) %>%
ggplot() +
geom_point(data = test_set, aes(x = x, y = y), color = "blue") +
geom_tile(aes(x = x, y = y), alpha = .5) +
labs(title = sprintf(paste("Lei and Wasserman (2014) ~Local~\nConformal",
"Inference Prediction Regions (%s)"),
confidence_level_string)) +
xlim(-1.5,1.5) + ylim(-8,8) +
theme_minimal()
vis2lw <- df_containment_prop_all_lw_cd %>%
ggplot() +
geom_line(aes(x = x, y = containment_prop)) +
ylim(0,1) + xlim(-1.5,1.5) +
geom_hline(yintercept = confidence_level, color = "blue") +
geom_label(data = data.frame(x = 1.5, y = confidence_level,
label = confidence_level),
aes(x = x, y = y, label = label)) +
labs(y = "containment proportion",
title = sprintf(paste("containment proportion of %s",
"ys\nrandomly sampled per x value"),
n_sims_containment)) +
theme_minimal()
gridExtra::grid.arrange(vis1lw, vis2lw,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
if (save_image){
ggvis <- gridExtra::arrangeGrob(vis1lw, vis2lw,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
ggplot2::ggsave(plot = ggvis,
filename = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_lw_cd_%s",
"_sims_delta_size_%s_confidence_level_%s.png"),
n_simulations, delta_prop_image_string,
confidence_level_image_string))
vis1lw_tikz <- df_grid_all_lw_cd %>% filter(contained) %>%
ggplot() +
geom_point(data = test_set, aes(x = x, y = y), color = "blue") +
geom_tile(aes(x = x, y = y), alpha = .5) +
labs(title = paste("Lei and Wasserman (2014) $\\sim$ Local $\\sim$ \\\\\ Conformal",
"Inference Prediction Regions (60\\%)")) +
xlim(-1.5,1.5) + ylim(-8,8) +
theme_minimal()
ggvis_tikz <- gridExtra::arrangeGrob(vis1lw_tikz, vis2lw,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
tikz(file = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_lw_cd_%s",
"_sims_delta_size_%s_confidence_level_%s.tex"),
n_simulations, delta_prop_image_string,
confidence_level_image_string))
plot(ggvis_tikz)
dev.off()
}
## Izbicki 2020 ------------------
### calibration
delta_y = .001
yy <- seq(-8, 8, by = delta_y)
n_sims <- n_simulations
if (verbose) {
pb <- progress::progress_bar$new(
format = "Containment calculation [:bar] :percent eta: :eta",
total = nrow(calibration_set), clear = FALSE, width = 80)
}
df_grid_all_g_cd_calibration_set <- data.frame()
df_containment_prop_all_g_cd_calibration_set <- data.frame()
for (calibration_idx in 1:nrow(calibration_set)){
# for visual
x_inner <- calibration_set$x[calibration_idx]
cde_values <- simulationBands::cde_lei_wassserman(x_inner)(yy)
contained <- cde_values > cutoff_g_cd
df_inner <- data.frame(
calibration_idx = calibration_idx,
x = x_inner,
y = yy,
contained = contained,
cde = cde_values)
df_grid_all_g_cd_calibration_set <- rbind(df_grid_all_g_cd_calibration_set, df_inner)
if (verbose) {
pb$tick()
}
# for containment of true observations:
inner_sim_y <- sapply(rep(x_inner, n_sims_containment),
function(x) {
simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
sim_cde_values <- simulationBands::cde_lei_wassserman(x_inner)(inner_sim_y)
contained_check <- sim_cde_values > cutoff_g_cd
df_containment_prop_inner <- data.frame(
x = x_inner,
containment_prop = mean(contained_check))
df_containment_prop_all_g_cd_calibration_set <-
rbind(df_containment_prop_all_g_cd_calibration_set,
df_containment_prop_inner)
}
#yy
t_range <- range(df_grid_all_g_cd_calibration_set$cde)
n_levels <- 1000
t_grid <- seq(0, t_range[2], length.out = n_levels)
n_calibrate <- length(unique(df_grid_all_g_cd_calibration_set$calibration_idx))
n_y <- length(yy)
n_t <- length(t_grid)
k <- max(round(n_calibrate/100),1)
g_calibrate_cde <- matrix(NA, n_calibrate,
n_t)
if (verbose) {
pb <- progress::progress_bar$new(
format = "Make Profile Matrix (Calibration) [:bar] :percent eta: :eta",
total = n_calibrate, clear = FALSE, width = 80)
}
for (calibration_idx in 1:n_calibrate){
cde_values <- df_grid_all_g_cd_calibration_set$cde[df_grid_all_g_cd_calibration_set$calibration_idx == calibration_idx]
g_calibrate_cde[calibration_idx,] <- predictionBands:::profile_density(t_grid, yy,
cde_values)
if (verbose) {
pb$tick()
}
}
kmeans_result <- try(LICORS::kmeanspp(g_calibrate_cde,k=k),
silent = TRUE)
if(class(kmeans_result)=="try-error"){
kmeans_result <- kmeans(g_calibrate_cde,centers = k)
}
centers_kmeans <- kmeans_result$centers
library(FNN)
which_partition_calibration <- RANN::nn2(data = centers_kmeans, g_calibrate_cde, k = 1)$nn.idx
#predictionBands:::which_neighbors(centers_kmeans,
#g_calibrate_cde,1)
calibration_info$partition_id <- which_partition_calibration
rm(g_calibrate_cde)
### test
n_test <- length(unique(df_grid_all_g_cd$calibration_idx))
g_test_cde <- matrix(NA, n_test,
n_t)
if (verbose) {
pb <- progress::progress_bar$new(
format = "Make Profile Matrix (Test) [:bar] :percent eta: :eta",
total = n_test, clear = FALSE, width = 80)
}
for (calibration_idx in 1:n_test){
cde_values <- df_grid_all_g_cd$cde[df_grid_all_g_cd$calibration_idx == calibration_idx]
g_test_cde[calibration_idx,] <- predictionBands:::profile_density(t_grid, yy,
cde_values)
if (verbose) {
pb$tick()
}
}
which_partition_test <- RANN::nn2(data = centers_kmeans, g_test_cde, k = 1)$nn.idx
#predictionBands:::which_neighbors(centers_kmeans,
# g_test_cde,1)
cutoff_izbicki <- calibration_info %>%
mutate(partition_id = as.numeric(partition_id)) %>%
group_by(partition_id) %>%
summarize(cutoff_izbicki = quantile(calibration_cde_values,
1 - confidence_level))
delta_y = .001
yy <- seq(-8, 8, by = delta_y)
if (verbose) {
pb <- progress::progress_bar$new(
format = "Containment calculation [:bar] :percent eta: :eta",
total = nrow(test_set_discrete), clear = FALSE, width = 80)
}
df_grid_all_iz_cd <- data.frame()
df_containment_prop_all_iz_cd <- data.frame()
for (calibration_idx in 1:nrow(test_set_discrete)){
# for visual
x_inner <- test_set_discrete$x[calibration_idx]
cde_values <- simulationBands::cde_lei_wassserman(x_inner)(yy)
partition_id <- which_partition_test[calibration_idx]
inner_cutoff <- cutoff_izbicki$cutoff_izbicki[cutoff_izbicki$partition_id == partition_id]
contained <- cde_values > inner_cutoff
df_inner <- data.frame(
calibration_idx = calibration_idx,
x = x_inner,
y = yy,
contained = contained)
df_grid_all_iz_cd <- rbind(df_grid_all_iz_cd, df_inner)
if (verbose) {
pb$tick()
}
# for containment of true observations:
inner_sim_y <- sapply(rep(x_inner, n_sims_containment),
function(x) {
simulationBands::lei_wasserman_data_conditional_simulate(x, n = 1)[[1]]$sim})
sim_cde_values <- simulationBands::cde_lei_wassserman(x_inner)(inner_sim_y)
contained_check <- sim_cde_values > inner_cutoff
df_containment_prop_inner <- data.frame(
x = x_inner,
containment_prop = mean(contained_check))
df_containment_prop_all_iz_cd <- rbind(df_containment_prop_all_iz_cd,
df_containment_prop_inner)
}
vis1iz <- df_grid_all_iz_cd %>% filter(contained) %>%
ggplot() +
geom_point(data = test_set, aes(x = x, y = y), color = "blue") +
geom_tile(aes(x = x, y = y), alpha = .5) +
labs(title = sprintf(paste0("Izbicki et. al (2020) ~Local~\n",
"Conformal Inference Prediction Regions (%s)"),
confidence_level_string)) +
xlim(-1.5,1.5) + ylim(-8,8) +
theme_minimal()
vis2iz <- df_containment_prop_all_iz_cd %>%
ggplot() +
geom_line(aes(x = x, y = containment_prop)) +
ylim(0,1) + xlim(-1.5,1.5) +
geom_hline(yintercept = confidence_level, color = "blue") +
geom_label(data = data.frame(x = 1.5, y = confidence_level,
label = confidence_level),
aes(x = x, y = y, label = label)) +
labs(y = "containment proportion",
title = sprintf(paste("containment proportion of %s",
"ys\nrandomly sampled per x value"),
n_sims_containment)) +
theme_minimal()
gridExtra::grid.arrange(vis1iz, vis2iz,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
if (save_image){
ggvis <- gridExtra::arrangeGrob(vis1iz, vis2iz,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
ggplot2::ggsave(plot = ggvis, sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_iz_cd_%s",
"_sims_delta_size_%s_confidence_level_%s.png"),
n_simulations, delta_prop_image_string,
confidence_level_image_string))
vis1iz_tikz <- df_grid_all_iz_cd %>% filter(contained) %>%
ggplot() +
geom_point(data = test_set, aes(x = x, y = y), color = "blue") +
geom_tile(aes(x = x, y = y), alpha = .5) +
labs(title = paste0("Izbicki et. al (2020) $\\sim$ Local $\\sim$ \\\\\ ",
"Conformal Inference Prediction Regions (60\\%)")) +
xlim(-1.5,1.5) + ylim(-8,8) +
theme_minimal()
vis2iz_tikz <- df_containment_prop_all_iz_cd %>%
ggplot() +
geom_line(aes(x = x, y = containment_prop)) +
ylim(0,1) + xlim(-1.5,1.5) +
geom_hline(yintercept = confidence_level, color = "blue") +
geom_label(data = data.frame(x = 1.5, y = confidence_level,
label = confidence_level),
aes(x = x, y = y, label = label)) +
labs(y = "containment proportion",
title = sprintf(paste("containment proportion of %s",
"ys\\\\\ randomly sampled per x value"),
n_sims_containment)) +
theme_minimal()
ggvis_tikz <- gridExtra::arrangeGrob(vis1iz_tikz, vis2iz_tikz,
layout_matrix = matrix(c(1,1,1,1,
1,1,1,1,
1,1,1,1,
2,2,2,2,
2,2,2,2), byrow = T, ncol = 4))
tikz(file = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_iz_cd_%s",
"_sims_delta_size_%s_confidence_level_%s.tex"),
n_simulations, delta_prop_image_string,
confidence_level_image_string))
plot(ggvis_tikz)
dev.off()
}
# all together ----------
gridExtra::grid.arrange(grobs = list(
vis1g, vis1lw, vis1iz, vis1,
vis2g, vis2lw, vis2iz, vis2) %>%
lapply(function(g){g + theme(text = element_text(size = 15),
plot.title = element_text(size = 22))}),
layout_matrix = matrix(c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8),
ncol = 4*4, byrow = T),
bottom = paste("*a \"global\" version of",
"Izbicki et. al 2020's CD approach."))
if (save_image){
ggvis <- gridExtra::arrangeGrob(grobs = list(
vis1, vis1g, vis1lw, vis1iz,
vis2, vis2g, vis2lw, vis2iz ) %>%
lapply(function(g){g + theme(text = element_text(size = 6),
plot.title = element_text(size = 12))}),
layout_matrix = matrix(c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8),
ncol = 4*4, byrow = T),
bottom = paste("*a \"global\" version of",
"Izbicki et. al 2020's CD approach."))
ggplot2::ggsave(plot = ggvis, width = 17, height = 8,
filename = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_options_%s",
"_sims_delta_size_%s_confidence_level_%s.png"),
n_simulations, delta_prop_image_string,
confidence_level_image_string))
ggvis_tikz <- gridExtra::arrangeGrob(grobs = list(
vis1g_tikz, vis1lw_tikz, vis1iz_tikz, vis1_tikz,
vis2g, vis2lw, vis2iz_tikz, vis2) %>%
lapply(function(g){g + theme(text = element_text(size = 6),
plot.title = element_text(size = 12))}),
layout_matrix = matrix(c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8),
ncol = 4*4, byrow = T),
bottom = paste('*a "global" version of',
"Izbicki et. al 2020's CD approach."))
tikz(file = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_options_%s",
"_sims_delta_size_%s_confidence_level_%s_PRESENT.tex"),
n_simulations, delta_prop_image_string,
confidence_level_image_string),
width = 17, height = 8)
plot(ggvis_tikz)
dev.off()
}
## presentation size ------
gridExtra::grid.arrange(grobs = list(
vis1g, vis1lw, vis1iz, vis1,
vis2g, vis2lw, vis2iz, vis2) %>%
lapply(function(g){g + theme(text = element_text(size = 15),
plot.title = element_text(size = 22))}),
layout_matrix = matrix(c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8),
ncol = 4*4, byrow = T),
bottom = paste("*a \"global\" version of",
"Izbicki et. al 2020's CD approach."))
if (save_image){
ggvis <- gridExtra::arrangeGrob(grobs = c(list(
vis1 + labs(title = "Our Simulation-based Approach"),
vis1g + labs(title = "Global CD*"),
vis1lw + labs(title = "Lei and Wasserman\n(2014) ~Local~"),
vis1iz + labs(title = "Izbicki et al.\n(2020) ~Local~")
) %>%
lapply(function(g){g + theme(text = element_text(size = 18),
plot.title = element_text(size = 26))}),
list(
vis2 + labs(title =""),
vis2g + labs(title ="") ,
vis2lw + labs(title =""),
vis2iz + labs(title ="")
) %>%
lapply(function(g){g + theme(text = element_text(size = 18),
plot.title = element_text(size = 18))})),
layout_matrix = matrix(c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8),
ncol = 4*4, byrow = T),
top = grid::textGrob("Conformal Inference Prediction Regions (60%)",
gp = grid::gpar(fontsize = 32)),
bottom = grid::textGrob(paste("*a \"global\" version of",
"Izbicki et. al 2020's CD approach."),
gp = grid::gpar(fontsize = 26)))
ggplot2::ggsave(plot = ggvis, width = 17, height = 8,
filename = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_options_%s",
"_sims_delta_size_%s_confidence_level_%s_PRESENT.png"),
n_simulations, delta_prop_image_string,
confidence_level_image_string))
ggvis_tikz <- gridExtra::arrangeGrob(grobs = list(
vis1g_tikz, vis1lw_tikz, vis1iz_tikz, vis1_tikz,
vis2g, vis2lw, vis2iz_tikz, vis2) %>%
lapply(function(g){g + theme(text = element_text(size = 6),
plot.title = element_text(size = 12))}),
layout_matrix = matrix(c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,
5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8),
ncol = 4*4, byrow = T),
bottom = paste('*a "global" version of',
"Izbicki et. al 2020's CD approach."))
tikz(file = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_options_%s",
"_sims_delta_size_%s_confidence_level_%s.tex"),
n_simulations, delta_prop_image_string,
confidence_level_image_string),
width = 17, height = 8)
plot(ggvis_tikz)
dev.off()
}
## UAI size --------
gridExtra::grid.arrange(grobs = list(
vis1g, vis1lw, vis1iz, vis1,
vis2g, vis2lw, vis2iz, vis2) %>%
lapply(function(g){g + theme(text = element_text(size = 8),
plot.title = element_text(size = 15))}),
layout_matrix = matrix(c(1,1,1,1,2,2,2,2,
1,1,1,1,2,2,2,2,
1,1,1,1,2,2,2,2,
1,1,1,1,2,2,2,2,
5,5,5,5,6,6,6,6,
5,5,5,5,6,6,6,6,
rep(NA,8),
3,3,3,3,4,4,4,4,
3,3,3,3,4,4,4,4,
3,3,3,3,4,4,4,4,
3,3,3,3,4,4,4,4,
7,7,7,7,8,8,8,8,
7,7,7,7,8,8,8,8),
ncol = 4*2, byrow = T),
bottom = grid::textGrob(paste("*a \"global\" version of",
"Izbicki et. al 2020's CD approach."),
gp=grid::gpar(fontsize=15)))
if (save_image){
ggvis <- gridExtra::arrangeGrob(grobs = list(
vis1,vis1g, vis1lw, vis1iz,
vis2,vis2g, vis2lw, vis2iz) %>%
lapply(function(g){g + theme(text = element_text(size = 11),
plot.title = element_text(size = 11))}),
layout_matrix = matrix(c(1,1,1,1,2,2,2,2,
1,1,1,1,2,2,2,2,
1,1,1,1,2,2,2,2,
1,1,1,1,2,2,2,2,
5,5,5,5,6,6,6,6,
5,5,5,5,6,6,6,6,
rep(NA,8),
3,3,3,3,4,4,4,4,
3,3,3,3,4,4,4,4,
3,3,3,3,4,4,4,4,
3,3,3,3,4,4,4,4,
7,7,7,7,8,8,8,8,
7,7,7,7,8,8,8,8),
ncol = 4*2, byrow = T),
bottom = paste("*a \"global\" version of",
"Izbicki et. al 2020's CD approach."))
ggplot2::ggsave(plot = ggvis, width = 8, height = 12,
filename = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_options_%s",
"_sims_delta_size_%s_confidence_level_%s_UAI.png"),
n_simulations, delta_prop_image_string,
confidence_level_image_string))
ggvis_tikz <- gridExtra::arrangeGrob(grobs = list(
vis1g_tikz, vis1lw_tikz, vis1iz_tikz, vis1_tikz,
vis2g, vis2lw, vis2iz_tikz, vis2) %>%
lapply(function(g){g + theme(text = element_text(size = 6),
plot.title = element_text(size = 12))}),
layout_matrix = matrix(c(1,1,1,1,2,2,2,2,
1,1,1,1,2,2,2,2,
1,1,1,1,2,2,2,2,
1,1,1,1,2,2,2,2,
5,5,5,5,6,6,6,6,
5,5,5,5,6,6,6,6,
rep(NA,8),
3,3,3,3,4,4,4,4,
3,3,3,3,4,4,4,4,
3,3,3,3,4,4,4,4,
3,3,3,3,4,4,4,4,
7,7,7,7,8,8,8,8,
7,7,7,7,8,8,8,8),
ncol = 4*2, byrow = T),
bottom = paste('*a "global" version of',
"Izbicki et. al 2020's CD approach."))
tikz(file = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_options_%s",
"_sims_delta_size_%s_confidence_level_%s_UAI.tex"),
n_simulations, delta_prop_image_string,
confidence_level_image_string),
width = 17, height = 8)
plot(ggvis_tikz)
dev.off()
}
if (save_image){
library(cowplot)
vis_block_a <- plot_grid(vis1 + theme(text = element_text(size = 11),
plot.title = element_text(size = 11)),
vis2 + theme(text = element_text(size = 11),
plot.title = element_text(size = 11)),
ncol = 1, rel_heights = c(4,2),
labels = c("a)",""))
vis_block_b <- plot_grid(vis1g + theme(text = element_text(size = 11),
plot.title = element_text(size = 11)),
vis2g + theme(text = element_text(size = 11),
plot.title = element_text(size = 11)),
ncol = 1, rel_heights = c(4,2),
labels = c("b)",""))
vis_row_1 <- plot_grid(vis_block_a,
vis_block_b,
nrow = 1)
vis_block_c <- plot_grid(vis1lw + theme(text = element_text(size = 11),
plot.title = element_text(size = 11)),
vis2lw + theme(text = element_text(size = 11),
plot.title = element_text(size = 11)),
ncol = 1, rel_heights = c(4,2),
labels = c("c)",""))
vis_block_d <- plot_grid(vis1iz + theme(text = element_text(size = 11),
plot.title = element_text(size = 11)),
vis2iz + theme(text = element_text(size = 11),
plot.title = element_text(size = 11)),
ncol = 1, rel_heights = c(4,2),
labels = c("d)",""))
vis_row_2 <- plot_grid(vis_block_c,
vis_block_d,
nrow = 1)
vis_all <- plot_grid(vis_row_1,
vis_row_2,
nrow = 2)
ggsave2(vis_all, filename = sprintf(paste0("quick_images/sim_bands_conf",
"_test_discrete_1d_options_%s",
"_sims_delta_size_%s_confidence_level_%s_UAI_cowplot.png"),
n_simulations, delta_prop_image_string,
confidence_level_image_string),
height = 17, width = 8)
}
#install.packages("patchwork")
library(patchwork)
all_grobs <- list(
vis1, vis1g, vis1lw, vis1iz,
vis2, vis2g, vis2lw, vis2iz) %>%
lapply(function(g){g + theme(text = element_text(size = 11),
plot.title = element_text(size = 11))})
patchwork_layout <- "
AAAABBBB
AAAABBBB
AAAABBBB
AAAABBBB
EEEEFFFF
EEEEFFFF
########
CCCCDDDD
CCCCDDDD
CCCCDDDD
CCCCDDDD
GGGGHHHH
GGGGHHHH
"
vis_all_patchwork <- all_grobs[[1]] + all_grobs[[2]] + all_grobs[[3]] + all_grobs[[4]] +
all_grobs[[5]] + all_grobs[[6]] + all_grobs[[7]] + all_grobs[[8]] +
plot_layout(design = patchwork_layout)
vis_all_patchwork %>% plot()
vis_all_patchwork + plot_annotation()
save(vis1, vis1g, vis1lw, vis1iz,
vis2, vis2g, vis2lw, vis2iz,file = "vis_files.Rdata")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.