#' Mode clustering for euclidean data (distance based)
#'
#' Walks points up a Gaussian kernel density estimate to find their modes
#'
#' Similar to more complex code developed by Yen-Chi Chen and Chris Genovese
#'
#' @param X_mat matrix of data points (in rows), that define the kernel density
#' @param G_mat matrix of data points (in rows) that need to be "walked" to
#' their modes
#' @param sigma density scalar / sigma value
#' @param maxT int, maximum number of iterations
#' @param eps float, distance between 2 sequential points below which to stop
#' walking toward the mode for a given point's walk
#' @param verbose boolean, if this progression should be verbose
#' @param list_out if we should return every step of the process (for fun
#' visualization purposes and visual checks only)
#'
#' @return list of :
#' - matrix of \code{G_mat} points walked up all the way
#' - \code{t}, the actual number of steps taken
#' - if \code{list_out} is \code{TRUE} then list of matrices for each step,
#' else \code{NULL}
#' @export
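#' @examples
#' # Minimal sketch (toy data, not from the original package tests): points
#' # drawn from two well-separated bumps should walk toward two distinct modes.
#' set.seed(1)
#' X <- matrix(c(rnorm(25, mean = 0, sd = 0.25),
#' rnorm(25, mean = 3, sd = 0.25)), ncol = 1)
#' res <- psuedo_density_mode_cluster_1d(X, sigma = 0.5, verbose = FALSE)
#' res$t # number of steps actually taken
#' range(res$G_mat[1:25, ]) # should sit near the first mode
#' range(res$G_mat[26:50, ]) # should sit near the second mode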
psuedo_density_mode_cluster_1d <- function(X_mat, G_mat = X_mat,
sigma,
maxT = 30,
eps = 1e-05,
verbose = TRUE,
list_out = FALSE){
n_X <- nrow(X_mat)
n_G <- nrow(G_mat)
G_mat_out <- G_mat # just a copy
if (list_out){
G_mat_list <- list()
G_mat_list[[1]] <- G_mat_out
}
error <- rep(1e08, n_G) # initial error = massive error
max_error <- 1e08
t <- 0
if (verbose){
pb <- progress::progress_bar$new(
format = "mode clustering [:bar] :percent eta: :eta",
total = maxT, clear = FALSE, width = 60)
}
while (max_error > eps && t < maxT){
for (g_idx in 1:n_G) {
current_vs_X_dist_vals <- apply(
sweep(X_mat, 2, G_mat_out[g_idx,], "-")^2,
1,
function(row_vals) sqrt(sum(row_vals)))
pseudo_density <- exp(-current_vs_X_dist_vals^2/sigma)
denominator <- sum(pseudo_density)
if (denominator == 0) {
stop("Error: no distance between a observation in G_mat and all observations in X_mat > 0.");
}
scaled_props <- pseudo_density / denominator
new_point <- apply(
sweep(X_mat, 1, scaled_props, "*"),
2, sum)
# get error and update
error[g_idx] <- sqrt(sum((new_point-G_mat_out[g_idx,])^2))
G_mat_out[g_idx, ] <- new_point
}
if (list_out){
G_mat_list[[t+2]] <- G_mat_out
}
t <- t + 1
max_error <- max(error)
if (verbose){
pb$tick()
}
}
if (!list_out){
G_mat_list <- list()
}
return(list(G_mat = G_mat_out, t = t, G_mat_list = G_mat_list))
}
#' Find mode clusters of euclidean objects based on distance-based densities
#'
#' @param df_info data frame with rows of observations to mode cluster
#' @param position character vector of columns that define the location of the
#' points
#' @param naming_info a data frame with information about the data points (same
#' number of rows as \code{df_info})
#' @param sigma density scalar / sigma value
#' @param maxT int, maximum number of iterations
#' @param eps float, distance between 2 sequential points below which to stop
#' walking toward the mode for a given point's walk
#' @param diff_eps float, if the final step of each of the points is within
#' this distance from each other they will be grouped together.
#' @param verbose boolean, if this progression should be verbose
#'
#' @return data frame with \code{naming_info} and a \code{groupings} column which
#' indicates which mode group each observation is in.
#' @export
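#' @examples
#' # Rough sketch (assumes the package's dependencies, e.g. igraph, are
#' # installed): two separated bumps should yield two mode groups.
#' \dontrun{
#' df <- data.frame(y = c(rnorm(30, mean = 0, sd = 0.3),
#' rnorm(30, mean = 3, sd = 0.3)),
#' id = 1:60)
#' groups <- mode_clustering_1d(df, position = "y", naming_info = "id",
#' sigma = 0.5, verbose = FALSE)
#' table(groups$groupings)
#' }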
mode_clustering_1d <- function(df_info, position, naming_info = NULL, sigma, maxT = 50,
eps = 1e-05, diff_eps = 1e-05, verbose = TRUE){
mat_info_raw <- df_info[,position] %>% as.matrix()
if (!is.null(naming_info)){
g_names <- df_info[, naming_info, drop = F]
}
out <- psuedo_density_mode_cluster_1d(mat_info_raw, sigma = sigma,
maxT = maxT, eps = eps,
verbose = verbose, list_out = FALSE)
dist_mat <- as.matrix(stats::dist(out[[1]]))
adjmatrix <- dist_mat <= diff_eps
ig <- igraph::graph_from_adjacency_matrix(adjmatrix, mode = "undirected")
groupings <- igraph::components(ig, mode = "strong")$membership
if (!is.null(naming_info)){
if (inherits(g_names, "data.frame")){
return(cbind(g_names, groupings = groupings))
} else {
return(data.frame(g_names, groupings = groupings))
}
} else {
return(data.frame(groupings))
}
}
#' Calculates the minimum coverage radius for euclidean objects as we increase
#' the number of observations.
#'
#' @param ordered_sim_df data frame of points as rows, ordered by when they
#' should be added in the sequence
#' @param e_cols character vector of column names that define
#' locational information of points
#' @param verbose boolean, if this progression should be verbose
#'
#' @return list of \code{min_cover_vec} (minimum coverage radius at each step) and
#' \code{dist_mat} (cumulative max per path, with the column indexing the time step t)
#'
#' @export
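#' @examples
#' # Small worked example: four 1d points added in a fixed order.
#' ord_df <- data.frame(y = c(0, 1, 0.5, 2))
#' cover <- coverage_down_1d_single(ord_df, e_cols = "y")
#' cover$min_cover_vec # minimum coverage radius after each added point
#' cover$dist_mat # per-path cumulative radii (columns index the step)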
coverage_down_1d_single <- function(ordered_sim_df, e_cols,
verbose = FALSE){
n_obs <- nrow(ordered_sim_df)
point_dist_mat <- as.matrix(stats::dist(ordered_sim_df[,e_cols]))
dist_mat <- matrix(NA, nrow = n_obs, ncol = n_obs)
dist_mat[1,1] <- 0
min_cover_vec <- rep(NA, n_obs)
min_cover_vec[1] <- 0
if (n_obs > 1){
if (verbose){
pb <- progress::progress_bar$new(
format = "calculating minimum coverage [:bar] :percent eta: :eta",
total = n_obs - 1, clear = FALSE, width = 60)
}
inner_min_dist_cov_mat <- rep(Inf, n_obs)
for (r_idx in 2:n_obs){
# check distances to new point
new_distances_with_new_point <- point_dist_mat[1:(r_idx-1),r_idx]
inner_min_dist_cov_mat[r_idx] <- min(new_distances_with_new_point)
# update minimum coverage required for older points
inner_min_dist_cov_mat[1:(r_idx-1)] <- apply(
matrix(c(inner_min_dist_cov_mat[1:(r_idx-1)],
new_distances_with_new_point),
byrow = TRUE, nrow = 2),
2, min)
# calc current min_cover value
min_cover_vec[r_idx] <- max(inner_min_dist_cov_mat[1:r_idx])
# update radius values if necessary
dist_mat[1:(r_idx-1), r_idx] <- sapply(dist_mat[1:(r_idx-1), r_idx-1],
function(x) max(c(x, min_cover_vec[r_idx])))
dist_mat[r_idx, r_idx] <- min_cover_vec[r_idx]
if (verbose) {
pb$tick()
}
}
} else {
message("A mode cluster has only a single element.")
}
return(list(min_cover_vec = min_cover_vec, dist_mat = dist_mat))
}
#' Calculates the minimum coverage radius for euclidean objects as we increase
#' the number of observations relative to mode groupings.
#'
#' @param sim_df data frame of points as rows
#' @param e_cols character vector of column names that define
#' locational information of points
#' @param ordering_list list of indices, each vector is related to a single mode
#' cluster and the ordering defines how we build up the region.
#' @param verbose boolean, if this progression should be verbose
#'
#' @return list, one element per mode grouping in \code{ordering_list}, each an inner list of
#' \code{min_cover_vec} (minimum coverage radius at each step) and \code{dist_mat}
#' (cumulative max per path, with the column indexing the time step t)
#'
#' @export
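#' @examples
#' # Small worked example: two mode groups, each built up in a given order.
#' sim_df <- data.frame(y = c(0, 3, 0.4, 3.5))
#' coverage_down_1d_mult(sim_df, e_cols = "y",
#' ordering_list = list(c(1, 3), c(2, 4)))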
coverage_down_1d_mult <- function(sim_df, e_cols,
ordering_list, # list of lists...
verbose = FALSE){
n_lists <- length(ordering_list)
n <- nrow(sim_df)
assertthat::assert_that((sum(sapply(ordering_list, length)) == n) &
(all(sort(unlist(ordering_list)) == 1:n)),
msg = "sim_df length and length of ordering_list info should match")
if (verbose){
pb <- progress::progress_bar$new(
format = "calculating minimum coverage (per mode) [:bar] :percent eta: :eta",
total = n_lists, clear = FALSE, width = 60)
}
coverd_info <- list()
t <- 1
for (order_vec in ordering_list){
if ((ncol(sim_df) == 1) || length(order_vec) == 1){
coverd_info[[t]] <- coverage_down_1d_single(sim_df[order_vec,, drop = F],
e_cols = e_cols,
verbose = FALSE)
} else {
coverd_info[[t]] <- coverage_down_1d_single(sim_df[order_vec,],
e_cols = e_cols,
verbose = FALSE)
}
if (verbose){
pb$tick()
}
t <- t + 1
}
return(coverd_info)
}
#' Calculate distances between the rows of two data frames
#'
#' @param df1 data frame (n, p)
#' @param df2 data frame (m, p)
#'
#' @return matrix of distances (n, m)
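#'
#' @details For example, \code{my_pdist(data.frame(x = 0), data.frame(x = c(1, 3)))}
#' returns a 1 x 2 matrix with entries 1 and 3.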
my_pdist <- function(df1, df2){
nrow1 <- nrow(df1)
nrow2 <- nrow(df2)
out <- matrix(NA, nrow = nrow1, ncol = nrow2)
for (r_idx in 1:nrow1){
for (c_idx in 1:nrow2){
out[r_idx, c_idx] <- sqrt(sum((df1[r_idx,] - df2[c_idx,])^2))
}
}
return(out)
}
#' Helper function for simulation-based conformal score calculation,
#' potentially based on mode clustering and changing radius values.
#'
#' @param df_row_group data frame with new point information to be assessed
#' @param simulations_group_df data frame with multiple simulated points
#' @param data_column_names character vector of column names that define
#' locational information of points
#' @param simulation_info_df a data frame with information on each simulation's
#' pseudo-density estimate, mode clustering, and ranking of pseudo-density (within
#' cluster and overall). We expect \code{psuedo_density}, \code{groupings},
#' \code{ranking} and \code{group_ranking} columns. See
#' \code{EpiCompare::inner_expanding_info} for expected structure.
#' @param list_radius_info list of lists of radius information per each mode.
#' @param list_grouping_id list of vectors of indices (grouped by mode cluster)
#' and ordered by pseudo-density values. These values are relative to the row
#' indices in \code{simulation_info_df}
#' @param verbose boolean, if progress should be tracked with a progress bar
#'
#' @return data.frame with a row for each new point and a
#' column \code{containment_val} which is the discrete simulation-based
#' conformal score. If \code{df_row_group} only contained location columns in
#' \code{data_column_names} then a new column named \code{row_index} is also
#' included, and is related to the row numbers/names of \code{df_row_group}.
#' @export
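#' @examples
#' # Hand-built sketch (hypothetical toy values; in practice the inputs come
#' # from \code{EpiCompare::inner_expanding_info} and \code{coverage_down_1d_mult}):
#' \dontrun{
#' sims_df <- data.frame(y = c(0, 0.1, 3, 3.1))
#' truth_df <- data.frame(y = c(0.05, 1.5))
#' # two mode groups, indices ordered by pseudo-density within each group
#' grouping_id <- list(c(1, 2), c(3, 4))
#' sim_info <- data.frame(psuedo_density = c(0.40, 0.35, 0.39, 0.34),
#' groupings = c(1, 1, 2, 2),
#' ranking = c(1, 3, 2, 4),
#' group_ranking = c(1, 2, 1, 2))
#' radius_info <- coverage_down_1d_mult(sims_df, e_cols = "y",
#' ordering_list = grouping_id)
#' # the point near the dense mode should get a larger score than the far point
#' inner_containment_conformal_score_mode_radius_1d(
#' df_row_group = truth_df,
#' simulations_group_df = sims_df,
#' data_column_names = "y",
#' simulation_info_df = sim_info,
#' list_radius_info = radius_info,
#' list_grouping_id = grouping_id)
#' }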
inner_containment_conformal_score_mode_radius_1d <- function(df_row_group,
simulations_group_df,
data_column_names, # must be string vector (not index)
simulation_info_df,
list_radius_info,
list_grouping_id,
verbose = FALSE){
if (ncol(df_row_group) == length(data_column_names)){
df_row_group <- df_row_group %>% tibble::rownames_to_column(var = "row_index")
}
group_names <- names(df_row_group)[!(names(df_row_group) %in% data_column_names)]
if (length(group_names) == 1){
group_info_df <- df_row_group[,group_names, drop = F]
} else {
group_info_df <- df_row_group[,group_names]
}
n_draws <- nrow(simulation_info_df)
n_groups <- length(list_grouping_id)
n_check <- nrow(df_row_group)
if (verbose){
pb <- progress::progress_bar$new(
format = "Processing [:bar] :percent eta: :eta",
total = n_draws,
clear = FALSE, width = 60)
}
# https://stackoverflow.com/questions/35106567/how-to-calculate-euclidean-distance-between-two-matrices-in-r
if (length(data_column_names) == 1){
between_dist_mat <- my_pdist(df_row_group[, data_column_names, drop = F],
simulations_group_df[, data_column_names, drop = F])
} else {
between_dist_mat <- my_pdist(df_row_group[, data_column_names],
simulations_group_df[, data_column_names])
}
overall_info <- list()
for (g_idx in 1:n_groups){
inner_ids <- list_grouping_id[[g_idx]]
inner_rad_dist_mat <- list_radius_info[[g_idx]]$dist_mat
n_sims_inner <- length(inner_ids)
# 1. iteratively build distance across each point, store
# Only examine until covered
# track first time covered
not_covered <- rep(TRUE, n_check)
coverage_number <- rep(n_draws+1, n_check)
s_idx <- 1
while (s_idx <= n_sims_inner && sum(not_covered) > 0){
# drop = FALSE keeps a matrix even when only one row or column remains
new_contained <- sweep(between_dist_mat[not_covered, inner_ids[1:s_idx], drop = FALSE],
MARGIN = 2,
STATS = inner_rad_dist_mat[1:s_idx, s_idx],
FUN = "<=") %>%
rowSums() %>% sapply(function(x) x > 0)
if (sum(new_contained) > 0){
# update coverage numbers for those just covered
inner_coverage <- rep(n_draws+1, sum(not_covered))
inner_coverage[new_contained] <- s_idx
coverage_number[not_covered] <- inner_coverage
# update those covered to not look at again
not_covered[not_covered] <- !new_contained
}
s_idx <- s_idx + 1
}
# calculate score per mode relative to overall ranking of points
all_info_df <- group_info_df %>%
dplyr::mutate(containment_val = coverage_number)
simulation_info_inner <- rbind((simulation_info_df[inner_ids,] %>%
dplyr::filter(.data$groupings == g_idx) %>%
dplyr::ungroup() %>%
dplyr::select(.data$ranking, .data$group_ranking)),
data.frame(ranking = n_draws+1, group_ranking = n_draws+1))
overall_score <- all_info_df %>%
dplyr::left_join(simulation_info_inner,
by = c("containment_val" = "group_ranking"))
overall_info[[g_idx]] <- overall_score
}
# combining across modes
moverall_info <- do.call(rbind, overall_info) %>%
dplyr::group_by(dplyr::across(tidyselect::one_of(group_names))) %>%
dplyr::summarize(containment_val = n_draws+1 - min(.data$ranking), .groups = "keep") # why subtraction?:
return(moverall_info)
}
#' global wrapper for simulation-based conformal score calculation that allows
#' for mode clustering and changes of radius for 1d examples
#'
#' @param truth_df data frame with new point information to be assessed
#' @param simulations_df data frame with multiple simulated points
#' @param smooth_functions a named list of smoothing functions to apply to the
#' varying-radius approach
#' @param data_column_names columns of \code{truth_df} and
#' \code{simulations_df} that correspond to the output space.
#' @param .maxT int, max number of steps for mode clustering mean-shift
#' algorithm
#' @param .sigma_string string, the quantile of the distance matrix to define
#' the sigma (e.g. \code{"30\%"})
#' @param .diff_eps float, the maximum distance allowed between mode-clustered final
#' steps for them to still be counted as the same mode.
#' @param .eps float, step-size tolerance at which the mode clustering stops (if
#' \code{.maxT} doesn't stop it first)
#' @param return_min boolean, if the list of information returned is minimal (for slurm)
#' @param verbose boolean, be verbose about progression
#'
#' @return list with the conformal scores (and, when \code{return_min} is \code{FALSE},
#' intermediate structures such as radii, orderings and the inputs)
#' @export
#'
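#' @examples
#' # High-level sketch (depends on several EpiCompare helpers; shown as a
#' # \code{dontrun} example since exact defaults may need tuning for real data):
#' \dontrun{
#' set.seed(2)
#' sims <- data.frame(y = c(rnorm(40, mean = 0, sd = 0.3),
#' rnorm(40, mean = 3, sd = 0.3)))
#' truth <- data.frame(y = c(0.1, 1.5, 3.0))
#' out <- simulation_based_conformal_1d_complex(truth, sims,
#' data_column_names = "y",
#' verbose = FALSE)
#' out$conformal_score$fixed
#' out$parameters
#' }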
simulation_based_conformal_1d_complex <- function(truth_df, simulations_df,
smooth_functions = list(), #named list
data_column_names = c("y"),
.maxT = 50,
.sigma_string = "25%",
.diff_eps = 1e-06,
.eps = 1e-06,
return_min = TRUE,
verbose = FALSE){
# calculating sigma value ------
dist_mat <- simulations_df %>% dplyr::select(tidyselect::one_of(data_column_names)) %>%
stats::dist() %>% as.matrix()
group_names <- names(simulations_df)[!(names(simulations_df) %in% data_column_names)]
if (length(group_names) == 0){
simulations_df <- simulations_df %>% tibble::rownames_to_column(var = "row_index")
group_names <- names(simulations_df)[!(names(simulations_df) %in% data_column_names)]
}
# sigma selection
sigma_lower <- EpiCompare:::check_character_percent(.sigma_string, ".sigma_string")
sigma_sizes <- sapply(sigma_lower + .05*(0:5), function(v) min(v, 1))
percentage_inner <- sigma_sizes[stats::quantile(as.matrix(dist_mat), sigma_sizes) > 0][1]
sigma_val <- stats::quantile(as.matrix(dist_mat), percentage_inner)
tdm <- EpiCompare::tidy_dist_mat(dist_mat,
rownames_df = simulations_df[, group_names, drop = F],
colnames_df = simulations_df[, group_names, drop = F])
# rank_df
pseudo_density_df <- EpiCompare::distance_psuedo_density_function(
tdm,
sigma = sigma_val, df_out = T) %>%
dplyr::mutate(ranking = rank(-.data$psuedo_density, ties.method = "random")) # "psuedo_density" spelling matches the upstream column name; ties broken randomly so the ordering is strict
assertthat::assert_that(all(!is.na(pseudo_density_df$psuedo_density)),
msg = paste("internal error in",
"distance_psuedo_density_function",
"function's sigma selection."))
top_points <- simulations_df %>%
dplyr::left_join(pseudo_density_df, by = group_names) %>%
dplyr::mutate(keep = .data$ranking > ceiling(.2*nrow(simulations_df))) %>%
dplyr::ungroup() %>% dplyr::filter(.data$keep) %>%
dplyr::select(tidyselect::one_of(data_column_names))
mm_delta <- EpiCompare::get_delta_nn(top_points)
# mode clustering ------------
out_groups <- mode_clustering_1d(df_info = simulations_df,
position = c(1:ncol(simulations_df))[names(simulations_df) %in% data_column_names],
naming_info = c(1:ncol(simulations_df))[names(simulations_df) %in% group_names],
sigma = sigma_val, maxT = .maxT,
eps = .eps,
diff_eps = .diff_eps,
verbose = verbose)
simulation_info_out <- EpiCompare::inner_expanding_info(pseudo_density_df, out_groups)
simulation_info_df <- simulation_info_out[[1]]
ordering_list <- simulation_info_out[[2]]
## fixed
tm_radius_fixed <- EpiCompare:::inner_convert_single_radius_to_structure(mm_delta,
ordering_list)
## vary
tm_radius_vary <- coverage_down_1d_mult(sim_df = simulations_df,
e_cols = data_column_names,
ordering_list = ordering_list,
verbose = FALSE)
# no mode clustering ---------
out_groups_nm <- out_groups %>% dplyr::mutate(groupings = 1)
simulation_info_out_nm <- EpiCompare::inner_expanding_info(pseudo_density_df, out_groups_nm)
simulation_info_df_nm <- simulation_info_out_nm[[1]]
ordering_list_nm <- simulation_info_out_nm[[2]]
tm_radius_fixed_nm <- EpiCompare:::inner_convert_single_radius_to_structure(mm_delta,
ordering_list_nm)
## vary
tm_radius_vary_nm <- coverage_down_1d_mult(sim_df = simulations_df,
e_cols = data_column_names,
ordering_list = ordering_list_nm,
verbose = FALSE)
# smoothing --------
tm_radius_vary_update_list <- list()
tm_radius_vary_nm_update_list <- list()
if (length(smooth_functions) > 0 ){
for (f_name in names(smooth_functions)){
tm_radius_vary_update_list[[f_name]] <- EpiCompare::update_tm_smooth(tm_radius_vary,
func = smooth_functions[[f_name]])
tm_radius_vary_nm_update_list[[f_name]] <- EpiCompare::update_tm_smooth(tm_radius_vary_nm,
func = smooth_functions[[f_name]])
}
}
conformal_df_fixed <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df, # diff
list_radius_info = tm_radius_fixed, # diff
list_grouping_id = ordering_list, # diff
verbose = verbose)
conformal_df_fixed_nm <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df_nm, # diff
list_radius_info = tm_radius_fixed_nm, # diff
list_grouping_id = ordering_list_nm, # diff
verbose = verbose)
conformal_df_vary <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df, # diff
list_radius_info = tm_radius_vary, # diff
list_grouping_id = ordering_list, # diff
verbose = verbose)
conformal_df_vary_nm <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df_nm, # diff
list_radius_info = tm_radius_vary_nm, # diff
list_grouping_id = ordering_list_nm, # diff
verbose = verbose)
conformal_df_vary_nm_smooth_list <- list()
conformal_df_vary_smooth_list <- list()
if (length(smooth_functions) > 0 ){
for (f_name in names(smooth_functions)){
conformal_df_vary_nm_smooth_list[[f_name]] <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df_nm, # diff
list_radius_info = tm_radius_vary_nm_update_list[[f_name]], # diff
list_grouping_id = ordering_list_nm, # diff
verbose = verbose)
conformal_df_vary_smooth_list[[f_name]] <- inner_containment_conformal_score_mode_radius_1d(
df_row_group = truth_df,
simulations_group_df = simulations_df,
data_column_names = data_column_names,
simulation_info_df = simulation_info_df, # diff
list_radius_info = tm_radius_vary_update_list[[f_name]], # diff
list_grouping_id = ordering_list, # diff
verbose = verbose)
}
}
if (return_min){
return(list(conformal_score = list(fixed = conformal_df_fixed,
fixed_nm = conformal_df_fixed_nm,
vary = conformal_df_vary,
vary_nm = conformal_df_vary_nm,
smooth_vary = conformal_df_vary_smooth_list,
smooth_vary_nm = conformal_df_vary_nm_smooth_list),
parameters = c("mm_delta_prop" = .2,
"mm_delta" = mm_delta,
"sigma_percentage" = percentage_inner)))
} else{
return(list(conformal_score = list(fixed = conformal_df_fixed,
fixed_nm = conformal_df_fixed_nm,
vary = conformal_df_vary,
vary_nm = conformal_df_vary_nm,
smooth_vary = conformal_df_vary_smooth_list,
smooth_vary_nm = conformal_df_vary_nm_smooth_list),
containment_df = simulation_info_df,
mm_delta = mm_delta,
tm_radius = list(fixed = tm_radius_fixed,
fixed_nm = tm_radius_fixed_nm,
vary = tm_radius_vary,
vary_nm = tm_radius_vary_nm,
smooth_vary = tm_radius_vary_update_list,
smooth_vary_nm = tm_radius_vary_nm_update_list),
truth_df_inner = truth_df,
simulations_group_df_inner = simulations_df,
ordering_list = ordering_list,
parameters = c("mm_delta_prop" = .2,
"sigma_percentage" = percentage_inner)))
}
}
#' Inner function to repeat the previous value (if the current value is NA)
#'
#' from https://stackoverflow.com/questions/7735647/replacing-nas-with-latest-non-na-value
#'
#' @param x vector (possibly with NA values)
#'
#' @return updated vector with each NA replaced by the most recent preceding non-NA value
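#'
#' @details For example, \code{repeat.before(c(NA, 1, NA, NA, 2))} returns
#' \code{c(NA, 1, 1, 1, 2)} (the leading \code{NA} is kept).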
repeat.before <- function(x) { # repeats the last non NA value. Keeps leading NA
ind = which(!is.na(x)) # get positions of nonmissing values
if(is.na(x[1])) # if it begins with a missing, add the
ind = c(1,ind) # first position to the indices
rep(x[ind], times = diff( # repeat the values at these indices
c(ind, length(x) + 1) )) # diffing the indices + length yields how often
# they need to be repeated
}
#' combine coverage_down_1d_single grouping across modes
#'
#' @param inner_sim_data vector of simulation values
#' @param order_idx pseudo-density rank ordering of simulation observations
#' @param grouping vector of mode groupings of the simulation data
#'
#' @return vector of minimum coverage radii, combining \code{coverage_down_1d_single} results across mode groups
#' @export
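#' @examples
#' # Toy sketch: two mode groups of 1d simulations, with a hand-picked ordering.
#' sims <- c(0, 0.2, 3, 3.3)
#' order_idx <- c(1, 3, 2, 4) # pretend pseudo-density ordering
#' grouping <- c(1, 1, 2, 2)
#' combine_via_modes(sims, order_idx, grouping)
#' # c(0, 0, 0.2, 0.3) for this toy ordering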
combine_via_modes <- function(inner_sim_data, order_idx, grouping){
# combining coverage_down_1d_single across modes (with grouping definition)
data_full <- matrix(NA, nrow = length(order_idx), ncol = length(unique(grouping))) # one column per mode group
group_index_options <- unique(grouping)
grouping_order <- grouping[order_idx]
g_counter <- 1
for (g_idx in group_index_options){
inner_ordering <- order_idx[grouping_order == g_idx]
inner_min_coverage <- coverage_down_1d_single(data.frame(sim = inner_sim_data[inner_ordering]),
e_cols = "sim",verbose = F)$min_cover_vec
data_full[grouping_order == g_idx,g_counter] <- inner_min_coverage
g_counter <- g_counter + 1
}
data_full <- data_full %>% apply(2, repeat.before)
data_full[is.na(data_full)] <- 0 # starting values
min_cover_vec <- data_full %>% apply(1, max)
return(min_cover_vec)
}
#' calculate 1d conformal score across different radius proportion values
#'
#' @param inner_sim_data vector of simulation values
#' @param obs_values vector of true observed values (can be length 1)
#' @param radius_prop_range vector of proportions for the minimum spanning radius
#' @param sigma_prop proportion (quantile of the pairwise distance matrix) used to set sigma
#' @param x_value optional scalar; if \code{NA} (or below -0.5) a single mode group is used,
#' otherwise the simulations are split into two mode groups around \code{f(x_value)} with
#' \code{f(x) = (x-1)^2 (x+1)} (a hard-coded shortcut, see comments in the code)
#'
#' @return list of \code{cs_mat}, a matrix of conformal scores for the observations (rows)
#' across the radius proportions (columns), and \code{radius_range}, the radii used
#' @export
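#' @examples
#' # Sketch on toy simulations: scores for two "observed" values across two
#' # radius proportions.
#' set.seed(3)
#' sims <- c(rnorm(30, mean = 0, sd = 0.3), rnorm(30, mean = 3, sd = 0.3))
#' cs_across_radius(sims, obs_values = c(0.1, 1.5),
#' radius_prop_range = c(0.5, 0.9), sigma_prop = 0.25)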
cs_across_radius <- function(inner_sim_data, obs_values, radius_prop_range, sigma_prop, x_value = NA){
B <- length(inner_sim_data)
# get min_cover_vec
sim_dmat <- as.matrix(stats::dist(inner_sim_data))
sigma_val <- stats::quantile(sim_dmat, sigma_prop)
sim_pd <- apply(exp(-(sim_dmat^2)/sigma_val^2), 1, mean)
sorting_info <- sort(-sim_pd, method = "shell", index.return = TRUE)
order_idx <- sorting_info$ix
if(is.na(x_value) || x_value < -.5){
min_cover_vec <- coverage_down_1d_single(data.frame(sim = inner_sim_data[order_idx]),
e_cols = "sim",verbose = F)$min_cover_vec
} else {
# cheating mode clustering (for thesis completion :( )
f <- function(x){(x-1)^2*(x+1)}
midpoint <- f(x_value)
split_logic <- inner_sim_data > midpoint
min_cover_vec <- combine_via_modes(inner_sim_data, order_idx, split_logic)
}
# min spanning radius relative to radius quantiles
radius_cut_idx <- sapply(radius_prop_range,
function(rad_prop) {
ceiling(rad_prop*length(min_cover_vec))
}
)
radius_range <- min_cover_vec[radius_cut_idx]
all_inner_distances <- my_pdist(df1 = data.frame(obs_values),
df2 = data.frame(inner_sim_data[order_idx]))
inner_B_plus_1_minus_cs_per_radius <- matrix(NA, nrow = length(obs_values),
ncol = length(radius_prop_range))
for (obs_idx in 1:length(obs_values)){
r_idx <- 1
for (rad in radius_range){
inner_B_plus_1_minus_cs_per_radius[obs_idx,r_idx] <- min(
c(which(all_inner_distances[obs_idx,] <= rad),
ncol(all_inner_distances)+1)
)
r_idx <- r_idx + 1
}
}
inner_cs_per_radius <- B+1 - inner_B_plus_1_minus_cs_per_radius
return(list(cs_mat = inner_cs_per_radius, radius_range = radius_range))
}