#### BOOTSTRAPPING FOR PERSISTENCE DIAGRAMS ####
#' Estimate persistence threshold(s) for topological features in a data set using bootstrapping.
#'
#' Bootstrapping is used to find a conservative estimate of a 1-`alpha` percent "confidence interval" around
#' each point in the persistence diagram of the data set, and points whose intervals do not
#' touch the diagonal (birth == death) would be considered "significant" or "real".
#' One threshold is computed for each dimension in the diagram.
#'
#' The thresholds are then determined by calculating the 1-`alpha'` percentile of the bottleneck
#' distance values between the real persistence diagram and other diagrams obtained
#' by bootstrap resampling the data. Since `ripsDiag` is the slowest homology engine but is the
#' only engine which calculates representative cycles (as opposed to co-cycles with `PyH`), two
#' homology engines are input to this function - one to calculate the actual persistence diagram, `FUN_diag`
#' (possibly with representative (co)cycles) and one to calculate the bootstrap diagrams, `FUN_boot` (this should be
#' a faster engine, like `calculate_homology` or `PyH`).
#' p-values can be calculated for any feature which survives the thresholding if both `return_subsetted` and `return_pvals` are `TRUE`,
#' however these values may be larger than the original `alpha` value in some cases. Note that this is not part of the original bootstrap procedure.
#' If stricter thresholding is desired,
#' or the p-values must be less than `alpha`, set `p_less_than_alpha` to `TRUE`. The minimum
#' possible p-value is always 1/(`num_samples` + 1).
#' Note that since \code{\link[TDAstats]{calculate_homology}}
#' can ignore the longest-lived cluster, fewer "real" clusters may be found. To avoid this possibility
#' try setting `FUN_diag` equal to 'ripsDiag'. Please note that due to the TDA package no longer being available on CRAN,
#' if `FUN_diag` or `FUN_boot` are 'ripsDiag' then `bootstrap_persistence_thresholds` will look for the ripsDiag function in the global environment,
#' so the TDA package should be attached with `library("TDA")` prior to use.
#'
#' @param X the input dataset, must either be a matrix or data frame.
#' @param FUN_diag a string representing the persistent homology function to use for calculating the full persistence diagram, either
#' 'calculate_homology' (the default), 'PyH' or 'ripsDiag'.
#' @param FUN_boot a string representing the persistent homology function to use for calculating the bootstrapped persistence diagrams, either
#' 'calculate_homology' (the default), 'PyH' or 'ripsDiag'.
#' @param maxdim the integer maximum homological dimension for persistent homology, default 0.
#' @param thresh the positive numeric maximum radius of the Vietoris-Rips filtration.
#' @param distance_mat a boolean representing if `X` is a distance matrix (TRUE) or not (FALSE, default).
#' dimensions together (TRUE, the default) or if one threshold should be calculated for each dimension separately (FALSE).
#' @param ripser the imported ripser module when `FUN_diag` or `FUN_boot` is `PyH`.
#' @param ignore_infinite_cluster a boolean indicating whether or not to ignore the infinitely lived cluster when `FUN_diag` or `FUN_boot` is `PyH`.
#' @param calculate_representatives a boolean representing whether to calculate representative (co)cycles, default FALSE. Note that representatives cant be
#' calculated when using the 'calculate_homology' function.
#' @param num_samples the positive integer number of bootstrap samples, default 30.
#' @param alpha the type-1 error threshold, default 0.05.
#' @param return_diag a boolean representing whether or not to return the calculated persistence diagram, default TRUE.
#' @param return_subsetted a boolean representing whether or not to return the subsetted persistence diagram (with or without representatives), default FALSE.
#' @param return_pvals a boolean representing whether or not to return p-values for features in the subsetted diagram, default FALSE.
#' @param p_less_than_alpha a boolean representing whether or not subset further and return only feature whose p-values are strictly less than `alpha`, default `FALSE`. Note that this is not part of the original bootstrap procedure.
#' @param num_workers the integer number of cores used for parallelizing (over bootstrap samples), default one less the maximum amount of cores on the machine.
#' @param ... additional parameters for internal methods.
#' @return either a numeric vector of threshold values, with one for each dimension 0..`maxdim` (in that order), or a list containing those thresholds and elements (if desired)
#' @export
#' @importFrom stats quantile
#' @importFrom foreach foreach %dopar% %do%
#' @importFrom parallel makeCluster stopCluster clusterExport clusterEvalQ
#' @importFrom parallelly availableCores
#' @importFrom doParallel registerDoParallel
#' @author Shael Brown - \email{shaelebrown@@gmail.com}
#' @references
#' Chazal F et al (2017). "Robust Topological Inference: Distance to a Measure and Kernel Distance." \url{https://www.jmlr.org/papers/volume18/15-484/15-484.pdf}.
#' @examples
#'
#' if(require("TDAstats"))
#' {
#' # create a persistence diagram from a sample of the unit circle
#' df <- TDAstats::circle2d[sample(1:100,size = 50),]
#'
#' # calculate persistence thresholds for alpha = 0.05
#' # and return the calculated diagram as well as the subsetted diagram
#' bootstrapped_diagram <- bootstrap_persistence_thresholds(X = df,
#' maxdim = 1,thresh = 2,num_workers = 2)
#' }
bootstrap_persistence_thresholds <- function(X,FUN_diag = "calculate_homology",FUN_boot = "calculate_homology",maxdim = 0,thresh,distance_mat = FALSE,ripser = NULL,ignore_infinite_cluster = TRUE,calculate_representatives = FALSE,num_samples = 30,alpha = 0.05,return_subsetted = FALSE,return_pvals = FALSE,return_diag = TRUE,num_workers = parallelly::availableCores(omit = 1),p_less_than_alpha = FALSE,...){
# function for thresholding the points computed in a diagram
# based on persistence
# to avoid check issues
N <- NULL
# get additional parameters for internal methods
additional_params <- list(...)
if("bootstrap_samples" %in% names(additional_params))
{
bootstrap_samples <- additional_params$bootstrap_samples
}else
{
bootstrap_samples <- NULL
}
if("return_distances" %in% names(additional_params))
{
return_distances <- additional_params$return_distances
}else
{
return_distances <- FALSE
}
# error check parameters
if(is.null(distance_mat))
{
stop("distance_mat must not be NULL.")
}
if(length(distance_mat) > 1 | !inherits(distance_mat,"logical"))
{
stop("distance_mat must be a single logical (i.e. T or F).")
}
if(is.na(distance_mat) | is.nan(distance_mat) )
{
stop("distance_mat must not be NA/NAN.")
}
if(is.null(return_subsetted))
{
stop("return_subsetted must not be NULL.")
}
if(length(return_subsetted) > 1 | !inherits(return_subsetted,"logical"))
{
stop("return_subsetted must be a single logical (i.e. T or F).")
}
if(is.na(return_subsetted) | is.nan(return_subsetted) )
{
stop("return_subsetted must not be NA/NAN.")
}
if(is.null(return_pvals))
{
stop("return_pvals must not be NULL.")
}
if(length(return_pvals) > 1 | !inherits(return_pvals,"logical"))
{
stop("return_pvals must be a single logical (i.e. T or F).")
}
if(is.na(return_pvals) | is.nan(return_pvals) )
{
stop("return_pvals must not be NA/NAN.")
}
if(is.null(p_less_than_alpha))
{
stop("p_less_than_alpha must not be NULL.")
}
if(length(p_less_than_alpha) > 1 | !inherits(p_less_than_alpha,"logical"))
{
stop("p_less_than_alpha must be a single logical (i.e. T or F).")
}
if(is.na(p_less_than_alpha) | is.nan(p_less_than_alpha) )
{
stop("p_less_than_alpha must not be NA/NAN.")
}
if(is.null(return_diag))
{
stop("return_diag must not be NULL.")
}
if(length(return_diag) > 1 | !inherits(return_diag,"logical"))
{
stop("return_diag must be a single logical (i.e. T or F).")
}
if(is.na(return_diag) | is.nan(return_diag) )
{
stop("return_diag must not be NA/NAN.")
}
check_param(param = maxdim,param_name = "maxdim",numeric = T,whole_numbers = T,multiple = F,finite = T,non_negative = T)
check_param(param = thresh,param_name = "thresh",numeric = T,whole_numbers = F,multiple = F,finite = T,non_negative = T,positive = T)
if(!inherits(X,"data.frame") & !inherits(X,"matrix"))
{
stop("X must either be a dataframe or a matrix.")
}
if(nrow(X) < 2 | ncol(X) < 1)
{
stop("X must have at least two rows and one column.")
}
if(length(which(stats::complete.cases(X) == F)) > 0)
{
stop("X must not contain any missing values.")
}
if(distance_mat == T & (ncol(X) != nrow(X) | !inherits(X,"matrix")))
{
stop("if distance_mat is TRUE then X must be a square matrix.")
}
if((inherits(X,"matrix") & !inherits(X[1,1],"numeric")) | (inherits(X,"data.frame") & length(which(unlist(lapply(X,is.numeric)))) < ncol(X)))
{
stop("X must have only numeric entries.")
}
if(is.null(FUN_diag))
{
stop("FUN_diag must not be NULL.")
}
if(length(FUN_diag) > 1 | !inherits(FUN_diag,"character"))
{
stop("FUN_diag must be a single string.")
}
if(is.na(FUN_diag) | is.nan(FUN_diag) )
{
stop("FUN_diag must not be NA/NAN.")
}
if(FUN_diag %in% c("calculate_homology","ripsDiag","PyH") == F)
{
stop("FUN_diag must be either \'calculate_homology\', \'PyH\' or \'ripsDiag\'.")
}
if(FUN_diag == 'calculate_homology')
{
if(requireNamespace("TDAstats",quietly = T) == F)
{
stop("To use the \'calculate_homology\' function the package TDAstats must be installed.")
}
}
if(FUN_diag == 'ripsDiag')
{
tryCatch(expr = {ripsDiag <- get("ripsDiag",envir = globalenv())},
error = function(e){
stop("To use the \'ripsDiag\' function the package TDA must be attached in the global environment.")
})
}
if(FUN_diag == "PyH")
{
if(is.null(ignore_infinite_cluster))
{
stop("ignore_infinite_cluster must not be NULL.")
}
if(length(ignore_infinite_cluster) > 1 | !inherits(ignore_infinite_cluster,"logical"))
{
stop("ignore_infinite_cluster must be a single logical (i.e. T or F).")
}
if(is.na(ignore_infinite_cluster) | is.nan(ignore_infinite_cluster) )
{
stop("ignore_infinite_cluster must not be NA/NAN.")
}
check_ripser(ripser)
}
if(is.null(FUN_boot))
{
stop("FUN_boot must not be NULL.")
}
if(length(FUN_boot) > 1 | !inherits(FUN_boot,"character"))
{
stop("FUN_boot must be a single string.")
}
if(is.na(FUN_boot) | is.nan(FUN_boot) )
{
stop("FUN_boot must not be NA/NAN.")
}
if(FUN_boot %in% c("calculate_homology","ripsDiag","PyH") == F)
{
stop("FUN_boot must be either \'calculate_homology\', \'PyH\' or \'ripsDiag\'.")
}
if(FUN_boot == 'calculate_homology')
{
if(requireNamespace("TDAstats",quietly = T) == F)
{
stop("To use the \'calculate_homology\' function the package TDAstats must be installed.")
}
}
if(FUN_boot == 'ripsDiag')
{
tryCatch(expr = {ripsDiag <- get("ripsDiag",envir = globalenv())},
error = function(e){
stop("To use the \'ripsDiag\' function the package TDA must be attached in the global environment.")
})
}
if(FUN_boot == "PyH")
{
if(is.null(ignore_infinite_cluster))
{
stop("ignore_infinite_cluster must not be NULL.")
}
if(length(ignore_infinite_cluster) > 1 | !inherits(ignore_infinite_cluster,"logical"))
{
stop("ignore_infinite_cluster must be a single logical (i.e. T or F).")
}
if(is.na(ignore_infinite_cluster) | is.nan(ignore_infinite_cluster) )
{
stop("ignore_infinite_cluster must not be NA/NAN.")
}
check_ripser(ripser)
}
if(FUN_diag == "ripsDiag" & FUN_boot == "PyH")
{
if(ignore_infinite_cluster == T)
{
message("Setting ignore_infinite_cluster to F because FUN_diag is 'ripsDiag' and FUN_boot is 'PyH'.")
ignore_infinite_cluster <- F
}
}
if(is.null(calculate_representatives))
{
stop("calculate_representatives must not be NULL.")
}
if(length(calculate_representatives) > 1 | !inherits(calculate_representatives,"logical"))
{
stop("calculate_representatives must be a single boolean value.")
}
if(is.na(calculate_representatives) | is.nan(calculate_representatives) )
{
stop("calculate_representatives must not be NA/NAN.")
}
check_param(param_name = "num_samples",param = num_samples,numeric = T,whole_numbers = T,multiple = F,finite = T,at_least_one = T)
check_param(param_name = "alpha",param = alpha,numeric = T,whole_numbers = F,multiple = F,finite = T,non_negative = T)
if(alpha <= 0 | alpha >= 1)
{
stop("alpha must be between 0 and 1 (non-inclusive).")
}
check_param("num_workers",num_workers,whole_numbers = T,at_least_one = T,finite = T,numeric = T,multiple = F)
if(num_workers > parallelly::availableCores())
{
warning("num_workers is greater than the number of available cores - setting to maximum value less one.")
num_workers <- parallelly::availableCores(omit = 1)
}
# if not returning just subsetted diagrams
# first calculate the "real" persistence diagram, storing representative cycles if need be
if(return_diag | (is.null(bootstrap_samples) & !return_distances))
{
if(FUN_diag == "PyH")
{
diag <- PyH(X = X,maxdim = maxdim,thresh = thresh,distance_mat = distance_mat,ripser = ripser,ignore_infinite_cluster = ignore_infinite_cluster,calculate_representatives = calculate_representatives)
if(calculate_representatives == T)
{
representatives = diag$representatives
diag <- diag$diagram
}
}
if(FUN_diag == "calculate_homology")
{
diag <- diagram_to_df(TDAstats::calculate_homology(mat = X,dim = maxdim,threshold = thresh,format = ifelse(test = distance_mat == T,yes = "distmat",no = "cloud")))
representatives <- NULL
}
if(FUN_diag == "ripsDiag")
{
diag <- ripsDiag(X = X,maxdimension = maxdim,maxscale = thresh,dist = ifelse(test = distance_mat == F,yes = "euclidean",no = "arbitrary"),library = "dionysus",location = calculate_representatives,printProgress = F)
if(calculate_representatives == T)
{
representatives <- diag$cycleLocation
}
diag <- diagram_to_df(diag)
}
}
# compute in parallel if FUN != "PyH"
if(FUN_boot != "PyH")
{
cl <- parallel::makeCluster(num_workers)
doParallel::registerDoParallel(cl)
parallel::clusterEvalQ(cl,c(library("clue"),library("rdist")))
parallel::clusterExport(cl,varlist = c("diagram_distance","check_diagram","check_param","diagram_to_df"),envir = environment())
foreach_func <- foreach::`%dopar%`
}else
{
foreach_func <- foreach::`%do%`
}
# perform bootstrapping in parallel
tryCatch(expr = {
bootstrap_values <- foreach_func(foreach::foreach(N = 1:num_samples,.combine = c),ex = {
if(is.null(bootstrap_samples))
{
# sample data points with replacement
s <- unique(sample(1:nrow(X),size = nrow(X),replace = T))
}else
{
s <- bootstrap_samples[[N]]
}
if(distance_mat == F)
{
X_sample <- X[s,]
}else
{
# also subset columns for a distance matrix
X_sample <- X[s,s]
}
# calculate diagram (without representatives)
if(FUN_boot == "PyH")
{
bootstrap_diag <- PyH(X = X_sample,maxdim = maxdim,thresh = thresh,distance_mat = distance_mat,ripser = ripser,ignore_infinite_cluster = T,calculate_representatives = F)
}
if(FUN_boot == "calculate_homology")
{
bootstrap_diag <- diagram_to_df(TDAstats::calculate_homology(mat = X_sample,dim = maxdim,threshold = thresh,format = ifelse(test = distance_mat == T,yes = "distmat",no = "cloud")))
}
if(FUN_boot == "ripsDiag")
{
bootstrap_diag <- diagram_to_df(ripsDiag(X = X_sample,maxdimension = maxdim,maxscale = thresh,dist = ifelse(test = distance_mat == F,yes = "euclidean",no = "arbitrary"),library = "dionysus",location = F,printProgress = F))
}
# if desired return subsetted diagram
if(!is.null(bootstrap_samples) & !return_distances)
{
return(bootstrap_diag)
} # else...
# return bottleneck distance with original diagram in each dimension
ret_vec <- list()
for(d in 0:maxdim)
{
ret_vec[[length(ret_vec) + 1]] <- diagram_distance(D1 = diag,D2 = bootstrap_diag,p = Inf,dim = d)
}
return(ret_vec)
})
},
warning = function(w){
if(grepl(pattern = "Every diagram must be non-empty.",w))
{
warning("A diagram in some dimension was empty. Try decreasing the maxdim parameter.")
}else
{
warning(w)
}},
error = function(e){stop(e)},
finally = {
# make sure to close cluster
if(FUN_boot != "PyH")
{
parallel::stopCluster(cl)
}
})
# if desired return list of diagrams or distances
if(!is.null(bootstrap_samples))
{
if(return_diag)
{
return(list(diag = diag,diagrams = as.list(bootstrap_values)))
}else
{
return(as.list(bootstrap_values))
}
}
if(return_distances)
{
return(bootstrap_values)
}
# convert distance threshold(s) into persistence threshold(s)
thresholds <- unlist(lapply(X = 0:maxdim,FUN = function(X){
return(2*stats::quantile(unlist(bootstrap_values[seq(X + 1,length(bootstrap_values),maxdim + 1)]),probs = c(1-alpha))[[1]])
}))
# make return list
ret_list <- list(thresholds = thresholds)
if(return_diag == T)
{
ret_list$diag <- diag
}
if(calculate_representatives == T)
{
ret_list$representatives <- representatives
}
if(return_subsetted == T)
{
# subset in each dimension
inds <- c()
for(d in 0:maxdim)
{
inds <- c(inds,which(diag$dimension == d & diag$death - diag$birth > thresholds[[d + 1]]))
}
ret_list$subsetted_diag <- diag[inds,]
# deal with representatives
if(calculate_representatives == T & FUN_diag == "ripsDiag")
{
ret_list$subsetted_representatives = representatives[inds]
}
if(calculate_representatives == T & FUN_diag == "PyH")
{
ret_list$subsetted_representatives = representatives
if(maxdim > 0)
{
for(d in 1:maxdim)
{
if(length(which(diag$dimension == d)) > 0)
{
min_ind <- min(which(diag$dimension == d))
max_ind <- max(which(diag$dimension == d))
ret_list$subsetted_representatives[[d + 1]] <- ret_list$subsetted_representatives[[d + 1]][inds[which(inds %in% c(min_ind:max_ind))] - min_ind + 1]
}else
{
ret_list$subsetted_representatives[[d + 1]] <- list()
}
}
}
}
if(return_pvals)
{
# compute p-values for each remaining feature
# p-value is (Z + 1)/(N + 1) where N is the number of samples and
# Z is the number of bootstrap samples which do not have smaller distances to the diagonal
pvals <- c()
ret_list$pvals <- pvals
if(nrow(ret_list$subsetted_diag) > 0)
{
pvals <- unlist(lapply(X = 1:nrow(ret_list$subsetted_diag),FUN = function(X){
pers <- ret_list$subsetted_diag[X,3L] - ret_list$subsetted_diag[X,2L]
d <- ret_list$subsetted_diag[X,1L]
Z <- length(which(2*unlist(bootstrap_values[seq(d + 1,length(bootstrap_values),maxdim + 1)]) >= pers))
return((Z + 1)/(num_samples + 1))
}))
ret_list$pvals <- pvals
if(p_less_than_alpha)
{
# perform stricter thresholding
inds <- which(pvals < alpha)
ret_list$pvals <- ret_list$pvals[inds]
if(calculate_representatives == T & FUN_diag == "ripsDiag")
{
ret_list$subsetted_representatives <- ret_list$subsetted_representatives[inds]
}
if(calculate_representatives == T & FUN_diag == "PyH")
{
if(maxdim > 0)
{
for(d in 1:maxdim)
{
min_ind <- min(which(ret_list$subsetted_diag$dimension == d))
max_ind <- max(which(ret_list$subsetted_diag$dimension == d))
ret_list$subsetted_representatives[[d + 1]] <- ret_list$subsetted_representatives[[d + 1]][inds[which(inds %in% c(min_ind:max_ind))] - min_ind + 1]
}
}
}
ret_list$subsetted_diag <- ret_list$subsetted_diag[rownames(ret_list$subsetted_diag)[inds],]
}
}
}
}
return(ret_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.