pvalue_anova_all = function(y, hc_list, sigma = NULL, simes = TRUE){
# y: vector of observed value
# hc_list: list length of $T\L$ for storing tree structure
# sigma: standard deviation of noise
n_interiornodes = length(hc_list)
ps = rep(0, n_interiornodes)
for(i in length(ps):1){
n_deg = length(hc_list[[i]])
n_leaves = length(find_leaves_in_list(i, hc_list))
if(n_deg == 1){
ps[i] = 1
next
}
y_i_bar = rep(0, n_deg)
n_i = rep(0, n_deg)
sum_squares_i = rep(0, n_deg)
for(j in 1:n_deg){
leaves_child_i = y[find_leaves_in_list(hc_list[[i]][j], hc_list)]
y_i_bar[j] = mean(leaves_child_i)
n_i[j] = length(leaves_child_i)
sum_squares_i[j] = sum((leaves_child_i - y_i_bar[j])^2)
}
y_bar = sum(y_i_bar*n_i)/sum(n_i)
sigma_est = sd(y)
if(is.null(sigma)){
# calculate the F-statistic
sb_2 = sum(n_i * (y_i_bar - y_bar)^2)/(n_deg-1)
sw_2 = sum(sum_squares_i)/(n_leaves - n_deg)
if(n_leaves == n_deg){
p_value = pchisq(sb_2/sigma_est^2, df= n_deg-1 , lower.tail=FALSE)
}else{
p_value = stats::pf(sb_2/sw_2, n_deg-1, n_leaves - n_deg, lower.tail = FALSE)
}
}else{
# calculate the chi-statistic and p-value
sb_2 = sum(n_i * (y_i_bar - y_bar)^2)/(n_deg-1)
p_value = stats::pchisq(sb_2/sigma^2, df= n_deg-1 , lower.tail=FALSE)
}
ps[i] = p_value
}
# Simes' procedure
if(simes){
for(i in n_interiornodes:1){
descendants_i = find_descendants_interior_in_list(hc_list, i)
ps_descendants = sort(ps[descendants_i])
ps[i] = min(ps_descendants * length(ps_descendants) / seq(1, length(ps_descendants), by = 1))
}
}
return(ps = ps)
}
#' Aggregate observations hierarchically with False Split Rate control
#'
#' This function aggregate observations with the same means while simultaneously controlling False Split Rate under a target level.
#' The aggregation is achieved by two steps: (1) Generate p-values for each interior node through ANOVA test (2) Sequentially test on the tree.
#' @param y A length-\code{n-observation} vector of observations
#' @param sigma Standard deviation of noise. If given, the algorithm will compute nodewise p-values with chi-squared statistics. If sigma is unkown, the algorithm will compute p-values with F-test statistics.
#' @template tree-template
#' @param alpha A use-specified target FSR level
#' @return Returns the aggregation result.
#' \item{alpha}{The target FSR level.}
#' \item{groups}{A length-\code{n-observation} vector of integers indicating the cluster to which each observation is allocated.}
#' \item{rejections}{A length-(\code{num_interior_nodes}) vector indicating whether each node is rejected.}
#' \item{p_vals}{A length-(\code{num_interior_nodes}) vector of the p-value (note: all are computed although not all are used in the sequential testing procedure)}
#' @examples
#' set.seed(123)
#' hc = hclust(dist((1:20) + runif(20)/20), method = "complete")
#' k = 4 # 4 true groups
#' groups = cutree(hc, k = 4)
#' theta = runif(k, 0, 10)[groups]
#' y = theta + runif(20, 0, 1)
#' aggregate_observations(y, sigma = 1, tree= hc, alpha = 0.1)
#' @importFrom stats pchisq pf
#' @export
aggregate_observations = function(y, sigma = NULL, tree= NULL, alpha){
# transform dendrogram to a list of length |T\L| (number of interior nodes). Item i in the list stores the children node of node i.
if(class(tree) == "dendrogram"){
dah = dend_as_hclist(tree)
hc_list = dah
labels = attr(dah, "leaf_labels")
}else if(class(tree) == "hclust"){
dend = as.dendrogram(tree)
dah = dend_as_hclist(dend)
hc_list = dah
labels = attr(dah, "leaf_labels")
}else if(class(tree) == "list"){
hc_list = tree
}else{
stop("No proper tree structure is provided.")
}
p_vals = pvalue_anova_all(y, hc_list, sigma, simes = TRUE)
result = hierarchical_test(tree = hc_list, p_vals = p_vals, alpha = alpha, independent = FALSE)
groups = result$groups
rejections = result$rejections
return(list(alpha = alpha, groups = groups, rejections = rejections, p_vals = p_vals))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.