R/tree-functions.R


#' Create a complete dyadic tree structure
#' 
#' @description
#' Produce a set of regions with a forest structure consisting of a single dyadic tree.
#' 
#' 
#' @name dyadic
#' @details \describe{
#' \item{\code{dyadic.from.leaf_list}}{Dyadic tree structure from a given list of atoms/leaves}
#' \item{\code{dyadic.from.window.size}}{Dyadic tree structure from window size: the number of elements in each atom/leaf is set to \code{s}}
#' \item{\code{dyadic.from.height}}{Dyadic tree structure from height: the total height of the tree is set to \code{H}}
#' }
#' 
#' @param method A numeric value. If \code{method == 1}, start from the leaves
#' and group nodes of the same height two by two as long as possible (note that this can introduce gaps). 
#' If \code{method == 2}, start from the root and divide nodes into 2 nodes of equal
#' size as long as possible.
#' 
#' @return A list with two named elements:\describe{
#' \item{\code{leaf_list}}{A list of vectors representing the atoms of the forest structure. See [V.star()] for more information.}
#' \item{\code{C}}{A list of lists representing the forest structure. See [V.star()] for more information.}
#' }
#' 
#' @references Durand, G., Blanchard, G., Neuvial, P., & Roquain, E. (2020). Post hoc false positive control for structured hypotheses. Scandinavian Journal of Statistics, 47(4), 1114-1148.
#' @references Durand G. (2025). A fast algorithm to compute a curve of confidence upper bounds for the False Discovery Proportion using a reference family with a forest structure. arXiv:2502.03849.
#' @examples
#' m <- 6
#' dd <- dyadic.from.window.size(m, s = 2, method = 2)
#' str(dd)
#' 
#' dd <- dyadic.from.height(m, H = 3, method = 2)
#' str(dd)
#' 
#' dd <- dyadic.from.height(m, method = 2)
#' str(dd)
#'
#' leaf_list <- dd$leaf_list
#' dd <- dyadic.from.leaf_list(leaf_list, method = 2)
#' str(dd)
NULL

#' @param leaf_list A list of vectors representing the atoms of the forest structure. See [V.star()] for more information.
#' @export
#' @rdname dyadic
dyadic.from.leaf_list <- function(leaf_list, method) {
  leaves <- length(leaf_list)
  if (method == 1) {
    inter <- seq_len(leaves)
    Ch <- mapply(c, inter, inter, SIMPLIFY = FALSE)
    C <- list(Ch)
    repeat {
      len <- length(Ch)
      if (len == 1) 
        break
      new_Ch <- list()
      j <- 1
      while (j < len) {
        new_Ch <- c(new_Ch, list(c(Ch[[j]][1], Ch[[j + 1]][2])))
        j <- j + 2
      }
      Ch <- new_Ch
      C <- c(list(Ch), C)
    }
  } else if (method == 2) {
    Ch <- list(c(1, leaves))
    C <- list(Ch)
    continue <- TRUE
    while (continue) {
      continue <- FALSE
      oldCh <- Ch
      Ch <- list()
      len <- length(oldCh)
      for (i in seq_len(len)) {
        oi <- oldCh[[i]]
        leaves_in_node <- oi[2] - oi[1] + 1
        if (leaves_in_node > 1) {
          cut2 <- ceiling(leaves_in_node/2)
          Ch <- c(Ch, 
                  list(c(oi[1], oi[1] + cut2 - 1)), 
                  list(c(oi[1] + cut2, oi[2])))
          if (cut2 > 1) 
            continue <- TRUE
        }
      }
      C <- c(C, list(Ch))
    }
  }
  return(list(leaf_list = leaf_list, C = C))
}
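
# Illustration (a minimal sketch, not part of the package code): with method = 1
# the leaves are grouped bottom-up two by two, which can leave gaps (here, the
# singleton region for leaf 3 has no parent region), while method = 2 splits
# top-down from a single root.
str(dyadic.from.leaf_list(as.list(1:3), method = 1)$C)
str(dyadic.from.leaf_list(as.list(1:3), method = 2)$C)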

#' @param m An integer value, the number of hypotheses to have in the structure
#' @param s An integer value, the number of elements in each atom/leaf
#' @export
#' @rdname dyadic
dyadic.from.window.size <- function(m, s, method) {
  leaves <- floor(m/s)
  leaf_list <- list()
  for (l in seq_len(leaves)) {
    leaf <- seq_len(s) + (l - 1) * s
    leaf_list <- c(leaf_list, list(leaf))
  }
  if (s * leaves < m) {
    leaf <- seq(1 + leaves * s, m)
    leaf_list <- c(leaf_list, list(leaf))
    # leaves<-leaves+1
  }
  C <- dyadic.from.leaf_list(leaf_list, method)$C
  return(list(leaf_list = leaf_list, C = C))
}
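
# Illustration (a minimal sketch, not part of the package code): when s does not
# divide m, the remaining hypotheses form one last, smaller atom.
dd <- dyadic.from.window.size(m = 7, s = 2, method = 2)
lengths(dd$leaf_list)  # expected: 2 2 2 1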

#' @param H An integer value, the desired maximal height of the tree. If NULL (by default), 
#' use \code{floor(2 + log2(m - 1))} which gives the maximum achievable height given \code{m}.
#' @export
#' @rdname dyadic
dyadic.from.height <- function(m, H = NULL, method) {
  if (is.null(H)) {
    H <- ifelse(m == 1, 1, floor(2 + log2(m - 1)))
  }
  if (m <= 2^(H - 2)) {
    oldH <- H
    H <- ifelse(m == 1, 1, floor(2 + log2(m - 1)))
    warning("H=", oldH, " is too large for m=", m, ", \nH=", oldH, " reduced to H=", H)
  }
  if (m < 2^(H - 1)) {
    leaf_list <- as.list(1:m)
  } else {
    leaf_list <- list()
    base <- m %/% 2^(H - 1)
    plus1 <- m %% 2^(H - 1)
    for (i in seq_len(plus1)) {
      leaf_list <- c(leaf_list, list(seq_len(base + 1) + (i - 1) * (base + 1)))
    }
    for (i in seq_len(2^(H - 1) - plus1)) {
      leaf_list <- c(leaf_list, list(plus1 * (base + 1) + seq_len(base) + (i - 1) * base))
    }
  }
  C <- dyadic.from.leaf_list(leaf_list, method)$C
  return(list(leaf_list = leaf_list, C = C))
}
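
# Illustration (a minimal sketch, not part of the package code): with H = 3 there
# are at most 2^(H - 1) = 4 atoms, and for m = 10 the remainder is spread so that
# atom sizes differ by at most one.
dd <- dyadic.from.height(m = 10, H = 3, method = 2)
lengths(dd$leaf_list)  # expected: 3 3 2 2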

#' Estimate the number of true null hypotheses among a set of p-values
#' 
#' @description
#' An upper bound on the number of true null hypotheses in the region associated to 
#' the \eqn{p}-values \code{pval}
#' is computed with confidence \code{1 - lambda}. 
#' The functions described here can be used as the \code{method} argument 
#' of [zetas.tree()].
#'
#' @param pval A vector of \eqn{p}-values.
#' @param lambda A numeric value in \eqn{[0,1]}, the target error level of the test.
#' @param k The positive integer \eqn{k} used by the \eqn{k}-Bonferroni procedure. By default it is equal to 1.
#' @param ... Additional arguments that may be passed to specific \code{zeta} functions.
#' @name zeta
#' @return The number of true nulls is over-estimated as follows:
#' \describe{
#' \item{\code{zeta.DKWM}}{Inversion of the Dvoretzky-Kiefer-Wolfowitz-Massart inequality (related to the Storey estimator of the proportion of true nulls) with parameter \code{lambda}}
#' \item{\code{zeta.HB}}{Number of conserved hypotheses by the Holm-Bonferroni procedure with parameter \code{lambda}}
#' \item{\code{zeta.kBonf}}{Number of conserved hypotheses by the \eqn{k}-Bonferroni procedure with parameter \code{lambda}, plus \eqn{k-1}}
#' \item{\code{zeta.trivial}}{The size of the p-value set which is the trivial upper bound (\code{lambda} is not used)}
#' }
#' @details The \eqn{k}-Bonferroni procedure controls the \eqn{k}-Familywise Error Rate (FWER) at the desired level, hence the number of conserved hypotheses, plus \eqn{k-1},
#' is a suitable upper bound (because up to \eqn{k-1} rejected hypotheses are also true nulls). For \eqn{k=1} (the default), it is the regular Bonferroni procedure.
#' The Holm-Bonferroni procedure controls the FWER, hence the number of conserved hypotheses is a suitable upper bound.
#' @references Durand, G., Blanchard, G., Neuvial, P., & Roquain, E. (2020). Post hoc false positive control for structured hypotheses. Scandinavian Journal of Statistics, 47(4), 1114-1148.
#' @references Dvoretzky, A., Kiefer, J., and Wolfowitz, J. (1956). Asymptotic minimax character of the sample distribution function and of the classical multinomial estimator. The Annals of Mathematical Statistics, pages 642-669.
#' @references Holm, S. (1979). A simple sequentially rejective multiple test procedure. Scandinavian Journal of Statistics, 6(2), 65-70.
#' @references Massart, P. (1990). The tight constant in the Dvoretzky-Kiefer-Wolfowitz inequality. The Annals of Probability, pages 1269-1283.
#' @references Storey, J. D. (2002). A direct approach to false discovery rates. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 64(3):479-498.
#' @references Durand G. (2025). A fast algorithm to compute a curve of confidence upper bounds for the False Discovery Proportion using a reference family with a forest structure. arXiv:2502.03849.
#' @examples
#' x <- rnorm(100, mean = c(rep(c(0, 2), each = 50)))
#' pval <- 1 - pnorm(x)
#' lambda <- 0.05
#' zeta.trivial(pval, lambda)
#' 
#' zeta.HB(pval, lambda)
#' 
#' zeta.DKWM(pval, lambda)
NULL

#' @rdname zeta
#' @export
zeta.HB <- function(pval, lambda, ...) {
  m <- length(pval)
  sorted.pval <- sort(pval)
  
  thresholds <- lambda / (m - 1:m + 1)
  v <- sorted.pval - thresholds
  indexes <- which(v > 0)
  if (! length(indexes)){
    return(0)
  }
  else{
    return(m - indexes[1] + 1)
  }
}
# TODO: zeta.HB.sorted that assumes that the pvalues are sorted and doesn't sort them

#' @rdname zeta
#' @export
zeta.kBonf <- function(pval, lambda, k=1, ...) {
  m <- length(pval)
  threshold <- lambda * k / m
  indexes <- which(pval <= threshold)
  bound <- m - length(indexes) + (k - 1)
  return(min(bound, m))
}

#' @rdname zeta
#' @export
zeta.trivial <- function(pval, lambda, ...) {
  return(length(pval))
}

#' @rdname zeta
#' @export
zeta.DKWM <- function(pval, lambda, ...) {
  s <- length(pval)
  sorted.pval <- c(0, sort(pval))
  dkwm <- min((sqrt(log(1/lambda)/2)/(2 * (1 - sorted.pval)) + 
                  sqrt(log(1/lambda)/(8 * (1 - sorted.pval)^2) + 
                         (s - seq(0, s))/(1 - sorted.pval)
                       )
                )^2,
              na.rm=TRUE)
  return(min(s, floor(dkwm)))
}
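
# Illustration (a minimal sketch, not part of the package code): any function with
# the (pval, lambda, ...) signature can be used as the `method` argument of
# zetas.tree(). The wrapper below (a hypothetical helper) fixes k = 2 in the
# k-Bonferroni bound.
zeta.kBonf.2 <- function(pval, lambda, ...) {
  zeta.kBonf(pval, lambda, k = 2)
}
set.seed(42)
pval <- c(runif(10, max = 1e-4), runif(10))  # 10 strong signals, 10 uniform p-values
zeta.kBonf.2(pval, lambda = 0.05)  # upper bound on the number of true nulls
zeta.HB(pval, lambda = 0.05)       # compare with the Holm-Bonferroni bound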

#' Number of unique regions in a reference family with forest structure
#' 
#' @param C A list of lists representing the forest structure. See [V.star()] for more information.
#' 
#' @return An integer, the number of regions.
#' 
#' @export
nb.elements <- function(C) {
  H <- length(C)
  count <- 0
  for (h in H:1) {
    count <- count + length(C[[h]])
  }
  return(count)
}
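
# Illustration (a minimal sketch, not part of the package code): a root region
# covering two child regions gives 3 regions in total.
C <- list(
  list(c(1, 4)),
  list(c(1, 2), c(3, 4))
)
nb.elements(C)  # expected: 3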

#' Upper bound on the number of true nulls in each node of a tree
#' 
#' @description
#' Takes a forest structure as input, given by the pair \code{C}, \code{leaf_list}, 
#' and returns the corresponding \eqn{\zeta_k}'s according to the method(s) chosen.
#' 
#' @param C A list of lists representing the forest structure. See [V.star()] for more information.
#' @param leaf_list A list of vectors representing the atoms of the forest structure. See [V.star()] for more information.
#' @param method A function with arguments \code{(pval, lambda)} that computes an upper bound on the number of false positives in the region associated to 
#' the \eqn{p}-values \code{pval} at confidence level \code{1 - lambda}. It can also be a list of such functions, where the \code{h}-th function 
#' is used at depth \code{h} in the tree structure, that is, on the \eqn{R_k}'s represented by the elements of \code{C[[h]]}. Finally, it can also be a list 
#' of lists of such functions, mimicking the structure of \code{C} itself, that is, \code{method[[h]][[j]]} is applied to the \eqn{R_k} represented by \code{C[[h]][[j]]}.
#' @param pvalues A vector of \eqn{p}-values, must be of size \code{m}, with \code{m} the highest element found in the vectors of \code{leaf_list}.
#' @param alpha A target error level in \eqn{]0,1[}.
#' @param refine A boolean, \code{FALSE} by default. Whether to use the step-down refinement to try to produce smaller \eqn{\zeta_k}'s, see Details.
#' @param verbose A boolean, \code{FALSE} by default. Whether to print information about the (possibly multiple) round(s) of step-down refinement.
#' @param ... Additional arguments that may be passed to specific \code{zeta} functions.
#' 
#' @return \code{ZL}: A list of integer vectors representing the upper bounds \eqn{\zeta_k} of the forest structure. See [V.star()] for more information.
#' 
#' @details The number of true nulls in each node is bounded via a union bound over the regions. 
#' That is, the provided method(s) is/are applied at level \eqn{\frac{\alpha}{K}}, where \eqn{K} is the number of regions. 
#' In the step-down refinement, if some \eqn{R_k} has an associated \eqn{\zeta_k=0}, that is, a region believed to 
#' contain only false null hypotheses, it can be removed and the \eqn{\zeta_k}'s recomputed using \eqn{K-1} instead of 
#' \eqn{K} in the union bound, and so on, until the "effective" number of regions no longer decreases.
#'
#' @references Durand, G., Blanchard, G., Neuvial, P., & Roquain, E. (2020). Post hoc false positive control for structured hypotheses. Scandinavian Journal of Statistics, 47(4), 1114-1148.
#' @references Durand, G. (2018). Multiple testing and post hoc bounds for heterogeneous data. PhD thesis, see Appendix B.2 for the step-down refinement.
#' @references Durand G. (2025). A fast algorithm to compute a curve of confidence upper bounds for the False Discovery Proportion using a reference family with a forest structure. arXiv:2502.03849.
#' @export
#' @examples
#' m <- 1000
#' dd <- dyadic.from.window.size(m, s = 10, method = 2)
#' leaf_list <- dd$leaf_list
#' pvalues <- runif(m)
#' C <- dd$C
#' method <- zeta.trivial
#' ZL <- zetas.tree(C, leaf_list, method, pvalues, alpha = 0.05)
#' ZL
zetas.tree <- function(C, leaf_list, method, pvalues, alpha, refine=FALSE, verbose=FALSE, ...) {
  H <- length(C)
  K <- nb.elements(C)
  ZL <- list()
  new_K <- K
  continue <- TRUE
  nb_loop <- 0
  while (continue) {
    usage_K <- new_K
    new_K <- K
    for (h in H:1) {
      Ch <- C[[h]]
      len <- length(Ch)
      zeta_inter <- numeric(len)
      if (len > 0) {
        for (k in 1:len) {
          Rk <- Ch[[k]]
          pval <- pvalues[unlist(leaf_list[Rk[1]:Rk[2]])]
          args_zeta = list(...)
          if ("pCDFlist" %in% names(args_zeta)){
            args_zeta$pCDFlist <- args_zeta$pCDFlist[unlist(leaf_list[Rk[1]:Rk[2]])]
          }
          if(typeof(method) == "list") {
            if(typeof(method[[h]]) == "list") {
              zeta_method <- method[[h]][[k]]
            }
            else {
              zeta_method <- method[[h]]
            }
          }
          else {
            zeta_method <- method
          }
          zeta_inter[k] <- do.call(zeta_method, c(list(pval = pval, lambda = alpha / usage_K), args_zeta))
          if (zeta_inter[k] == 0)
            new_K <- new_K - 1
        }
      }
      ZL[[h]] <- zeta_inter
    }
    if (verbose) {
      nb_loop <- nb_loop + 1
      print(paste0("loop number=", nb_loop, ", usage_K=", usage_K, ", new_K=", new_K))
    }
    continue <- refine && (new_K < usage_K)
  }
  return(ZL)
}
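
# Illustration (a minimal sketch, not part of the package code): `method` can be a
# list with one zeta function per depth. The per-depth assignment below is an
# arbitrary choice for illustration (DKWM on all levels except the deepest one,
# which gets the trivial bound), with the step-down refinement switched on.
set.seed(42)
m <- 1000
dd <- dyadic.from.window.size(m, s = 10, method = 2)
pvalues <- c(runif(m / 2, max = 1e-3), runif(m / 2))
H <- length(dd$C)
methods <- c(rep(list(zeta.DKWM), H - 1), list(zeta.trivial))
ZL <- zetas.tree(dd$C, dd$leaf_list, methods, pvalues, alpha = 0.05,
                 refine = TRUE, verbose = TRUE)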

#' Post hoc bound on the number of false positives
#' 
#' @description Computes the post hoc upper bound \eqn{V^*(S)} on the number of false positives in a 
#' given selection set \eqn{S} of hypotheses, using a reference family \eqn{(R_k, \zeta_k)} that possesses a forest structure
#' (see References).
#' 
#' @param S An integer vector with the indices of the hypotheses of the selection set. Does not need to be ordered.
#' @param C A list of lists representing the forest structure. Each list of \code{C} represents a level of
#' depth in the forest structure. So, \code{C[[1]]} lists the regions at depth 1, \code{C[[2]]} lists the regions at depth 2,
#' and so on. We then use the fact that each region of the reference family
#' is the union of consecutive atoms, indexed by an integer interval. 
#' So the elements of each list \code{C[[i]]} are integer vectors of length 2 representing this interval. 
#' For example, \code{C[[1]][[1]] = c(1, 5)} means that the first region at depth 1 is the union of the first 5 atoms, 
#' \code{C[[2]][[3]] = c(4, 5)} means that the third region at depth 2 is the union of atoms 4 and 5, and
#' \code{C[[3]][[5]] = c(5, 5)} means that the fifth region at depth 3 is simply the fifth atom. 
#' @param ZL A list of integer vectors representing the upper bound \eqn{\zeta_k} associated to a region \eqn{R_k} in the reference family. 
#' \code{ZL[[h]][j]} is the \eqn{\zeta_k} associated to the \eqn{R_k} described by \code{C[[h]][[j]]}.
#' @param leaf_list A list of vectors. Each vector is an integer array. The i-th vector contains the indices
#' of the hypotheses in the i-th atom. Atoms form a partition of the set of hypothesis indices: 
#' there cannot be any overlap, and each index has to be inside one of the atoms. It is very important, for all functions
#' of this package that use \code{leaf_list}, that each vector is sorted, because the functions 
#' implicitly rely on the fact that
#' \code{leaf_list[[i]][1]} is the smallest hypothesis index in leaf \eqn{P_i} and that 
#' \code{leaf_list[[i]][length(leaf_list[[i]])]} is the largest.
#' 
#' @return An integer value, the post hoc upper bound \eqn{V^*(S)}.
#' 
#' @details For \code{V.star}, the forest structure doesn't need to be complete. That is, 
#' in \code{C}, some trivial intervals \code{c(i,i)} corresponding to regions that are atoms may be missing. 
#' 
#' @references Durand, G., Blanchard, G., Neuvial, P., & Roquain, E. (2020). Post hoc false positive control for structured hypotheses. Scandinavian Journal of Statistics, 47(4), 1114-1148.
#' @examples
#' m <- 20
#' C <- list(
#'   list(c(2, 5), c(8, 15), c(16, 19)),
#'   list(c(3, 5), c(8, 10), c(12, 15), c(16, 16), c(17, 19)),
#'   list(c(4, 5), c(8, 9), c(10, 10), c(12, 12), c(13, 15), c(17, 17), c(18, 19)),
#'   list(c(8, 8), c(9, 9), c(13, 13), c(14, 15), c(18, 18), c(19, 19))
#' )
#' ZL <- list(
#'   c(4, 8, 4),
#'   c(3, 3, 4, 1, 3),
#'   c(2, 2, 1, 1, 2, 1, 2),
#'   c(1, 1, 1, 2, 1, 1)
#' )
#' leaf_list <- as.list(1:m)
#' V.star(1, C, ZL, leaf_list)
#' 
#' V.star(1:5, C, ZL, leaf_list)
#' 
#' V.star(13:15, C, ZL, leaf_list)
#' 
#' V.star(1:m, C, ZL, leaf_list)
#' @export
V.star <- function(S, C, ZL, leaf_list) {
  H <- length(C)
  nb_leaves <- length(leaf_list)
  Vec <- numeric(nb_leaves) 
  # the initialization term for each atom P_i
  # is equivalent to completing the family if it isn't complete,
  # assuming that leaf_list does indeed contain all leaves
  # and that some may simply be missing from C and ZL;
  # this initialization also takes care of the minima
  # between \zeta_k and card(S inter R_k)
  for (i in 1:nb_leaves) {
    Vec[i] <- sum(leaf_list[[i]][length(leaf_list[[i]])] >= S)
  }
  Vec <- Vec - c(0, Vec[-nb_leaves])
  # this way of initializing is much faster than the following naive approach;
  # the downside is that it relies heavily on the elements of leaf_list being sorted
  # naive approach:
  # for (i in 1:nb_leaves) {
  #   Vec[i] <- sum(S %in% leaf_list[[i]])
  # }
  for (h in H:1) {
    nb_regions <- length(C[[h]])
    if (nb_regions > 0) {
      for (k in 1:nb_regions) {
        Rk <- C[[h]][[k]]
        sum_succ <- sum(Vec[Rk[1]:Rk[2]])
        res <- min(ZL[[h]][k], sum_succ)
        Vec[Rk[1]:Rk[2]] <- 0
        Vec[Rk[1]] <- res
      }
    }
  }
  return(sum(Vec))
}
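
# Illustration (a minimal sketch, not part of the package code): the fast
# initialization used above matches the naive membership count, provided that the
# vectors in leaf_list are sorted.
leaf_list <- list(1:3, 4:4, 5:8)
S <- c(2, 5, 7, 8)
nb_leaves <- length(leaf_list)
Vec <- sapply(leaf_list, function(leaf) sum(leaf[length(leaf)] >= S))
Vec <- Vec - c(0, Vec[-nb_leaves])
naive <- sapply(leaf_list, function(leaf) sum(S %in% leaf))
all.equal(Vec, naive)  # expected: TRUE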

#' Prune a forest structure to speed up computations
#' 
#' @description The pruned forest structure makes the computation of [V.star()],
#' [curve.V.star.forest.naive()] and [curve.V.star.forest.fast()] faster.
#' 
#' @param C A list of lists representing the forest structure. See [V.star()] for more information.
#' @param ZL A list of integer vectors representing the upper bounds \eqn{\zeta_k} of the forest structure. See [V.star()] for more information.
#' @param leaf_list A list of vectors representing the atoms of the forest structure. See [V.star()] for more information.
#' @param prune.leafs A boolean, \code{FALSE} by default. If \code{TRUE}, will also prune atoms/leaves for which \eqn{\zeta_k \geq |R_k|}.
#' This makes the computation of [V.star()] and [curve.V.star.forest.naive()] even faster, but should not be used with [curve.V.star.forest.fast()],
#' because that function needs the structure to be complete (i.e., with all its atoms). This is why the default option is \code{FALSE}.
#' @param delete.gaps A boolean, \code{FALSE} by default. If \code{TRUE}, will also delete the gaps in the structure induced 
#' by the pruning, see [delete.gaps()].
#' 
#' @return A list with three named elements. \describe{
#' \item{\code{VstarNm}}{\eqn{V^*(\mathbb N_m)}, which is computed as a by-product by the algorithm, so it is returned as well.}
#' \item{\code{C}}{The new \code{C} after pruning.}
#' \item{\code{ZL}}{The new \code{ZL} after pruning.}
#' }
#' 
#' @references Durand, G., Blanchard, G., Neuvial, P., & Roquain, E. (2020). Post hoc false positive control for structured hypotheses. Scandinavian Journal of Statistics, 47(4), 1114-1148.
#' @references Durand G. (2025). A fast algorithm to compute a curve of confidence upper bounds for the False Discovery Proportion using a reference family with a forest structure. arXiv:2502.03849.
#' @export
pruning <- function(C, ZL, leaf_list, prune.leafs = FALSE, delete.gaps = FALSE) {
  H <- length(C)
  nb_leaves <- length(leaf_list)
  Vec <- numeric(nb_leaves) 
  for (i in 1:nb_leaves) {
    Vec[i] <- length(leaf_list[[i]])
    # The initialization term for each atom P_i
    # is equivalent to completing the family if it isn't complete,
    # assuming that leaf_list does indeed contain all leaves
    # and that some may simply be missing from C and ZL.
    # Using the length of the atoms ensures that we 
    # will catch atoms whose \zeta_i is larger than their size
  }
  for (h in H:1) {
    nb_regions <- length(C[[h]])
    if (nb_regions > 0) {
      for (j in nb_regions:1) {
        Chj <- C[[h]][[j]]
        candidate <- ZL[[h]][j]
        sum_succ <- sum(Vec[Chj[1]:Chj[2]])
        if (candidate >= sum_succ) {
          res <- sum_succ
          if(prune.leafs || (Chj[1] < Chj[2])){
            # Either the region is not a leaf, 
            # or it is a leaf and we chose to prune the leaves,
            # so we prune.
            ZL[[h]] <- ZL[[h]][-j]
            C[[h]][[j]] <- NULL
          }
          else{
            # The region is a leaf/atom and we chose
            # to keep the leaves, so we keep it,
            # but we change its \zeta_i to the length of P_i.
            ZL[[h]][j] <- sum_succ
          }
        } else {
          res <- candidate
        }
        Vec[Chj[1]:Chj[2]] <- 0
        Vec[Chj[1]] <- res
      }
    }
  }
  if (delete.gaps) {
    gaps.deleted <- delete.gaps(C, ZL, leaf_list)
    C <- gaps.deleted$C
    ZL <- gaps.deleted$ZL
  }
  return(list(VstarNm = sum(Vec),
              C = C,
              ZL = ZL
  )
  )
}
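
# Illustration (a minimal sketch, not part of the package code): pruning should not
# change the bound, and V^*(N_m) is obtained for free as a by-product.
set.seed(42)
m <- 100
dd <- dyadic.from.window.size(m, s = 5, method = 2)
pvalues <- c(runif(m / 2, max = 1e-3), runif(m / 2))
ZL <- zetas.tree(dd$C, dd$leaf_list, zeta.DKWM, pvalues, alpha = 0.05)
pruned <- pruning(dd$C, ZL, dd$leaf_list)
pruned$VstarNm == V.star(1:m, dd$C, ZL, dd$leaf_list)  # expected: TRUE
S <- sample(m, 30)
V.star(S, pruned$C, pruned$ZL, dd$leaf_list) ==
  V.star(S, dd$C, ZL, dd$leaf_list)                    # expected: TRUE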

#' Delete the gaps induced by pruning
#' 
#' @description
#' A small optimization that can be applied after pruning
#' to further speed up computations: it removes the gaps introduced by the pruning.
#' 
#' @details
#' See [pruning()].
#' 
#' @param C A list of lists representing the forest structure. See [V.star()] for more information.
#' @param ZL A list of integer vectors representing the upper bounds \eqn{\zeta_k} of the forest structure. See [V.star()] for more information.
#' @param leaf_list A list of vectors representing the atoms of the forest structure. See [V.star()] for more information.
#' 
#' @return A list with two named elements. \describe{
#' \item{\code{C}}{The new \code{C} after deleting the gaps.}
#' \item{\code{ZL}}{The new \code{ZL} after deleting the gaps.}
#' }
#' 
#' @usage delete.gaps(C, ZL, leaf_list)
#' 
#' @export
delete.gaps <- function(C, ZL, leaf_list) {
  H <- length(C)
  nb_leaves <- length(leaf_list)
  continue <- TRUE
  newC <- list()
  newZL <- list()
  loop.counter <- 1
  while(continue) {
    newC[[loop.counter]] <- list()
    newZL[[loop.counter]] <- numeric(0)
    leaf.available <- rep(TRUE, nb_leaves) # all leaves are available at the start of each pass
    regions.delete <- list()
    for (h in 1:H) {
      nb_regions <- length(C[[h]])
      if (nb_regions > 0) {
        for (l in 1:nb_regions) {
          Chl <- C[[h]][[l]]
          if (all(leaf.available[Chl[1]:Chl[2]])) {
            newC[[loop.counter]][[Chl[1]]] <- Chl
            newZL[[loop.counter]][Chl[1]]  <- ZL[[h]][[l]]
            leaf.available[Chl[1]:Chl[2]] <- FALSE
            regions.delete <- append(regions.delete, list(c(h, l)))
          }
        }
      }
    }
    for (couple in rev(regions.delete)) {
      h <- couple[1]
      l <- couple[2]
      C[[h]][[l]] <- NULL
      ZL[[h]] <- ZL[[h]][-l]
    }
    if(length(newC[[loop.counter]]) > 0) {
      for(l in length(newC[[loop.counter]]):1) {
        if (is.null(newC[[loop.counter]][[l]])) {
          newC[[loop.counter]][[l]] <- NULL
          newZL[[loop.counter]] <- newZL[[loop.counter]][-l]
        }
      }
    }
    loop.counter <- loop.counter + 1
    continue <- any(sapply(X = ZL, FUN = function(vec){length(vec) > 0}))
  }
  return(list(C = newC, ZL = newZL))
}
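
# Illustration (a minimal sketch, not part of the package code): after pruning,
# delete.gaps() re-packs the remaining regions level by level; the bound itself
# should be unchanged.
set.seed(42)
m <- 100
dd <- dyadic.from.window.size(m, s = 5, method = 2)
pvalues <- c(runif(m / 2, max = 1e-3), runif(m / 2))
ZL <- zetas.tree(dd$C, dd$leaf_list, zeta.DKWM, pvalues, alpha = 0.05)
pruned <- pruning(dd$C, ZL, dd$leaf_list)
packed <- delete.gaps(pruned$C, pruned$ZL, dd$leaf_list)
S <- sample(m, 30)
V.star(S, packed$C, packed$ZL, dd$leaf_list) ==
  V.star(S, pruned$C, pruned$ZL, dd$leaf_list)  # expected: TRUE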

#' Compute a curve of post hoc bounds based on a reference family with forest structure
#' 
#' @name curve.V.star.forest
#' 
#' @description
#' Computes the post hoc upper bound \eqn{V^*(S_t)} on the number of false positives in a 
#' given sequence of selection sets \eqn{S_t} of hypotheses, such that
#' \eqn{S_t\subset S_{t+1}} and \eqn{|S_t| = t}, 
#' using a reference family \eqn{(R_k, \zeta_k)} that possesses a forest structure
#' (see References).
#' 
#' @details Two functions are available
#' \describe{
#' \item{\code{curve.V.star.forest.naive}}{Repeatedly calls [V.star()] on each \eqn{S_t}, which is not optimized and is time-consuming; this should not be used in practice.}
#' \item{\code{curve.V.star.forest.fast}}{A fast and optimized version that leverages the fact that \eqn{S_{t+1}} is the union of \eqn{S_t} and a single hypothesis index. 
#' The algorithm needs to work on a complete forest, so this version first completes the forest (unless told that the forest has already been completed, see [forest.completion()]). 
#' The completion fails if the input is a pruned forest (see [pruning()]), 
#' so if a pruned forest is given as input, this MUST be signaled with the \code{is.pruned} argument 
#' so that the function skips the completion step (which means that the pruned forest given as input must already be complete).}
#' }
#'
#' @param perm An integer vector of elements in \code{1:m}, all different, and of size up to \code{m} (in which case it's a permutation, hence the name). 
#' The set \eqn{S_t} is represented by \code{perm[1:t]}.
#' @param C A list of lists representing the forest structure. See [V.star()] for more information.
#' @param ZL A list of integer vectors representing the upper bounds \eqn{\zeta_k} of the forest structure. See [V.star()] for more information.
#' @param leaf_list A list of vectors representing the atoms of the forest structure. See [V.star()] for more information.
#' @param pruning A boolean, \code{FALSE} by default. Whether to prune the forest (see [pruning()]) before computing the bounds. Ignored if \code{is.pruned} is \code{TRUE}.
#' @param is.pruned A boolean, \code{FALSE} by default. If \code{TRUE}, assumes that the forest structure has already been completed (see [forest.completion()]) and then pruned (see [pruning()])
#' and so skips the completion step and optional pruning step. Must be set to \code{TRUE} if giving a pruned forest, see Details.
#' @param is.complete A boolean, \code{FALSE} by default. If \code{TRUE}, assumes that the forest structure has already been completed (see [forest.completion()]) and so skips the completion step.
#' Ignored if \code{is.pruned} is \code{TRUE}.
#' @param delete.gaps A boolean, \code{FALSE} by default. If \code{TRUE}, will also delete the gaps in the structure induced 
#' by the pruning, see [delete.gaps()]. Ignored if \code{pruning} is \code{FALSE}.
#' 
#' @return A vector of the same length as \code{perm}, where the \code{t}-th
#' element is \eqn{V^*(S_t)}.
#' @references Durand, G., Blanchard, G., Neuvial, P., & Roquain, E. (2020). Post hoc false positive control for structured hypotheses. Scandinavian Journal of Statistics, 47(4), 1114-1148.
#' @references Durand G. (2025). A fast algorithm to compute a curve of confidence upper bounds for the False Discovery Proportion using a reference family with a forest structure. arXiv:2502.03849.
#' @export
#' @examples
#' m <- 20
#' C <- list(
#'   list(c(2, 5), c(8, 15), c(16, 19)),
#'   list(c(3, 5), c(8, 10), c(12, 15), c(16, 16), c(17, 19)),
#'   list(c(4, 5), c(8, 9), c(10, 10), c(12, 12), c(13, 15), c(17, 17), c(18, 19)),
#'   list(c(8, 8), c(9, 9), c(13, 13), c(14, 15), c(18, 18), c(19, 19))
#' )
#' ZL <- list(
#'   c(4, 8, 4),
#'   c(3, 3, 4, 1, 3),
#'   c(2, 2, 1, 1, 2, 1, 2),
#'   c(1, 1, 1, 2, 1, 1)
#' )
#' leaf_list <- as.list(1:m)
#' curve.V.star.forest.naive(1:m, C, ZL, leaf_list, pruning = FALSE)
#' 
#' curve.V.star.forest.naive(1:m, C, ZL, leaf_list, pruning = TRUE)
#' 
#' curve.V.star.forest.fast(1:m, C, ZL, leaf_list, pruning = FALSE)
#' 
#' curve.V.star.forest.fast(1:m, C, ZL, leaf_list, pruning = TRUE)
NULL

#' @rdname curve.V.star.forest
#' @export
curve.V.star.forest.naive <- function(perm, C, ZL, leaf_list, pruning = FALSE, delete.gaps = FALSE){
  vstars <- numeric(length(perm))
  
  # the naive version doesn't need a proper completion of the
  # forest structure because V.star()
  # implicitly completes it, and for the same reason
  # it can use the more aggressive pruning (prune.leafs = TRUE)
  
  if (pruning){
    pruned <- pruning(C, ZL, leaf_list, prune.leafs = TRUE, delete.gaps = delete.gaps)
    C <- pruned$C
    ZL <- pruned$ZL
    m <- length(unlist(leaf_list))
    if(length(perm) == m){
      # means that the last bound to compute
      # is V^*({1, ..., m}),
      # but the pruning already computed
      # V^*({1, ..., m}) as a by-product so we
      # might as well use it:
      vstars[m] <- pruned$VstarNm
      perm <- perm[-m]
    }
  }
  
  S <- numeric(0)
  j <- 0
  for (t in perm){
    j <- j + 1
    S <- c(S, t)
    vstars[j] <- V.star(S, C, ZL, leaf_list) 
  }
  return(vstars)
}

#' @rdname curve.V.star.forest
#' @export
curve.V.star.forest.fast <- function(perm, C, ZL, leaf_list, pruning = FALSE, is.pruned = FALSE, is.complete = FALSE, delete.gaps = FALSE){
  
  vstars <- numeric(length(perm))
  
  if (! is.pruned) {
    
    if (! is.complete) {
      # the fast version needs a proper completion of the
      # forest structure, and for the same reason
      # it must not prune the leaves
      completed <- forest.completion(C, ZL, leaf_list)
      C <- completed$C
      ZL <- completed$ZL
    }
    
    if (pruning) {
      is.pruned <- TRUE # useless atm, but kept for clarity
      pruned <- pruning(C, ZL, leaf_list, prune.leafs = FALSE, delete.gaps = delete.gaps)
      C <- pruned$C
      ZL <- pruned$ZL
      m <- length(unlist(leaf_list))
      
      if (length(perm) == m) {
        # means that the last bound to compute
        # is V^*({1, ..., m}),
        # but the pruning already computed
        # V^*({1, ..., m}) as a by-product so we
        # might as well use it:
        vstars[m] <- pruned$VstarNm
        perm <- perm[-m]
      }
    }
  }
  
  H <- length(C)
  m <- length(unlist(leaf_list))
  
  # preparation of the initial value of the etas (a copy of zetas but 
  # with only zeroes), of the initial value of K^- (with the R_k's
  # such that zeta_k = 0) and of an m x H matrix M such that
  # M[i, h] gives the index k of C[[h]] such that hypothesis i
  # is in the R_k given by C[[h]][[k]]
  etas <- ZL
  K.minus <- list()
  M <- matrix(0, ncol = H, nrow = m)
  for (h in 1:H){
    zeta_depth_h <- ZL[[h]]
    length_zeta_depth_h <- length(zeta_depth_h)
    etas[[h]] <- rep(0, length_zeta_depth_h)
    K.minus[[h]] <- vector("list", length(C[[h]]))
    if (length_zeta_depth_h > 0){
      for (k in 1:length_zeta_depth_h){
        if (zeta_depth_h[k] == 0){
          K.minus[[h]][[k]] <- C[[h]][[k]]
        }
        first_leaf <- leaf_list[[C[[h]][[k]][1]]]
        last_leaf <- leaf_list[[C[[h]][[k]][2]]]
        M[first_leaf[1]:last_leaf[length(last_leaf)], h] <- k
      }
    }
  }
  
  for (t in 1:length(perm)) {
    
    i.t <- perm[t]
    if (t > 1) {
      previous.vstar <- vstars[t - 1]
    } else {
      previous.vstar <- 0
    }
    
    # SEARCHING IF i_t IS IN K.MINUS
    # if so, we let go.next = TRUE
    # and we just go next to step t+1
    go.next <- FALSE
    for (h in 1:H) {
      k <- M[i.t, h]
      if ((k > 0) && (! is.null(K.minus[[h]][[k]]))){
        go.next <- TRUE
        break
      }
    }
    # print(paste0(i.t, " isn't in K minus"))
    
    # COMPUTING V.STAR AND UPDATING K.MINUS AND ETAS
    if (go.next) {
      # Here, i_t is in K.minus
      vstars[t] <- previous.vstar
    } else {
      # Here, i_t isn't in K.minus
      for (h in 1:H) {
        k <- M[i.t, h]
        if (k > 0){
          # if k == 0,
          # there is no k^{(t,h)}, either because there is a 
          # gap in the structure (maybe because of pruning, or because
          # of the way the structure has been built),
          # or because we are past the maximal depth of i.t;
          # in this case we just don't do anything
          etas[[h]][[k]] <- etas[[h]][[k]] + 1
          if(etas[[h]][[k]] >= ZL[[h]][[k]]){
            K.minus[[h]][[k]] <- C[[h]][[k]]
            break
          }
        }
      }
      vstars[t] <- previous.vstar + 1
    }
    
  }
  return(vstars)
}
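
# Illustration (a minimal sketch, not part of the package code): on a family built
# with zetas.tree(), the fast and naive implementations should return the same
# curve of bounds along the p-value ordering.
set.seed(42)
m <- 100
dd <- dyadic.from.window.size(m, s = 5, method = 2)
pvalues <- c(runif(m / 2, max = 1e-3), runif(m / 2))
ZL <- zetas.tree(dd$C, dd$leaf_list, zeta.DKWM, pvalues, alpha = 0.05)
perm <- order(pvalues)  # select hypotheses by increasing p-value
fast <- curve.V.star.forest.fast(perm, dd$C, ZL, dd$leaf_list)
naive <- curve.V.star.forest.naive(perm, dd$C, ZL, dd$leaf_list)
all(fast == naive)  # expected: TRUE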

#' Complete a forest structure
#' 
#' @description Completes the forest in the sense of the References: adds the missing atoms/leaves 
#' to the reference family with a forest structure \eqn{(R_k, \zeta_k)} 
#' so that each atom is represented by some \eqn{R_k}. The associated \eqn{\zeta_k} is 
#' taken to be the trivial bound \eqn{|R_k|}.
#' 
#' @details The forest must not be pruned (with [pruning()]) beforehand. The code will not behave as expected 
#' and will return an incorrect result if a pruned forest is given as input. The function could perhaps be rewritten 
#' to go from the leaves to the roots rather than the other way around, to avoid this issue.
#' 
#' @param C A list of lists representing the forest structure. See [V.star()] for more information.
#' @param ZL A list of integer vectors representing the upper bounds \eqn{\zeta_k} of the forest structure. See [V.star()] for more information.
#' @param leaf_list A list of vectors representing the atoms of the forest structure. See [V.star()] for more information.
#' 
#' @return A list with two named elements. \describe{
#' \item{\code{C}}{The new \code{C} after completion.}
#' \item{\code{ZL}}{The new \code{ZL} after completion.}
#' }
#' 
#' @references Durand, G., Blanchard, G., Neuvial, P., & Roquain, E. (2020). Post hoc false positive control for structured hypotheses. Scandinavian Journal of Statistics, 47(4), 1114-1148.
#' @references Durand G. (2025). A fast algorithm to compute a curve of confidence upper bounds for the False Discovery Proportion using a reference family with a forest structure. arXiv:2502.03849.
#' @export
forest.completion <- function(C, ZL, leaf_list) {
  H <- length(C)
  
  leaves.to.place <- 1:length(leaf_list)
  len.to.place <- length(leaves.to.place)
  to.delete <- numeric(0)
  
  for (h in 1:H) {
    
    j <- 1
    l <- 1
    
    while (j <= len.to.place) {
      expected_leaf <- leaves.to.place[j]
      end_of_line <- l > length(C[[h]])
      if (! end_of_line) {Chl <- C[[h]][[l]]}
      if ((! end_of_line) && expected_leaf == Chl[[1]]) {
        if (expected_leaf == Chl[[2]]) {
          to.delete <- c(to.delete, j)
        }
        j <- j + Chl[2] - Chl[1] +1
      } else {
        C[[h]] <- append(C[[h]], list(c(expected_leaf, expected_leaf)), l - 1)
        ZL[[h]] <- append(ZL[[h]], length(leaf_list[[expected_leaf]]), l - 1)
        to.delete <- c(to.delete, j)
        j <- j + 1
      }
      l <- l + 1
    }
    
    leaves.to.place <- leaves.to.place[-to.delete]
    len.to.place <- length(leaves.to.place)
    to.delete <- numeric(0)
    
  }
  
  if (len.to.place > 0) {
    h <- H + 1
    C[[h]] <- list()
    ZL[[h]] <- numeric(0)
    for (expected_leaf in leaves.to.place) {
      C[[h]] <- append(C[[h]], list(c(expected_leaf, expected_leaf)))
      ZL[[h]] <- append(ZL[[h]], length(leaf_list[[expected_leaf]]))
    }
  }
  
  return(list(C = C, ZL = ZL))
}
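
# Illustration (a minimal sketch, not part of the package code): completing a
# forest adds one singleton region per missing atom, with the trivial zeta, so the
# bound is unchanged.
m <- 6
leaf_list <- as.list(1:m)
C <- list(list(c(1, 3), c(4, 6)), list(c(2, 3), c(5, 6)))
ZL <- list(c(2, 2), c(1, 1))
completed <- forest.completion(C, ZL, leaf_list)
nb.elements(completed$C)  # expected: 10 (the 4 original regions + one region per atom)
V.star(1:m, completed$C, completed$ZL, leaf_list) == V.star(1:m, C, ZL, leaf_list)
# expected: TRUE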