R/functions_main.R
In multiFANOVA: Multiple Contrast Tests for Functional Data

Documented in multiFANOVA

#' Multiple contrast tests for functional data
#'
#' The function \code{multiFANOVA()} calculates the globalizing pointwise Hotelling's \eqn{T^2}-test
#' (GPH) statistic for the global null hypothesis and multiple local null hypotheses.
#' Respective \eqn{p}-values are obtained by a parametric bootstrap strategy.
#'
#' @param x matrix of observations \eqn{n\times j} (\eqn{n = n_1 + ... + n_k}, \eqn{j} is a number of design time points).
#' @param gr_label a vector with group labels; the integer labels (from 1 to a number of groups) should be used.
#' @param h contrast matrix. For Dunnett’s and Tukey’s contrasts, it can be created by
#' the \code{contr_mat()} function from the package \code{GFDmcv} (see examples).
#' @param n_boot number of bootstrap samples.
#' @param alpha significance level.
#' @param parallel a logical indicating whether to use parallelization.
#' @param n_cores if \code{parallel = TRUE}, a number of processes used in parallel computation.
#' Its default value means that it will be equal to the number of cores of a computer used.
#'
#' @details The function \code{multiFANOVA()} concerns the tests for the heteroscedastic
#' contrast testing problem for functional data. The details are presented in Munko et al. (2023),
#' but here we present some summary of the problem and its solutions implemented in the package.
#'
#' Suppose we have \eqn{k} independent functional samples \eqn{x_{i1},\dots,x_{in_i}},
#' which consist of independent and identically distributed stochastic processes defined
#' on interval \eqn{[a,b]} with mean function \eqn{\eta_i} and covariance function
#' \eqn{\gamma_i} for each \eqn{i\in\{1,\dots,k\}}. Note that the covariance functions
#' of the different groups may differ from each other, i.e., heteroscedasticity is explicitly allowed.
#'
#' We consider the null and alternative hypothesis
#' \deqn{\mathcal H_0: \mathbf{H}\boldsymbol{\eta}(t) = 0 \text{ for all }
#' t\in [a,b] \quad \text{vs.} \quad \mathcal H_1: \mathbf{H}\boldsymbol{\eta}(t) \neq
#' 0 \text{ for some } t\in [a,b],} where \eqn{\mathbf{H} \in \mathbb{R}^{r \times k}}
#'  denotes a known contrast matrix, i.e., \eqn{\mathbf{H}\mathbf{1}_k = \mathbf{0}_r},
#'  \eqn{\boldsymbol{\eta} := (\eta_1,\dots,\eta_k)^{\top}} is the vector of the mean functions.
#'  The formulation of this testing framework is very general
#'  and contains many special cases like the analysis of variance for functional data
#'  (FANOVA) problem. In detail, we may choose \eqn{\mathbf{H} = \mathbf{P}_k} for the one-way FANOVA problem
#'  to test the null hypothesis of no main effect, where \eqn{\mathbf{P}_k:=\mathbf{I}_k-\mathbf{J}_k/k}
#'   with \eqn{\mathbf{I}_k \in\mathbb{R}^{k\times k}} denoting the unit matrix and
#'   \eqn{\mathbf{J}_k := \mathbf{1}_k\mathbf{1}_k^{\top} \in\mathbb{R}^{k\times k}}
#'   denoting the matrix of ones. However, there are different possible choices of the
#'   contrast matrix \eqn{\mathbf{H}} which lead to this global null hypothesis.
#'   Many-to-one comparisons can be considered by choosing Dunnett's contrast matrix
#'  \eqn{\mathbf{H} = [-\mathbf{1}_{k-1}, \mathbf{I}_{k-1}]}, where the mean
#'  functions \eqn{\eta_2,\dots,\eta_k} are compared to the mean function \eqn{\eta_1}
#'   of the first group regarding the different contrasts. To compare all pairs
#'   of mean functions \eqn{\eta_{i_1},\eta_{i_2}, i_1,i_2 \in\{1,\dots,k\}} with
#'   \eqn{i_1 \neq i_2}, the Tukey's contrast matrix:
#'  \deqn{\mathbf{H} = \begin{bmatrix}
#'  -1 & 1 & 0 & 0 & \cdots & \cdots & 0 \\
#'  -1 & 0 & 1 & 0 &\cdots & \cdots & 0 \\
#'  \vdots  & \vdots &\vdots & \vdots & \ddots & \vdots & \vdots  \\
#'  -1 & 0 & 0 & 0& \cdots & \cdots & 1\\
#'  0 & -1 & 1 & 0& \cdots & \cdots & 0 \\
#'  0 & -1 & 0 & 1& \cdots & \cdots & 0 \\
#'  \vdots  & \vdots & \vdots  & \vdots & \ddots & \vdots & \vdots \\
#'  0 & 0 & 0 & 0 & \cdots & -1 & 1
#'  \end{bmatrix}  \in \mathbb{R}^{k(k-1)/2 \times k}} can be used.
#'
#'  For this testing problem, we consider the pointwise Hotelling's \eqn{T^2}-test statistic
#'  \deqn{\mathrm{TF}_{n,\mathbf{H}}(t):= n(\mathbf{H}\boldsymbol{\widehat\eta}(t))^{\top}
#'  (\mathbf{H}\mathbf{\widehat \Sigma}(t,t) \mathbf{H}^{\top})^+
#'  \mathbf{H} \boldsymbol{\widehat\eta}(t)}
#'  for all \eqn{t \in[a,b]}, where \eqn{\boldsymbol{\widehat\eta} := (\widehat\eta_1,\dots,\widehat\eta_k)^{\top}}
#'  denotes the vector of all mean function estimators, \eqn{\mathbf{A}^+} denotes the
#'  Moore-Penrose inverse of the matrix \eqn{\mathbf{A}}, and
#'  \deqn{\boldsymbol{\widehat{\Sigma}}(t,s):= \mathrm{diag}\left( \frac{n}{n_1}\widehat{\gamma}_1(t,s),
#'  \ldots,\frac{n}{n_k}\widehat{\gamma}_k(t,s)\right),} \eqn{n=n_1+\dots+n_k}, \eqn{\widehat{\gamma}_i(t,s)}
#'  is the sample covariance function for the \eqn{i}-th group, \eqn{i=1,\dots,k}.
#'  Based on this pointwise Hotelling's \eqn{T^2}-test statistic, we construct the globalizing
#'  pointwise Hotelling's \eqn{T^2}-test (GPH) statistic by integrating over the pointwise
#'  Hotelling's \eqn{T^2}-test statistic, that is
#'  \deqn{T_{n}(\mathbf{H}) := \int_a^b \mathrm{TF}_{n,\mathbf{H}}(t) \,\mathrm{ d }t.}
#'  We consider the parametric bootstrap test based on this test statistic. However, for better post hoc
#'  analysis, we also consider the multiple contrast testing procedures. The main idea of multiple contrast
#'  tests is to split up the global null hypothesis with matrix
#'  \eqn{\mathbf{H}= [\mathbf{H}_1^{\top}, \dots, \mathbf{H}_r^{\top}]^{\top}} into \eqn{r} single contrast
#'  tests with contrast vectors \eqn{\mathbf{H}_1, \dots, \mathbf{H}_r \in\mathbb{R}^{k}}.
#'  This leads to the multiple testing problem with null hypotheses
#'  \deqn{\mathcal H_{0,{\ell}} : \; \mathbf{H}_{\ell} \boldsymbol{\eta}(t) =
#'  0 \;\text{ for all }t\in[a,b],	\text{for }\ell\in \{1,\ldots,r\}.}
#'  To verify this family of null hypotheses, we adopt two approaches. First, we simply apply
#'  the above test to each hypothesis \eqn{\mathcal H_{0,{\ell}}}, and the resulting \eqn{p}-values
#' are then corrected by the Bonferroni's method. However, this approach, denoted in the package as
#' GPH, may give conservative test and loss of power. Thus, we also consider the test adopting the
#' idea for the construction of simultaneous confidence bands proposed by Buhlmann (1998).
#' This test is denoted by mGPH in the package and is a more powerful solution than the GPH
#' procedure, which was shown in Munko et al. (2023).
#'
#' Note that the value of the test statistic for the mGPH test for global hypotheses is
#' equals to \deqn{\max_{\ell\in\{1,\ldots,r\}}\frac{T_{n}(\mathbf{H}_{\ell})}{q_{l,\widetilde{\beta}}^{\mathcal{P}}},}
#' where \eqn{q_{l,\widetilde{\beta}}^{\mathcal{P}}} are the quantiles calculated using
#' the adaptation of the method by Buhlmann (1998). The critical value for it is always 1.
#'
#' Please have a look at a summary function designed for this package. It can be used
#' to simplify the output of \code{multiFANOVA()} function.
#'
#' @return A list of class \code{multifanova} containing the following components:
#' \item{res_global}{a data frame containing the results for testing the global null hypothesis,
#' i.e., test statistics and \eqn{p}-values.}
#' \item{res_multi}{all results of multiple contrasts tests for particular hypothesis in
#' a contrast matrix \code{h}, i.e., test statistics, critical values and \eqn{p}-values.}
#' \item{k}{a number of groups.}
#' \item{j}{a number of design time points.}
#' \item{n}{a vector of sample sizes.}
#' \item{h}{an argument \code{h}.}
#' \item{h_boot}{an argument \code{n_boot}.}
#' \item{alpha}{an argument \code{alpha}.}
#'
#' @examples
#' # Some of the examples may run some time.
#'
#' # Canadian weather data set
#' # There are three samples of mean temperatures for
#' # fifteen weather stations in Eastern Canada,
#' # another fifteen in Western Canada, and
#' # the remaining five in Northern Canada.
#' library(fda)
#' data_set <- t(CanadianWeather$dailyAv[,, "Temperature.C"])
#' k <- 3
#' gr_label <- rep(c(1, 2, 3), c(15, 15, 5))
#' # trajectories of mean temperatures
#' matplot(t(data_set), type = "l", col = gr_label, lty = 1,
#'         xlab = "Day", ylab = "Temperature (C)",
#'         main = "Canadian weather data set")
#' legend("bottom", legend = c("Eastern Canada", "Western Canada", "Northern Canada"),
#'        col = 1:3, lty = 1)
#' \donttest{
#' # Tukey's contrast matrix
#' h_tukey <- GFDmcv::contr_mat(k, type = "Tukey")
#' # testing without parallel computing
#' res <- multiFANOVA(data_set, gr_label, h_tukey)
#' summary(res, digits = 3)
#' # plots for pointwise Hotelling's T^2-test statistics
#' oldpar <- par(mfrow = c(2, 2), mar = c(2, 2, 2, 0.1))
#' plot(ph_test_statistic(data_set, gr_label, h_tukey), type = "l",
#'      ylim = c(0, max(ph_test_statistic(data_set, gr_label, h_tukey))),
#'      main = "Global hypothesis")
#' plot(ph_test_statistic(data_set, gr_label, matrix(h_tukey[1, ], 1)), type = "l",
#'      ylim = c(0, max(ph_test_statistic(data_set, gr_label, h_tukey))),
#'      main = "Contrast 1")
#' plot(ph_test_statistic(data_set, gr_label, matrix(h_tukey[2, ], 1)), type = "l",
#'      ylim = c(0, max(ph_test_statistic(data_set, gr_label, h_tukey))),
#'      main = "Contrast 2")
#' plot(ph_test_statistic(data_set, gr_label, matrix(h_tukey[3, ], 1)), type = "l",
#'      ylim = c(0, max(ph_test_statistic(data_set, gr_label, h_tukey))),
#'      main = "Contrast 3")
#' par(oldpar)}
#' \dontshow{
#' h_tukey <- GFDmcv::contr_mat(k, type = "Tukey")
#' res <- multiFANOVA(data_set[, 1:10], gr_label, h_tukey, n_boot = 3)
#' summary(res, digits = 3)
#' # plots for pointwise Hotelling's T^2-test statistics
#' oldpar <- par(mfrow = c(2, 2), mar = c(2, 2, 2, 0.1))
#' plot(ph_test_statistic(data_set, gr_label, h_tukey), type = "l",
#'      ylim = c(0, max(ph_test_statistic(data_set, gr_label, h_tukey))),
#'      main = "Global hypothesis")
#' plot(ph_test_statistic(data_set, gr_label, matrix(h_tukey[1, ], 1)), type = "l",
#'      ylim = c(0, max(ph_test_statistic(data_set, gr_label, h_tukey))),
#'      main = "Contrast 1")
#' plot(ph_test_statistic(data_set, gr_label, matrix(h_tukey[2, ], 1)), type = "l",
#'      ylim = c(0, max(ph_test_statistic(data_set, gr_label, h_tukey))),
#'      main = "Contrast 2")
#' plot(ph_test_statistic(data_set, gr_label, matrix(h_tukey[3, ], 1)), type = "l",
#'      ylim = c(0, max(ph_test_statistic(data_set, gr_label, h_tukey))),
#'      main = "Contrast 3")
#' par(oldpar)}
#' \donttest{
#' # testing with parallel computing
#' library(doParallel)
#' res <- multiFANOVA(data_set, gr_label, h_tukey, parallel = TRUE, n_cores = 2)
#' summary(res, digits = 3)}
#' \dontshow{
#' res <- multiFANOVA(data_set[, 1:10], gr_label, h_tukey, n_boot = 10,
#'                    parallel = TRUE, n_cores = 2)
#' summary(res, digits = 3)}
#' \donttest{
#' # Dunnett's contrast matrix
#' h_dunnett <- GFDmcv::contr_mat(k, type = "Dunnett")
#' res <- multiFANOVA(data_set, gr_label, h_dunnett)
#' summary(res, digits = 3)
#' # plots for pointwise Hotelling's T^2-test statistics
#' oldpar <- par(mfrow = c(3, 1), mar = c(2, 2, 2, 0.1))
#' plot(ph_test_statistic(data_set, gr_label, h_dunnett), type = "l",
#'      ylim = c(0, max(ph_test_statistic(data_set, gr_label, h_dunnett))),
#'      main = "Global hypothesis")
#' plot(ph_test_statistic(data_set, gr_label, matrix(h_dunnett[1, ], 1)), type = "l",
#'      ylim = c(0, max(ph_test_statistic(data_set, gr_label, h_dunnett))),
#'      main = "Contrast 1")
#' plot(ph_test_statistic(data_set, gr_label, matrix(h_dunnett[2, ], 1)), type = "l",
#'      ylim = c(0, max(ph_test_statistic(data_set, gr_label, h_dunnett))),
#'      main = "Contrast 2")
#' par(oldpar)}
#'
#' @references Buhlmann P. (1998) Sieve bootstrap for smoothing in nonstationary time series.
#' Annals of Statistics 26, 48-83.
#'
#' Dunnett C. (1955) A multiple comparison procedure for comparing several treatments
#' with a control. Journal of the American Statistical Association 50, 1096-1121.
#'
#' Munko M., Ditzhaus M., Pauly M., Smaga L., Zhang J.T. (2023) General multiple tests for functional data. Preprint https://arxiv.org/abs/2306.15259
#'
#' Tukey J.W. (1953) The problem of multiple comparisons. Princeton University.
#'
#' @importFrom stats p.adjust quantile
#' @import doParallel
#' @import foreach
#' @import MASS
#' @import Matrix
#' @import GFDmcv
#' @import fda
#'
#' @export
multiFANOVA <- function(x, gr_label, h, n_boot = 1000, alpha = 0.05,
                        parallel = FALSE, n_cores = NULL) {
  if (parallel) {
    # #' @importFrom utils installed.packages
    # if (!("doParallel" %in% rownames(installed.packages()))) {
    #   stop("Please install package 'doParallel'")
    # }
    # require(foreach, quietly = TRUE)
    requireNamespace("doParallel", quietly = TRUE)
    requireNamespace("foreach", quietly = TRUE)
    if (is.null(n_cores)) {
      n_cores <- parallel::detectCores()
    } else if (!(n_cores > 1)) {
      stop("n_cores should be greater than 1")
    }
    cl <- parallel::makePSOCKcluster(n_cores)
    on.exit(parallel::stopCluster(cl))
    doParallel::registerDoParallel(cl)
  }

  jj <- ncol(x)
  n_vec <- as.numeric(table(gr_label))
  kk <- length(n_vec)
  nn <- sum(n_vec)
  num_hyp <- nrow(h)
  if (nn != nrow(x)) {
    stop("the number of observations in x and the length of gr_label are different")
  }
  if (!all.equal(rowSums(h), rep(0, num_hyp))) {
    stop("matrix h is not a contrast matrix - sums in rows are not equal to zero")
  }
  mc <- mean_cov(x, gr_label)
  gph_d <- numeric(num_hyp)
  for (i in seq_len(num_hyp)) {
    gph_d[i] <- gph_f(matrix(h[i, ], nrow = 1), mc)
  }
  gph <- gph_f(h, mc)
  # bootstrap
  if (!parallel) {
    gph_pb_d <- matrix(NA, num_hyp, n_boot)
    gph_pb <- numeric(n_boot)
    for (i_b in seq_len(n_boot)) {
      # parametric bootstrap
      x_pb <- MASS::mvrnorm(n_vec[1], numeric(jj), mc$gr_cov[[1]])
      for (i_pb in 2:kk) {
        x_pb <- rbind(x_pb,
                      MASS::mvrnorm(n_vec[i_pb], numeric(jj), mc$gr_cov[[i_pb]]))
      }
      mc_p <- mean_cov(x_pb, gr_label)
      for (i in seq_len(num_hyp)) {
        gph_pb_d[i, i_b] <- gph_f(matrix(h[i, ], nrow = 1), mc_p)
      }
      gph_pb[i_b] <- gph_f(h, mc_p)
    }
  } else {
    RNGkind(kind = "Mersenne-Twister", normal.kind = "Inversion")
    test_stat_boot <- foreach(i_b = 1:n_boot, .combine = rbind,
                              .packages = c("MASS", "multiFANOVA")) %dopar%
      {
        # parametric bootstrap
        x_pb <- MASS::mvrnorm(n_vec[1], numeric(jj), mc$gr_cov[[1]])
        for (i_pb in 2:kk) {
          x_pb <- rbind(x_pb,
                        MASS::mvrnorm(n_vec[i_pb], numeric(jj), mc$gr_cov[[i_pb]]))
        }
        mc_p <- mean_cov(x_pb, gr_label)
        gph_pb_d <- numeric(num_hyp)
        for (i in seq_len(num_hyp)) {
          gph_pb_d[i] <- gph_f(matrix(h[i, ], nrow = 1), mc_p)
        }
        gph_pb <- gph_f(h, mc_p)

        c(gph_pb, gph_pb_d)
      }
    gph_pb <- test_stat_boot[, 1]
    gph_pb_d <- t(test_stat_boot[, -1])
  }
  gph_pb_d_b <- numeric(num_hyp)
  gph_pb_d_cv <- numeric(num_hyp)
  for (i in seq_len(num_hyp)) {
    gph_pb_d_b[i] <- mean(gph_pb_d[i, ] > gph_d[i])
    gph_pb_d_cv[i] <- quantile(gph_pb_d[i, ], 1 - alpha / num_hyp)
  }

  # global hypothesis
  beta <- gph_pb_d_b[which.min(gph_pb_d_b)]
  data_order <- t(apply(gph_pb_d, 1, sort))
  j_low_b <- ceiling(beta * n_boot) # - 1
  if (n_boot == j_low_b) {
    A <- gph_pb_d / data_order[, 1]
  } else {
    A <- gph_pb_d / data_order[, n_boot - j_low_b]
  }
  mgph_p_val_gl <- mean(apply(A, 2, max) > 1)

  res_global <- data.frame(gph_ts = gph,
                           gph_pval = mean(gph_pb > gph),
                           mgph_ts = max(gph_d / crit_values_ls(gph_pb_d, alpha)),
                           mgph_pval = mgph_p_val_gl)

  # multi
  mgph_p_val <- numeric(num_hyp)
  for (i_beta in seq_len(num_hyp)) {
    beta <- gph_pb_d_b[i_beta]
    data_order <- t(apply(gph_pb_d, 1, sort))
    j_low_b <- ceiling(beta * n_boot)
    if (n_boot == j_low_b) {
      A <- gph_pb_d / data_order[, 1]
    } else {
      A <- gph_pb_d / data_order[, n_boot - j_low_b]
    }
    mgph_p_val[i_beta] <- mean(apply(A, 2, max) > 1)
  }

  res_multi <- data.frame(ts = gph_d,
                          gph_cv = gph_pb_d_cv,
                          gph_pval = p.adjust(gph_pb_d_b, "bonferroni"),
                          mgph_cv = crit_values_ls(gph_pb_d, alpha),
                          mgph_pval = mgph_p_val)

  result <- list(res_global = res_global, res_multi = res_multi,
                 k = kk, j = jj, n = n_vec,
                 h = h, n_boot = n_boot, alpha = alpha)
  class(result) <- "multifanova"
  return(result)
}