fdahotelling: Inference for Functional Data Analysis in R

#' Inference on the Mean for Multivariate and Functional Data
#'
#' These functions implement testing procedures for the mean of populations of
#' vectors or functions. In particular, when the optional parameter \code{B =
#' 0}, a parametric test based on Hotelling's statistic is used, while, if
#' \code{B > 0}, a permutation test is instead used and more statistics can be
#' chosen by the user. The parametric test assumes that the data is normally
#' distributed or that the sample size is large such that the central limit
#' theorem can be applied. The permutation test assumes that the distribution
#' from which the data come is symmetric.
#'
#' All the statistics of the \pkg{fdahotelling} package can be used with the
#' permutation version of this test. In addition, the user can implement its own
#' statistic to be used with the test, provided that: \itemize{ \item the
#' user-defined statistic takes the same input arguments as the ones in the
#' \pkg{fdahotelling} package, i.e., the mandatory first dataset \code{x}, the
#' optional second dataset \code{y} and the mean function under the null
#' hypothesis \code{mu} that can have a default value, \item high values of the
#' statistic yield rejection of the null hypothesis. }
#'
#' @inheritParams statistics
#' @param skip_check Flag to skip checks on input types. Default is off. This
#'   parameter should always be set to its default \code{FALSE}. It is set to
#'   \code{TRUE} only internally, in power calculation functions, in which data
#'   generated by the hypothesized model is already in matrix form.
#' @param B Number of bootstrap permutations (default: 1000).
#' @param statistic Statistic to be used within the permutation framework.
#'   Choices are Hotelling (default), L1, L2, Linf, StandardizedL1,
#'   StandardizedL2, StandardizedLinf and All.
#' @param verbose Activate verbose (default: \code{TRUE}).
#'
#' @return A 3-column \code{\link[tibble]{tibble}} containing the names of the
#'   required statistics and their values, along with an estimate of the p-value
#'   of the corresponding statistical test.
#'
#' @seealso The parametric test is described in details in Secchi, P., Stamm,
#'   A., & Vantini, S. (2013). \emph{Inference for the mean of large \eqn{p}
#'   small \eqn{n} data: A finite-sample high-dimensional generalization of
#'   Hotelling theorem}. Electronic Journal of Statistics, 7, pp. 2005-2031.
#'   doi:10.1214/13-EJS833, available online at
#'   \url{http://projecteuclid.org/euclid.ejs/1375708877}. The permutation test
#'   is described in details in the technical report by Pini, A., Stamm, A., &
#'   Vantini, S. (2015). \emph{Hotelling \eqn{T^2} in functional Hilbert
#'   spaces}, available online at
#'   \url{https://mox.polimi.it/publication-results/?id=524&tipo=add_qmox}.
#'
#' @examples
#' ####################
#' # One-Sample Tests #
#' ####################
#' x <- aneurisk %>%
#'  dplyr::filter(variable == "curvature" & group == "low") %>%
#'  dplyr::select(data) %>%
#'  purrr::flatten_df()
#' # Parametric test
#' test_onesample(x = x, mu = 0, step_size = 0.01, B = 0)
#' # Permutation test
#' test_onesample(x = x, mu = 0, step_size = 0.01, B = 100)
#' ####################
#' # Two-Sample Tests #
#' ####################
#' y <- aneurisk %>%
#'  dplyr::filter(variable == "curvature" & group == "up") %>%
#'  dplyr::select(data) %>%
#'  purrr::flatten_df()
#' # Parametric test
#' test_twosample(x = x, y = y, mu = 0, step_size = 0.01, B = 0)
#' # Permutation test
#' test_twosample(x = x, y = y, mu = 0, step_size = 0.01, B = 100)
#'
#' @name tests
NULL

#' @rdname tests
#' @export
test_onesample <- function(x,
                           mu = 0, step_size = 0, B = 1000,
                           statistic = "Hotelling", verbose = TRUE, skip_check = FALSE) {
  if (B == 0)
    return(parametric_test(
      x = x, mu = mu,
      step_size = step_size,
      skip_check = skip_check
    ))

  permutation_test(
    x = x, mu = mu,
    step_size = step_size,
    B = B,
    statistic = statistic,
    verbose = verbose,
    skip_check = skip_check
  )
}

#' @rdname tests
#' @export
test_twosample <- function(x, y,
                           mu = 0, paired = FALSE, step_size = 0, B = 1000,
                           statistic = "Hotelling", verbose = TRUE, skip_check = FALSE) {
  if (B == 0)
    return(parametric_test(
      x = x, y = y, mu = mu,
      paired = paired,
      step_size = step_size,
      skip_check = skip_check
    ))

  permutation_test(
    x = x, y = y, mu = mu,
    paired = paired,
    step_size = step_size,
    B = B,
    statistic = statistic,
    verbose = verbose,
    skip_check = skip_check
  )
}

parametric_test <- function(x, y = NULL, mu = NULL,
                            paired = FALSE, step_size = 0, skip_check = FALSE) {
  if (!skip_check) {
    l <- check_arguments(x = x, y = y, mu = mu, step_size = step_size)
    x <- l$x
    y <- l$y
    mu <- l$mu
  }
  oneSample <- dim(y)[2] == 0
  p <- dim(x)[2]
  if (!oneSample && dim(y)[2] != p)
    stop("Both groups should have the same number of components.")
  nx <- dim(x)[1]
  ny <- ifelse(oneSample, 1, dim(y)[1])
  stat <- stat_hotelling_impl(x = x, y = y, mu = mu,
                             paired = paired, step_size = step_size,
                             use_correction = TRUE)
  df1 <- min(nx + ny - 2, p)
  df2 <- abs(nx + ny - 2 - p) + 1
  pvalue <- 1 - stats::pf(stat, df1, df2)
  dplyr::data_frame(
    statName = names(stat),
    statVal = stat,
    pValue = pvalue
  )
}

permutation_test <- function(x, y = NULL, mu = 0,
                             paired = FALSE, step_size = 0,
                             B = 1000, statistic = "Hotelling", verbose = TRUE,
                             skip_check = FALSE) {
  if (!skip_check) {
    l <- check_arguments(x = x, y = y, mu = mu, step_size = step_size)
    x <- l$x
    y <- l$y
    mu <- l$mu
  }

  oneSample <- dim(y)[2] == 0

  stat_fun <- switch (statistic,
    "Hotelling" = stat_hotelling_impl,
    "L1" = stat_L1_impl,
    "L2" = stat_L2_impl,
    "Linf" = stat_Linf_impl,
    "StandardizedL1" = stat_L1_std_impl,
    "StandardizedL2" = stat_L2_std_impl,
    "StandardizedLinf" = stat_Linf_std_impl,
    "All" = stat_all_impl,
    stop("Unrecognised statistic. Please choose between Hotelling, L1, L2, Linf,
         StandardizedL1, StandardizedL2, StandardizedLinf and All.")
  )

  # Get statistic value
  stat <- stat_fun(x = x, y = y, mu = mu, paired = paired, step_size = step_size)

  if (oneSample)
    boot_data <- perm_onesample(x, mu, step_size, B, stat_fun, verbose)
  else
    boot_data <- perm_twosample(x, y, mu, paired, step_size, B, stat_fun, verbose)

  boot_data %>%
    dplyr::mutate(test = purrr::map(stat_boot, `>=`, stat)) %>%
    `$`(test) %>%
    purrr::transpose() %>%
    purrr::simplify_all() %>%
    purrr::map_dbl(mean) %>%
    tibble::tibble(statName = names(.), statVal = stat, pValue = .)
}

perm_onesample <- function(x,
                           mu,
                           step_size,
                           B,
                           statistic,
                           verbose) {
  n <- dim(x)[1]
  p <- dim(x)[2]

  # Decide on bootstrap vs exact test
  n_comb <- 2^n
  p_min <- 1 / min(B, n_comb)
  if (verbose)
    writeLines(paste(" - P-value resolution:", p_min))
  exactTest <- B >= n_comb
  if (exactTest) { # Case of exact test
    B <- t(expand.grid(rep(list(c(0, 1)), n)))
    if (verbose) {
      writeLines(" - Computing exact p-value.")
      writeLines(paste(" - P-value will never drop below", p_min))
    }
  } else { # Case of approximate test
    n_comb <- B
    if (verbose) {
      writeLines(paste(" - Computing approximate p-value using",
                       B, "random reflexions."))
      writeLines(paste(" - P-value will not drop below", 1 / n_comb, "in average."))
    }
  }

  matrixMu <- matrix(data = mu, nrow = n, ncol = p, byrow = TRUE)
  y <- matrix(nrow = 0, ncol = 0)

  tibble::tibble(bootId = seq_len(n_comb)) %>%
    dplyr::mutate(
      r = purrr::map(bootId, ~ if (exactTest) {
        B[, .x]
      } else {
        sample.int(n = 2, size = n, replace = TRUE) - 1
      }),
      data = purrr::map(r, ~ matrixMu + diag((-1)^(.x)) %*% (x - matrixMu)),
      stat_boot = purrr::map(data, statistic, y = y, mu = mu, step_size = step_size)
    )
}

perm_twosample <- function(x,
                           y,
                           mu,
                           paired,
                           step_size,
                           B,
                           statistic,
                           verbose) {
  if (dim(y)[2] != dim(x)[2])
    stop("Both groups should have the same number of components.")

  # Get sample sizes
  nx <- dim(x)[1]
  ny <- dim(y)[1]
  px <- nx / (nx + ny)
  py <- ny / (nx + ny)

  # Decide on MC vs exact test
  n_comb <- choose(nx + ny, nx)
  p_min <- 1 / min(B, n_comb)
  if (verbose)
    writeLines(paste(" - P-value resolution:", p_min))
  if (B >= n_comb) { # Case of exact test
    B <- combn(nx + ny, nx)
    if (verbose) {
      writeLines(" - Computing exact p-value.")
      writeLines(paste(" - P-value will never drop below", p_min))
    }
  } else { # Case of approximate test
    if (verbose) {
      writeLines(paste(" - Computing approximate p-value using",
                       B, "random permutations."))
      writeLines(paste(" - P-value will not drop below",
                       1 / n_comb, "in average."))
    }
  }

  # Generate boostrap data
  rbind(x, y) %>%
    partition(B, c(D1 = px, D2 = py)) %>%
    dplyr::mutate(
      stat_boot = purrr::map2(D1, D2, statistic, mu = mu,
                              paired = paired,
                              step_size = step_size)
    )
}