R/outlier_tests.R

Defines functions globaltest suptest sumtest multi_cutoff_to_fodr_vec counttest proptest multi_cutoff simes test_cpv

Documented in counttest globaltest multi_cutoff multi_cutoff_to_fodr_vec proptest simes sumtest suptest test_cpv

#' Critical and p-value for test statistic relative to simulated distribution
#'
#' \code{test_cpv} returns the critical value corresponding to a given
#' quantile of the simulated distribution and the p-value of the test statistic.
#'
#' @param dist A numeric vector of simulated values approximating the
#' distribution of the test statistic, e.g. generated as in \code{mvn_sup()}.
#' @param teststat A numeric value of the test statistic.
#' @param p A numeric vector of probabilities with values in [0,1] for which the
#' corresponding quantiles are calculated.
#'
#' @return A list with two named entries. \code{$pval} is the p-value of the
#' test statistic with respect to the distribution \code{dist}. \code{$q} is the
#' vector of sample quantiles in the distribution \code{dist} corresponding to
#' the probabilities specified in \code{p}.
#'
#' @keywords internal

test_cpv <- function(dist, teststat, p) {

  if (!is.numeric(dist) | !is.vector(dist)) {
    stop("Argument 'dist' must be a numeric vector.")
  }
  if (!is.numeric(teststat) | !identical(length(teststat), 1L)) {
    stop("Argument 'teststat' must be a single numeric value.")
  }
  if (!is.numeric(p) | !is.vector(p)) {
    stop("Argument 'p' must be a numeric vector.")
  }
  if (!all(p >= 0 & p <= 1)) {
    stop("Elements of 'p' must lie between 0 and 1.")
  }

  # find the p-value of the value of the test statistic
  pval <- mean(dist > teststat)

  # find the critical value corresponding to q (can be vector)
  dist <- sort(dist, decreasing = FALSE)
  sampleq <- stats::quantile(x = dist, probs = p, type = 8)

  return(list(pval = pval, critical = sampleq))

}


#' Simes (1986) procedure for multiple testing
#'
#' \code{simes()} takes a vector of p-values corresponding to individual null
#' hypotheses and performs the Simes (1986) procedure for the global null
#' hypothesis. The global null hypothesis is the intersection of all individual
#' null hypotheses.
#'
#' @param pvals A numeric vector of p-values corresponding to the p-values of
#' the individual null hypotheses.
#' @param alpha A numeric value representing the global significance level.
#'
#' @return \code{simes()} returns a list with three named elements.
#' \code{$reject} stores a logical value whether the global null hypothesis has
#' been rejected. \code{$alpha} stores the significance level that was chosen.
#' \code{$details} stores a matrix of the individual null hypothesis p-values,
#' the adjusted significance level according to Simes' procedure, and the
#' rejection decision for each individual hypothesis test.
#'
#' @details See
#' \href{https://academic.oup.com/biomet/article/73/3/751/250538}{Simes (1986)}.
#'
#' @keywords internal
#' @export

simes <- function(pvals, alpha) {

  if(!is.numeric(pvals) | !is.vector(pvals)) {
    stop("Argument 'pvals' must be a numeric vector.")
  }
  if(!all(pvals >= 0 & pvals <= 1)) {
    stop("Elements of 'pvals' must lie between 0 and 1.")
  }
  if(!is.numeric(alpha) | !identical(length(alpha), 1L)) {
    stop("Argument 'alpha' must be a single numeric value.")
  }
  if(!(alpha >= 0 & alpha <= 1)) {
    stop("Argument 'alpha' must lie between 0 and 1.")
  }

  pvals <- sort(pvals, decreasing = FALSE)
  alpha_adj <- seq(1:length(pvals)) * alpha / length(pvals)
  comp <- pvals <= alpha_adj
  reject <- any(comp)

  details <- data.frame(pvals = pvals, alpha_adj = alpha_adj, reject_adj = comp)
  out <- list(reject = reject, alpha = alpha, details = details)

}


#' Multiple models, varying cut-off
#'
#' \code{multi_cutoff()} runs several outlier detection algorithms that differ
#' in the value of the cut-off that determines whether an observation is
#' classified as an outlier or not.
#'
#' @param gamma A numeric vector representing the probability of falsely
#' classifying an observation as an outlier. One setting of the algorithm per
#' element of \code{gamma} is being run.
#' @param ... Arguments for specifying the other settings of the outlier
#' detection algorithm, \code{\link{outlier_detection}}.
#'
#' @details \code{mutli_cutoff} uses the
#'   \code{\link[foreach:foreach-package]{foreach}} and
#'   \code{\link[future:future]{future}} packages to run several models at the
#'   same time in parallel. This means the user has to register a backend and
#'   thereby determine how the code should be executed. The default is
#'   sequential, i.e. not in parallel. See
#'   \code{\link[future:plan]{future::plan()}} for details.
#'
#' @return A list containing the \code{robust2sls} objects, one per setting of
#' \code{gamma}. The length of the list therefore corresponds to the length of
#' the vector \code{gamma}.
#'
#' @importFrom foreach %dopar% %do%
#' @export

multi_cutoff <- function(gamma, ...) {

  if (!is.numeric(gamma) | !is.vector(gamma)) {
    stop("Argument 'gamma' must be a numeric vector.")
  }
  if (!all(gamma >= 0 & gamma <= 1)) {
    stop("Argument 'gamma' must lie between 0 and 1.")
  }

  # binding for i
  i <- NULL

  results <- foreach::foreach(i = (1:length(gamma))) %dopar% {
    g <- gamma[i]
    outlier_detection(..., sign_level = g)
  }

  # name the elements: "gamma(gamma_i)"
  elenames <- paste("gamma", as.character(gamma), sep = "")
  names(results) <- elenames

  return(results)

}


#' Proportion test
#'
#' \code{proptest()} conducts a test whether the false outlier detection rate
#' (FODR) in the sample deviates significantly from its expected value
#' (population FODR) under the null hypothesis that there are no outliers in the
#' sample.
#'
#' @param robust2sls_object An object of class \code{"robust2sls"} or a
#' list of such objects.
#' @param alpha A numeric value between 0 and 1 representing the significance
#' level of the test.
#' @param iteration An integer >= 0 or the character "convergence" that
#' determines which iteration is used for the test.
#' @param one_sided A logical value whether a two-sided test (\code{FALSE})
#'   should be conducted or a one-sided test (\code{TRUE}) that rejects only
#'   when the false outlier detection rate is above its expected value.
#'
#' @details See \code{\link[=outlier_detection]{outlier_detection()}} and
#' \code{\link[=multi_cutoff]{multi_cutoff()}} for creating an object of class
#' \code{"robust2sls"} or a list thereof.
#'
#' @return \code{proptest()} returns a data frame with the iteration (m) to be
#' tested, the actual iteration that was tested (generally coincides with the
#' iteration that was specified to be tested but is the convergent iteration if
#' the fixed point is tested), the setting of the probability of exceeding the
#' cut-off (gamma), the type of t-test (one- or two-sided), the value of the
#' test statistic, its p-value, the significance level \code{alpha}, and the
#' decision. The number of rows of the data frame corresponds to the length of
#' the argument \code{robust2sls_object}.
#'
#' @export

proptest <- function(robust2sls_object, alpha, iteration, one_sided = FALSE) {

  # check input values
  if (!(identical(class(robust2sls_object), "robust2sls") | identical(class(robust2sls_object), "list"))) {
    stop("Argument 'robust2sls_object' must be of class 'robust2sls' or a list of such objects.")
  }
  classes <- sapply(X = robust2sls_object, FUN = class)
  if (identical(class(robust2sls_object), "list") && !all(classes == "robust2sls")) {
    stop("Argument 'robust2sls_object' is a list but not all elements have class 'robust2sls'.")
  }
  if (!is.numeric(alpha) | !identical(length(alpha), 1L)) {
    stop("Argument 'alpha' must be a numeric value of length one.")
  }
  if (!(alpha >= 0 & alpha <= 1)) {
    stop("Argument 'alpha' must be between 0 and 1.")
  }
  if (!(is.numeric(iteration) | identical(iteration, "convergence"))) {
    stop("Argument 'iteration' must either be a numeric or the string 'convergence'.")
  }
  if (is.numeric(iteration) & !identical(length(iteration), 1L)) {
    stop("Argument 'iteration' must be of length one.")
  }
  if (is.numeric(iteration) && !((iteration %% 1) == 0)) {
    stop("Argument 'iteration' must be an integer.")
  }
  if (!is.logical(one_sided) | !identical(length(one_sided), 1L)) {
    stop("Argument 'one_sided' must be a logical value of length one.")
  }

  # define auxiliary function
  proptest_fun <- function(r, iter) {
    if (iter == "convergence") {
      final_iter <- r$cons$iterations$actual
      actual_prop <- outliers_prop(robust2sls_object = r, iteration = final_iter)
    } else {
      actual_prop <- outliers_prop(robust2sls_object = r, iteration = iter)
    }
    expected_prop <- r$cons$sign_level
    n <- sum(nonmissing(data = r$cons$data, formula = r$cons$formula))
    if (r$cons$reference == "normal") { # then no need to estimate parameters
      est_param <- NULL
    } else { # nocov start
      est_param <- estimate_param_null(robust2SLS_object = r)
    } # nocov end
    est_avar <- gauge_avar(ref_dist = r$cons$reference,
                           sign_level = expected_prop,
                           initial_est = r$cons$initial$estimator,
                           iteration = iter,
                           parameters = est_param,
                           split = r$cons$initial$split)
    t <- sqrt(n)*(actual_prop - expected_prop)/sqrt(est_avar)
    return(t)
  }

  # check that selected iteration is actually available in robust2sls_object(s)
  if (identical(class(robust2sls_object), "robust2sls")) {
    iter_set <- robust2sls_object$cons$iterations$setting
    if (identical(iteration, "convergence")) {
      iter_act <- robust2sls_object$cons$iterations$actual
    } else {
      iter_act <- iteration
    }
    iter_max <- robust2sls_object$cons$convergence$max_iter
    gamma <- robust2sls_object$cons$sign_level
    t <- proptest_fun(r = robust2sls_object, iter = iteration)
    if (isTRUE(one_sided)) {
      type <- "one-sided"
      pval <- stats::pnorm(q = t, mean = 0, sd = 1, lower.tail = FALSE)
    } else {
      type <- "two-sided"
      pval <- 2 * stats::pnorm(q = abs(t), mean = 0, sd = 1, lower.tail = FALSE)
    }
  } else if (identical(class(robust2sls_object), "list")) {
    # return list because could be different types (numeric or character for convergence)
    iter_set <- sapply(X = robust2sls_object, FUN = function(x) x$cons$iterations$setting)
    if (identical(iteration, "convergence")) {
      iter_act <- sapply(X = robust2sls_object, FUN = function(x) x$cons$iterations$actual)
    } else {
      iter_act <- iteration
    }
    iter_max <- sapply(X = robust2sls_object, FUN = function(x) x$cons$convergence$max_iter)
    # return vector because know it must be numeric
    gamma <- sapply(X = robust2sls_object, FUN = function(x) x$cons$sign_level)
    t <- sapply(X = robust2sls_object, FUN = proptest_fun, iter = iteration)
    if (isTRUE(one_sided)) {
      type <- "one-sided"
      pval <- sapply(X = t, FUN = function(x) stats::pnorm(q = x, mean = 0, sd = 1, lower.tail = FALSE))
    } else {
      type <- "two-sided"
      pval <- sapply(X = t, FUN = function(x) 2 * stats::pnorm(q = abs(x), mean = 0, sd = 1, lower.tail = FALSE))
    }
  # below is never reached b/c of first input check
  } else {  # nocov start
    stop("Argument 'robust2sls_object' invalid input type.")
  } # nocov end

  out <- data.frame(iter_test = iteration, iter_act = iter_act, gamma = gamma, t = t, type = type,
                    pval = pval, alpha = alpha, reject = (pval <= alpha))

  return(out)

}

#' Count test
#'
#' \code{counttest()} conducts a test whether the number of detected outliers
#' deviates significantly from the expected number of outliers under the null
#' hypothesis that there are no outliers in the sample.
#' @inheritParams proptest
#' @param one_sided A logical value whether a two-sided test (\code{FALSE})
#'   should be conducted or a one-sided test (\code{TRUE}) that rejects only
#'   when the number of detected outliers is above the expected number.
#' @param tsmethod A character specifying the method for calculating two-sided
#' p-values. Ignored for one-sided test.
#'
#' @details See \code{\link[=outlier_detection]{outlier_detection()}} and
#' \code{\link[=multi_cutoff]{multi_cutoff()}} for creating an object of class
#' \code{"robust2sls"} or a list thereof.
#'
#' See \code{\link[exactci:poisson.exact]{exactci::poisson.exact()}} for the
#' different methods of calculating two-sided p-values.
#'
#' @return \code{counttest()} returns a data frame with the iteration (m) to be
#'   tested, the actual iteration that was tested (generally coincides with the
#'   iteration that was specified to be tested but is the convergent iteration
#'   if the fixed point is tested), the setting of the probability of exceeding
#'   the cut-off (gamma), the number of detected outliers, the expected number
#'   of outliers under the null hypothesis that there are no outliers, the type
#'   of test (one- or two-sided), the p-value, the significance level
#'   \code{alpha}, the decision, and which method was used to calculate
#'   (two-sided) p-values. The number of rows of the data frame corresponds to
#'   the length of the argument \code{robust2sls_object}.
#'
#' @export

counttest <- function(robust2sls_object, alpha, iteration, one_sided = FALSE,
                      tsmethod = c("central", "minlike", "blaker")) {

  tsmethod <- match.arg(tsmethod)

  # check input values
  if (!(identical(class(robust2sls_object), "robust2sls") | identical(class(robust2sls_object), "list"))) {
    stop("Argument 'robust2sls_object' must be of class 'robust2sls' or a list of such objects.")
  }
  classes <- sapply(X = robust2sls_object, FUN = class)
  if (identical(class(robust2sls_object), "list") && !all(classes == "robust2sls")) {
    stop("Argument 'robust2sls_object' is a list but not all elements have class 'robust2sls'.")
  }
  if (!is.numeric(alpha) | !identical(length(alpha), 1L)) {
    stop("Argument 'alpha' must be a numeric value of length one.")
  }
  if (!(alpha >= 0 & alpha <= 1)) {
    stop("Argument 'alpha' must be between 0 and 1.")
  }
  if (!(is.numeric(iteration) | identical(iteration, "convergence"))) {
    stop("Argument 'iteration' must either be a numeric or the string 'convergence'.")
  }
  if (is.numeric(iteration) & !identical(length(iteration), 1L)) {
    stop("Argument 'iteration' must be of length one.")
  }
  if (is.numeric(iteration) && !((iteration %% 1) == 0)) {
    stop("Argument 'iteration' must be an integer.")
  }
  if (!is.logical(one_sided) | !identical(length(one_sided), 1L)) {
    stop("Argument 'one_sided' must be a logical value of length one.")
  }
  # below not necessary, automatically tested via match.arg()
  # if (!is.character(tsmethod) | !identical(length(tsmethod), 1L)) {
  #   stop("Argument 'tsmethod' must be a character vector of length one.")
  # }
  # if (!(tsmethod %in% c("central", "minlike", "blaker"))) {
  #   stop("Argument 'tsmethod' must be one of 'central', 'minlike', or 'blaker'.")
  # }

  # define auxiliary function
  counttest_fun <- function(r, iter, type, tsmethod) {
    if (isTRUE(type)) {
      alt <- "greater"
    } else {
      alt <- "two.sided"
    }
    if (iter == "convergence") {
      iter_act <- r$cons$iterations$actual
    } else {
      iter_act <- iter
    }
    num_act <- outliers(robust2sls_object = r, iteration = iter_act)
    n <- sum(nonmissing(data = r$cons$data, formula = r$cons$formula))
    gamma <- r$cons$sign_level
    num_exp <- gamma * n
    pvalue <- exactci::poisson.exact(x = num_act, r = num_exp,
                                     alternative = alt,
                                     tsmethod = tsmethod)$p.value

    if (identical(alt, "greater")) {
      alt2 <- "one-sided"
    } else {
      alt2 <- "two-sided"
    }

    out <- c(iter, iter_act, gamma, num_act, num_exp, alt2, pvalue)
    out <- data.frame(iter_test = iter, iter_act = iter_act, gamma = gamma,
                      num_act = num_act, num_exp = num_exp, type = alt2,
                      pval = pvalue)
    return(out)

  }

  if (identical(class(robust2sls_object), "robust2sls")) {
    result <- counttest_fun(r = robust2sls_object, iter = iteration,
                            type = one_sided, tsmethod = tsmethod)
  } else if (identical(class(robust2sls_object), "list")) {
    iter_max <- sapply(X = robust2sls_object, FUN = function(x) x$cons$convergence$max_iter)
    result <- do.call(rbind, lapply(X = robust2sls_object, FUN = counttest_fun, iter = iteration,
                                    type = one_sided, tsmethod = tsmethod))
  # part below never reached because of first input check
  } else { # nocov start
    stop("Argument 'robust2sls_object' invalid input type.")
  } # nocov end

  colnames(result) <- c("iter_test", "iter_act", "gamma", "num_act", "num_exp",
                        "type", "pval")
  result <- as.data.frame(result)
  result <- cbind(result, data.frame(alpha = alpha))
  result <- cbind(result, reject = (result$pval <= result$alpha))
  # add tsmethod as column of output (value NA if one-sided)
  result <- cbind(result, tsmethod = if (isFALSE(one_sided)) {tsmethod} else {NA_character_})

  # add attribute that stores p-value method if two-sided
  if (isFALSE(one_sided)) {
    attr(result, which = "tsmethod") <- tsmethod
  }

  return(result)

}

#' Creates a vector of the centered FODR across different cut-offs
#'
#' \code{multi_cutoff_to_fodr_vec()} takes a list of \code{"robust2sls"} objects
#' and returns a vector of the centered FODR (sample - expected) for different
#' values of the cut-off c (equivalently gamma):
#' \loadmathjax
#' \mjdeqn{ \sqrt{n}(\widehat{\gamma_{c}} - \gamma_{c}) }{sqrt(n)(gamma_hat - gamma)}
#'
#' @param robust2sls_object A list of \code{"robust2sls"} objects.
#' @inheritParams proptest
#'
#' @details See \code{\link[=outlier_detection]{outlier_detection()}} and
#' \code{\link[=multi_cutoff]{multi_cutoff()}} for creating an object of class
#' \code{"robust2sls"} or a list thereof.
#'
#' @return A numeric vector of the centered FODR values.
#'
#' @keywords internal
#' @import mathjaxr

multi_cutoff_to_fodr_vec <- function(robust2sls_object, iteration) {

  if (!identical(class(robust2sls_object), "list")) {
    stop("Argument 'robust2sls_object' must be a list of 'robust2sls' objects.")
  }
  classes <- sapply(X = robust2sls_object, FUN = class)
  if (is.list(robust2sls_object) && !all(classes == "robust2sls")) {
    stop("Argument 'robust2sls_object' must be a list of 'robust2sls' objects.")
  }
  if (!(is.numeric(iteration) | identical(iteration, "convergence"))) {
    stop("Argument 'iteration' must be numeric or the string 'convergence'.")
  }
  if (is.numeric(iteration)) {
    if (!((iteration %% 1) == 0) | iteration < 0) {
      stop("Argument 'iteration' must be an integer >= 0 if numeric.")
    }
  }

  centered_fodr <- function(r, iteration) {
    if (identical(iteration, "convergence")) {
      iter_act <- r$cons$iterations$actual
    } else {
      iter_act <- iteration
    }
    n <- sum(nonmissing(data = r$cons$data, formula = r$cons$formula))
    gamma_hat <- outliers_prop(robust2sls_object = r, iteration = iter_act)
    gamma <- r$cons$sign_level
    out <- sqrt(n) * (gamma_hat - gamma)
    return(out)
  }

  vec <- sapply(X = robust2sls_object, FUN = centered_fodr, iteration = iteration)
  return(vec)

}


#' Scaling sum proportion test across different cut-offs
#'
#' \code{sumtest()} uses the estimations across several cut-offs to test whether
#' the sum of the deviations between sample and population FODR differ
#' significantly from its expected value.
#' \loadmathjax
#' \mjdeqn{\sum_{k = 1}^{K} \sqrt{n}(\widehat{\gamma}_{c_{k}} - \gamma_{c_{k}}) }{sum k = (1 to K) sqrt(n) (gammahat_{c_{k}} - gamma_{c{k}})}
#'
#' @param robust2sls_object A list of \code{"robust2sls"} objects.
#' @inheritParams proptest
#'
#' @return \code{sumtest()} returns a data frame with one row storing the
#' iteration that was tested, the value of the test statistic (t-test), the
#' type of the test (one- or two-sided), the corresponding p-value, the
#' significance level, and whether the null hypothesis is rejected. The data
#' frame also contains an attribute named \code{"gammas"} that records which
#' gammas determining the different cut-offs were used in the scaling sum test.
#'
#' @export
#' @import mathjaxr

sumtest <- function(robust2sls_object, alpha, iteration, one_sided = FALSE) {

  # input checks
  if (!identical(class(robust2sls_object), "list")) {
    stop("Argument 'robust2sls_object' must be a list of 'robust2sls' objects.")
  }
  classes <- sapply(X = robust2sls_object, FUN = class)
  if (is.list(robust2sls_object) && !all(classes == "robust2sls")) {
    stop("Argument 'robust2sls_object' must be a list of 'robust2sls' objects.")
  }
  if (identical(length(robust2sls_object), 1L)) {
    stop("sumtest() requires several different cutoffs. See proptest() and counttest() for single-value tests.")
  }
  if (!is.numeric(alpha) | !identical(length(alpha), 1L)) {
    stop("Argument 'alpha' must be a numeric value of length one.")
  }
  if (!(alpha >= 0 & alpha <= 1)) {
    stop("Argument 'alpha' must be between 0 and 1.")
  }
  if (!(is.numeric(iteration) | identical(iteration, "convergence"))) {
    stop("Argument 'iteration' must either be a numeric or the string 'convergence'.")
  }
  if (is.numeric(iteration) & !identical(length(iteration), 1L)) {
    stop("Argument 'iteration' must be of length one.")
  }
  if (is.numeric(iteration) && !((iteration %% 1) == 0)) {
    stop("Argument 'iteration' must be an integer.")
  }
  if (!is.logical(one_sided) | !identical(length(one_sided), 1L)) {
    stop("Argument 'one_sided' must be a logical value of length one.")
  }

  # extract settings of the outlier detection
  ref <- robust2sls_object[[1]]$cons$reference
  init <- robust2sls_object[[1]]$cons$initial$estimator
  split <- robust2sls_object[[1]]$cons$initial$split # could be NULL
  if (robust2sls_object[[1]]$cons$reference == "normal") {
    param_est <- NULL
  } else { # nocov start
    param_est <- estimate_param_null(robust2SLS_object = robust2sls_object[[1]])
  } # nocov end

  # extract which gammas are in the models
  gammas <- sapply(X = robust2sls_object, FUN = function(x) x$cons$sign_level)

  # turn list of robust2sls_objects to vector of the scaled FODR deviations
  vec <- multi_cutoff_to_fodr_vec(robust2sls_object = robust2sls_object,
                                  iteration = iteration)

  # create value of the test statistic
  test_value <- sum(vec)

  # calculate the asymptotic variance of the test statistic
  # technically can rely completely on gauge_covar and sum all entries
  # as internal check, also calculate explicitly with gauge_avar
  # variance part
  var_part <- sapply(X = gammas, FUN = gauge_avar, ref_dist = ref,
                     initial_est = init, iteration = iteration,
                     parameters = param_est, split = split)
  covar_part <- sapply(X = gammas, FUN = function(x) sapply(X = gammas,
                                                            FUN = gauge_covar,
                                                            ref_dist = "normal",
                                                            initial_est = init,
                                                            iteration = iteration,
                                                            parameters = param_est,
                                                            split = split,
                                                            sign_level1 = x))

  # covariance matrix should be symmetric, so implement an internal check (fail-save)
  if (!isSymmetric.matrix(covar_part)) {
    stop("Internal error. Covariance matrix of vector should be symmetric.") # nocov
  }

  test_var1 <- sum(var_part) + 2 * sum(covar_part[upper.tri(covar_part,
                                                            diag = FALSE)])
  test_var2 <- sum(covar_part)

  if (!isTRUE(all.equal(test_var1, test_var2))) { # (fail-save)
    stop("Internal error. Two ways of calculating avar should coincide.") # nocov
  }

  # calculate test statistic and other outputs to be returned
  t <- test_value / sqrt(test_var2)
  if (isTRUE(one_sided)) {
    type <- "one-sided"
    pval <- stats::pnorm(q = t, mean = 0, sd = 1, lower.tail = FALSE)
  } else {
    type <- "two-sided"
    pval <- 2 * stats::pnorm(q = abs(t), mean = 0, sd = 1, lower.tail = FALSE)
  }

  out <- data.frame(iter_test = iteration, t = t, type = type,
                    pval = pval, alpha = alpha, reject = (pval <= alpha))
  names(gammas) <- NULL
  attr(x = out, which = "gammas") <- gammas

  return(out)

}

#' Supremum proportion test across different cut-offs
#'
#' \code{suptest()} uses the estimations across several cut-offs to test whether
#' the supremum/maximum of the deviations between sample and population FODR
#' differs significantly from its expected value.
#' \loadmathjax
#' \mjdeqn{ \sup_{c} |\sqrt{n}(\widehat{\gamma}_{c} - \gamma_{c})| }{sup_c |sqrt(n)(gammahat_c - gamma_c)|}
#'
#' @inheritParams sumtest
#' @inheritParams test_cpv
#' @param R An integer specifying the number of replications for simulating the
#' distribution of the test statistic.
#'
#' @return \code{suptest()} returns a data frame with one row storing the
#' iteration that was tested, the value of the test statistic, the corresponding
#' p-value, the significance level, and whether the null hypothesis is rejected.
#' The data frame also contains two named attributes. The first attribute is
#' named \code{"gammas"} and records which gammas determining the different
#' cut-offs were used in the scaling sup test. The second attribute is named
#' \code{"critical"} and records the critical values corresponding to the
#' different quantiles in the limiting distribution that were specified in
#' \code{p}.
#'
#' @export
#' @import mathjaxr

suptest <- function(robust2sls_object, alpha, iteration, p = c(0.9, 0.95, 0.99),
                    R = 50000) {

  # input checks
  if (!identical(class(robust2sls_object), "list")) {
    stop("Argument 'robust2sls_object' must be a list of 'robust2sls' objects.")
  }
  classes <- sapply(X = robust2sls_object, FUN = class)
  if (is.list(robust2sls_object) && !all(classes == "robust2sls")) {
    stop("Argument 'robust2sls_object' must be a list of 'robust2sls' objects.")
  }
  if (identical(length(robust2sls_object), 1L)) {
    stop("suptest() requires several different cutoffs. See proptest() and counttest() for single-value tests.")
  }
  if (!is.numeric(alpha) | !identical(length(alpha), 1L)) {
    stop("Argument 'alpha' must be a numeric value of length one.")
  }
  if (!(alpha >= 0 & alpha <= 1)) {
    stop("Argument 'alpha' must be between 0 and 1.")
  }
  if (!(is.numeric(iteration) | identical(iteration, "convergence"))) {
    stop("Argument 'iteration' must either be a numeric or the string 'convergence'.")
  }
  if (is.numeric(iteration) & !identical(length(iteration), 1L)) {
    stop("Argument 'iteration' must be of length one.")
  }
  if (is.numeric(iteration) && !((iteration %% 1) == 0)) {
    stop("Argument 'iteration' must be an integer.")
  }
  if (!is.numeric(p)) {
    stop("Argument 'p' must be a numeric vector.")
  }
  if (!all(p >= 0 & p <= 1)) {
    stop("Argument 'p' can only contain elements between 0 and 1.")
  }

  # extract settings of the outlier detection
  ref <- robust2sls_object[[1]]$cons$reference
  init <- robust2sls_object[[1]]$cons$initial$estimator
  split <- robust2sls_object[[1]]$cons$initial$split # could be NULL
  if (robust2sls_object[[1]]$cons$reference == "normal") {
    param_est <- NULL
  } else { # nocov start
    param_est <- estimate_param_null(robust2SLS_object = robust2sls_object[[1]])
  } # nocov end

  # extract which gammas are in the models
  gammas <- sapply(X = robust2sls_object, FUN = function(x) x$cons$sign_level)

  # turn list of robust2sls_objects to vector of the scaled FODR deviations
  vec <- multi_cutoff_to_fodr_vec(robust2sls_object = robust2sls_object,
                                  iteration = iteration)

  # create value of the test statistic
  test_value <- max(abs(vec))

  # for critical value / p-value need to simulate the distribution of the sup
  # create var-cov matrix of the Gaussian process
  varcov <- sapply(X = gammas, FUN = function(x) sapply(X = gammas,
                                                        FUN = gauge_covar,
                                                        ref_dist = "normal",
                                                        initial_est = init,
                                                        iteration = iteration,
                                                        parameters = param_est,
                                                        split = split,
                                                        sign_level1 = x))

  sup_distr <- mvn_sup(n = R, mu = rep(0, length(gammas)), Sigma = varcov)

  res <- test_cpv(dist = sup_distr, teststat = test_value,
                  p = p)
  pvalue <- res$pval
  criticalvalues <- res$critical
  rej <- pvalue <= alpha

  out <- data.frame(iter_test = iteration, test_value = test_value,
                    pval = pvalue, alpha = alpha, reject = rej)
  names(gammas) <- NULL
  attr(x = out, which = "gammas") <- gammas
  attr(x = out, which = "critical") <- criticalvalues

  return(out)

}

#' Global test correcting for multiple hypothesis testing
#'
#' \code{globaltest()} uses several proportion or count tests with different
#' cut-offs to test a global hypothesis of no outliers using the Simes (1986)
#' procedure to account for multiple testing.
#'
#' @param tests A data frame that contains a column named \code{$pval}
#' containing the different p-values for different hypothesis tests, each
#' stored in a row.
#' @param global_alpha A numeric value representing the global significance
#' level.
#'
#' @seealso [proptest()], [counttest()]
#'
#' @details See
#' \href{https://academic.oup.com/biomet/article/73/3/751/250538}{Simes (1986)}.
#'
#' @return A list with three entries. The first entry named \code{$reject}
#'   contains the global rejection decision. The second entry named
#'   \code{$global_alpha} stores the global significance level. The third entry
#'   named \code{$tests} returns the input data frame \code{tests}, appended
#'   with two columns containing the adjusted significance level and respective
#'   rejection decision.
#'
#' @export

globaltest <- function(tests, global_alpha) {

  if (!is.data.frame(tests)) {
    stop("Argument 'tests' must be a data frame.")
  }
  if (!("pval" %in% colnames(tests))) {
    stop("Argument 'tests' must contain a column named 'pval' containing the p-values of the individual tests.")
  }
  if (!(is.numeric(global_alpha)) | !identical(length(global_alpha), 1L)) {
    stop("Argument 'global_alpha' must be a numeric value of length one.")
  }
  if (!(global_alpha >= 0 & global_alpha <= 1)) {
    stop("Argument 'global_alpha' must be between 0 and 1.")
  }

  # calculate decisions based on Simes procedure
  pvalues <- tests$pval
  simes_res <- simes(pvals = pvalues, alpha = global_alpha)

  # add column that saves the original order
  colno <- NCOL(tests)
  tests$id <- 1:NROW(tests)
  # sort "tests" by p-value, as the simes() function does
  tests <- tests[order(tests$pval), ]
  tests$alpha_adj <- simes_res$details$alpha_adj
  tests$reject_adj <- simes_res$details$reject_adj
  tests <- tests[order(tests$id), ]
  tests <- tests[, -(colno+1)]

  out <- list(reject = simes_res$reject, global_alpha = global_alpha,
              tests = tests)
  return(out)

}

Try the robust2sls package in your browser

Any scripts or data that you put into this service are public.

robust2sls documentation built on Jan. 11, 2023, 5:13 p.m.