# R/basis.R In cmstatr: Statistical Methods for Composite Material Data

#### Documented in basis_anovabasis_hk_extbasis_lognormalbasis_nonpara_large_samplebasis_normalbasis_pooled_cvbasis_pooled_sdbasis_weibullglance.basishk_ext_zhk_ext_z_j_optk_factor_normalnonpara_binomial_rank

#' Calculate k factor for basis values (\eqn{kB}, \eqn{kA}) with normal
#' distribution
#'
#' @description
#' The factors returned by this function are used when calculating basis
#' values (one-sided confidence bounds) when the data are normally
#' distributed. The basis value will
#' be equal to \eqn{\bar{x} - k s}{x_bar - k s},
#' where \eqn{\bar{x}}{x_bar} is the sample mean,
#' \eqn{s} is the sample standard deviation and \eqn{k} is the result
#' of this function.
#' This function is internally used by [basis_normal()] when
#' computing basis values.
#'
#' @param n the number of observations (i.e. coupons)
#' @param p the desired content of the tolerance bound.
#'          Should be 0.90 for B-Basis and 0.99 for A-Basis
#' @param conf confidence level. Should be 0.95 for both A- and B-Basis
#'
#' @details
#' This function calculates the k factors used when determining A- and
#' B-Basis values for normally distributed data. To get \eqn{kB}, set
#' the content of the tolerance bound to p = 0.90 and
#' the confidence level to conf = 0.95. To get \eqn{kA}, set
#' p = 0.99 and conf = 0.95. While other tolerance bound
#' contents and confidence levels may be computed, they are infrequently
#' needed in practice.
#'
#' The k-factor is calculated using equation 2.2.3 of
#' Krishnamoorthy and Mathew (2008).
#'
#' This function has been validated against the \eqn{kB} tables in
#' CMH-17-1G for each value of \eqn{n} from \eqn{n = 2} to \eqn{n = 95}.
#' It has been validated against the \eqn{kA} tables in CMH-17-1G for each
#' value of \eqn{n} from \eqn{n = 2} to \eqn{n = 75}. Larger values of \eqn{n}
#' also match the tables in CMH-17-1G, but R
#' emits warnings that "full precision may not have been achieved." When
#' validating the results of this function against the tables in CMH-17-1G,
#' the maximum allowable difference between the two is 0.002. The tables in
#' CMH-17-1G give values to three decimal places.
#'
#' Meeker, et. al. (2017).
#'
#' @return the calculated factor
#'
#' @references
#' K. Krishnamoorthy and T. Mathew, Statistical Tolerance Regions: Theory,
#' Applications, and Computation. Hoboken: John Wiley & Sons, 2008.
#'
#' W. Meeker, G. Hahn, and L. Escobar, Statistical Intervals: A Guide
#' for Practitioners and Researchers, Second Edition.
#' Hoboken: John Wiley & Sons, 2017.
#'
#' “Composite Materials Handbook, Volume 1. Polymer Matrix Composites
#' Guideline for Characterization of Structural Materials,” SAE International,
#' CMH-17-1G, Mar. 2012.
#'
#' @seealso
#' [basis_normal()]
#'
#' @examples
#' kb <- k_factor_normal(n = 10, p = 0.9, conf = 0.95)
#' print(kb)
#'
#' ##  2.35464
#'
#' # This can be used to caclulate the B-Basis if
#' # the sample mean and sample standard deviation
#' # is known, and data is assumed to be normally
#' # distributed
#'
#' sample_mean <- 90
#' sample_sd <- 5.2
#' print("B-Basis:")
#' print(sample_mean - sample_sd * kb)
#'
#' ##  B-Basis:
#' ##  77.75587
#'
#' @importFrom stats qnorm qt
#'
#' @export
k_factor_normal <- function(n, p = 0.90, conf = 0.95) {
z <- qnorm(p)
suppressWarnings(
t <- qt(conf, df = n - 1, ncp = z * sqrt(n))
)
return(t / sqrt(n))
}

#' Calculate basis values
#'
#' @description
#' Calculate the basis value for a given data set. There are various functions
#' to calculate the basis values for different distributions.
#' The basis value is the lower one-sided tolerance bound of a certain
#' see Meeker, et. al. (2017).
#' For B-Basis, set the content of tolerance bound to \eqn{p=0.90} and
#' the confidence level to \eqn{conf=0.95}; for A-Basis, set \eqn{p=0.99} and
#' \eqn{conf=0.95}. While other tolerance bound
#' contents and confidence levels may be computed, they are infrequently
#' needed in practice.
#'
#' These functions also perform some automated diagnostic
#' tests of the data prior to calculating the basis values. These diagnostic
#' tests can be overridden if needed.
#'
#' @param data a data.frame
#' @param x the variable in the data.frame for which to find the basis value
#' @param batch the variable in the data.frame that contains the batches.
#' @param groups the variable in the data.frame representing the groups
#' @param p the content of the tolerance bound. Should be 0.90 for B-Basis
#'          and 0.99 for A-Basis
#' @param conf confidence level Should be 0.95 for both A- and B-Basis
#' @param override a list of names of diagnostic tests to override,
#'                 if desired. Specifying "all" will override all diagnostic
#'                 tests applicable to the current method.
#' @param modcv a logical value indicating whether the modified CV approach
#'              should be used. Only applicable to pooling methods.
#' @param method the method for Hanson--Koopmans nonparametric basis values.
#'               should be "optimum-order" for B-Basis and "woodward-frawley"
#'               for A-Basis.
#'
#' @details
#' data is an optional argument. If data is given, it should
#' be a
#' data.frame (or similar object). When data is specified, the
#' value of x is expected to be a variable within data. If
#' data is not specified, x must be a vector.
#'
#' When modcv=TRUE is set, which is only applicable to the
#' pooling methods,
#' the data is first modified according to the modified coefficient
#' of variation (CV)
#' rules. This modified data is then used when both calculating the
#' basis values and
#' also when performing the diagnostic tests. The modified CV approach
#' is a way of
#' adding extra variance to datasets with unexpectedly low variance.
#'
#' basis_normal calculate the basis value by subtracting \eqn{k} times
#' the standard deviation from the mean. \eqn{k} is given by
#' the function [k_factor_normal()]. The equations in
#' Krishnamoorthy and Mathew (2008) are used.
#' basis_normal also
#' performs a diagnostic test for outliers (using
#' [maximum_normed_residual()])
#' and a diagnostic test for normality (using
#' [anderson_darling_normal()]).
#' If the argument batch is given, this function also performs
#' a diagnostic test for outliers within
#' each batch (using [maximum_normed_residual()])
#' and a diagnostic test for between batch variability (using
#' [ad_ksample()]). The argument batch is only used
#' for these diagnostic tests.
#'
#' basis_lognormal calculates the basis value in the same way
#' that basis_normal does, except that the natural logarithm of the
#' data is taken.
#'
#' basis_lognormal function also performs
#' a diagnostic test for outliers (using
#' [maximum_normed_residual()])
#' and a diagnostic test for normality (using
#' [anderson_darling_lognormal()]).
#' If the argument batch is given, this function also performs
#' a diagnostic test for outliers within
#' each batch (using [maximum_normed_residual()])
#' and a diagnostic test for between batch variability (using
#' [ad_ksample()]). The argument batch is only used
#' for these diagnostic tests.
#'
#' basis_weibull calculates the basis value for data distributed
#' according to a Weibull distribution. The confidence level for the
#' content requested is calculated using the conditional method, as
#' described in Lawless (1982) Section 4.1.2b. This has good agreement
#' with tables published in CMH-17-1G. Results differ between this function
#' and STAT17 by approximately 0.5\%.
#'
#' basis_weibull function also performs
#' a diagnostic test for outliers (using
#' [maximum_normed_residual()])
#' and a diagnostic test for normality (using
#' [anderson_darling_weibull()]).
#' If the argument batch is given, this function also performs
#' a diagnostic test for outliers within
#' each batch (using [maximum_normed_residual()])
#' and a diagnostic test for between batch variability (using
#' [ad_ksample()]). The argument batch is only used
#' for these diagnostic tests.
#'
#' basis_hk_ext calculates the basis value using the Extended
#' Hanson--Koopmans method, as described in CMH-17-1G and Vangel (1994).
#' For nonparametric distributions, this function should be used for samples
#' up to n=28 for B-Basis and up to \eqn{n=299} for A-Basis.
#' This method uses a pair of order statistics to determine the basis value.
#' CMH-17-1G suggests that for A-Basis, the first and last order statistic
#' is used: this is called the "woodward-frawley" method in this package,
#' after the paper in which this approach is described (as referenced
#' by Vangel (1994)). For B-Basis, another approach is used whereby the
#' first and j-th order statistic are used to calculate the basis value.
#' In this approach, the j-th order statistic is selected to minimize
#' the difference between the tolerance limit (assuming that the order
#' statistics are equal to the expected values from a standard normal
#' distribution) and the population quantile for a standard normal
#' distribution. This approach is described in Vangel (1994). This second
#' method (for use when calculating B-Basis values) is called
#' "optimum-order" in this package.
#' The results of basis_hk_ext have been
#' verified against example results from the program STAT-17. Agreement is
#' typically well within 0.2%.
#'
#' Note that the implementation of hk_ext_z_j_opt changed after cmstatr
#' version 0.8.0. This function is used internally by basis_hk_ext
#' when method = "optimum-order". This implementation change may mean
#' that basis values computed using this method may change slightly
#' after version 0.8.0. However, both implementations seem to be equally
#' valid. See the included vignette
#' for a discussion of the differences between the implementation before
#' and after version 0.8.0, as well as the factors given in CMH-17-1G.
#' To access this vignette, run: vignette("hk_ext", package = "cmstatr")
#'
#' basis_hk_ext also performs
#' a diagnostic test for outliers (using
#' [maximum_normed_residual()])
#' and performs a pair of tests that the sample size and method selected
#' follow the guidance described above.
#' If the argument batch is given, this function also performs
#' a diagnostic test for outliers within
#' each batch (using [maximum_normed_residual()])
#' and a diagnostic test for between batch variability (using
#' [ad_ksample()]). The argument batch is only used
#' for these diagnostic tests.
#'
#' basis_nonpara_large_sample calculates the basis value
#' using the large sample method described in CMH-17-1G. This method uses
#' a sum of binomials to determine the rank of the ordered statistic
#' corresponding with the desired tolerance limit (basis value). Results
#' of this function have been verified against results of the STAT-17
#' program.
#'
#' basis_nonpara_large_sample also performs
#' a diagnostic test for outliers (using
#' [maximum_normed_residual()])
#' and performs a test that the sample size is sufficiently large.
#' If the argument batch is given, this function also performs
#' a diagnostic test for outliers within
#' each batch (using [maximum_normed_residual()])
#' and a diagnostic test for between batch variability (using
#' [ad_ksample()]). The argument batch is only used
#' for these diagnostic tests.
#'
#' basis_anova calculates basis values using the ANOVA method.
#' x specifies the data (normally strength) and groups
#' indicates the group corresponding to each observation. This method is
#' described in CMH-17-1G, but when the ratio of between-batch mean
#' square to the within-batch mean square is less than or equal
#' to one, the tolerance factor is calculated based on pooling the data
#' from all groups. This approach is recommended by Vangel (1992)
#' and by Krishnamoorthy and Mathew (2008), and is also implemented
#' by the software CMH17-STATS and STAT-17.
#' This function automatically performs a diagnostic
#' test for outliers within each group
#' (using [maximum_normed_residual()]) and a test for between
#' group variability (using [ad_ksample()]) as well as checking
#' that the data contains at least 5 groups.
#' This function has been verified against the results of the STAT-17 program.
#'
#' basis_pooled_sd calculates basis values by pooling the data from
#' several groups together. x specifies the data (normally strength)
#' and group indicates the group corresponding to each observation.
#' This method is described in CMH-17-1G and matches the pooling method
#' implemented in ASAP 2008.
#'
#' basis_pooled_cv calculates basis values by pooling the data from
#' several groups together. x specifies the data (normally strength)
#' and group indicates the group corresponding to each observation.
#' This method is described in CMH-17-1G.
#'
#' basis_pooled_sd and basis_pooled_cv both automatically
#' perform a number of diagnostic tests. Using
#' [maximum_normed_residual()], they check that there are no
#' outliers within each group and batch (provided that batch is
#' specified). They check the between batch variability using
#' [ad_ksample()]. They check that there are no outliers within
#' each group (pooling all batches) using
#' [maximum_normed_residual()]. They check for the normality
#' of the pooled data using [anderson_darling_normal()].
#' basis_pooled_sd checks for equality of variance of all
#' data using [levene_test()] and basis_pooled_cv
#' checks for equality of variances of all data after transforming it
#' using [normalize_group_mean()]
#' using [levene_test()].
#'
#' The object returned by these functions includes the named vector
#' diagnostic_results. This contains all of the diagnostic tests
#' performed. The name of each element of the vector corresponds with the
#' name of the diagnostic test. The contents of each element will be
#' "P" if the diagnostic test passed, "F" if the diagnostic test failed,
#' "O" if the diagnostic test was overridden and NA if the
#' diagnostic test was skipped (typically because an optional
#' argument was not supplied).
#'
#' The following list summarizes the diagnostic tests automatically
#' performed by each function.
#'
#' - basis_normal
#'   * outliers_within_batch
#'   * between_batch_variability
#'   * outliers
#'   * anderson_darling_normal
#' - basis_lognormal
#'   * outliers_within_batch
#'   * between_batch_variability
#'   * outliers
#'   * anderson_darling_lognormal
#' - basis_weibull
#'   * outliers_within_batch
#'   * between_batch_variability
#'   * outliers
#'   * anderson_darling_weibull
#' - basis_pooled_cv
#'   * outliers_within_batch
#'   * between_group_variability
#'   * outliers_within_group
#'   * pooled_data_normal
#'   * normalized_variance_equal
#' - basis_pooled_sd
#'   * outliers_within_batch
#'   * between_group_variability
#'   * outliers_within_group
#'   * pooled_data_normal
#'   * pooled_variance_equal
#' - basis_hk_ext
#'   * outliers_within_batch
#'   * between_batch_variability
#'   * outliers
#'   * sample_size
#' - basis_nonpara_large_sample
#'   * outliers_within_batch
#'   * between_batch_variability
#'   * outliers
#'   * sample_size
#' - basis_anova
#'   * outliers_within_group
#'   * equality_of_variance
#'   * number_of_groups
#'
#' @return an object of class basis
#' This object has the following fields:
#' - call the expression used to call this function
#' - distribution the distribution used (normal, etc.)
#' - p the value of \eqn{p} supplied
#' - conf the value of \eqn{conf} supplied
#' - modcv a logical value indicating whether the modified
#'   CV approach was used. Only applicable to pooling methods.
#' - data a copy of the data used in the calculation
#' - groups a copy of the groups variable.
#'   Only used for pooling and ANOVA methods.
#' - batch a copy of the batch data used for diagnostic tests
#' - modcv_transformed_data the data after the modified CV transformation
#' - override a vector of the names of diagnostic tests that
#'   were overridden. NULL if none were overridden
#' - diagnostic_results a named character vector containing the
#'   results of all the diagnostic tests. See the Details section for
#' - diagnostic_failures a vector containing any diagnostic tests
#'   that produced failures
#' - n the number of observations
#' - r the number of groups, if a pooling method was used.
#'   Otherwise it is NULL.
#' - basis the basis value computed. This is a number
#'   except when pooling methods are used, in which case it is a data.frame.
#'
#' @seealso [hk_ext_z_j_opt()]
#' @seealso [k_factor_normal()]
#' @seealso [transform_mod_cv()]
#' @seealso [maximum_normed_residual()]
#' @seealso [anderson_darling_normal()]
#' @seealso [anderson_darling_lognormal()]
#' @seealso [anderson_darling_weibull()]
#' @seealso [normalize_group_mean()]
#'
#' @references
#' J. F. Lawless, Statistical Models and Methods for Lifetime Data.
#' New York: John Wiley & Sons, 1982.
#'
#' “Composite Materials Handbook, Volume 1. Polymer Matrix Composites
#' Guideline for Characterization of Structural Materials,” SAE International,
#' CMH-17-1G, Mar. 2012.
#'
#' M. Vangel, “One-Sided Nonparametric Tolerance Limits,”
#' Communications in Statistics - Simulation and Computation,
#' vol. 23, no. 4. pp. 1137–1154, 1994.
#'
#' K. Krishnamoorthy and T. Mathew, Statistical Tolerance Regions: Theory,
#' Applications, and Computation. Hoboken: John Wiley & Sons, 2008.
#'
#' W. Meeker, G. Hahn, and L. Escobar, Statistical Intervals: A Guide
#' for Practitioners and Researchers, Second Edition.
#' Hoboken: John Wiley & Sons, 2017.
#'
#' M. Vangel, “New Methods for One-Sided Tolerance Limits for a One-Way
#' Balanced Random-Effects ANOVA Model,” Technometrics, vol. 34, no. 2.
#' Taylor & Francis, pp. 176–185, 1992.
#'
#' @examples
#' library(dplyr)
#'
#' # A single-point basis value can be calculated as follows
#' # in this example, three failed diagnostic tests are
#' # overridden.
#'
#' carbon.fabric %>%
#'   filter(test == "FC") %>%
#'   filter(condition == "RTD") %>%
#'   basis_normal(strength, batch,
#'                override = c("outliers",
#'                             "outliers_within_batch",
#'                             "anderson_darling_normal"))
#'
#' ## Call:
#' ## basis_normal(data = ., x = strength, batch = batch,
#' ##     override = c("outliers", "outliers_within_batch",
#' ##    "anderson_darling_normal"))
#' ##
#' ## Distribution:  Normal 	( n = 18 )
#' ## The following diagnostic tests were overridden:
#' ##     outliers,
#' ##     outliers_within_batch,
#' ##     anderson_darling_normal
#' ## B-Basis:   ( p = 0.9 , conf = 0.95 )
#' ## 76.94656
#'
#' # A set of pooled basis values can also be calculated
#' # using the pooled standard deviation method, as follows.
#' # In this example, one failed diagnostic test is overridden.
#' carbon.fabric %>%
#'   filter(test == "WT") %>%
#'   basis_pooled_sd(strength, condition, batch,
#'                   override = c("outliers_within_batch"))
#'
#' ## Call:
#' ## basis_pooled_sd(data = ., x = strength, groups = condition,
#' ##                 batch = batch, override = c("outliers_within_batch"))
#' ##
#' ## Distribution:  Normal - Pooled Standard Deviation 	( n = 54, r = 3 )
#' ## The following diagnostic tests were overridden:
#' ##     outliers_within_batch
#' ## B-Basis:   ( p = 0.9 , conf = 0.95 )
#' ## CTD  127.6914
#' ## ETW  125.0698
#' ## RTD  132.1457
#'
#' @name basis
NULL

new_basis <- function(
call,
distribution,
modcv,
p,
conf,
override,
data,
groups,
batch
) {
res <- list()
class(res) <- "basis"

res$call <- call res$distribution <- distribution
res$modcv <- modcv res$p <- p
res$conf <- conf res$data <- data
res$groups <- groups res$batch <- batch
res$modcv_transformed_data <- NA res$override <- override
res$diagnostic_results <- character(0L) res$diagnostic_failures <- character(0L)
res$n <- length(res$data)
res$r <- NA if (!is.null(groups) & !all(is.na(groups))) { res$r <- length(levels(as.factor(groups)))
}
res$basis <- NA return(res) } #' Glance at a basis object #' #' @description #' Glance accepts an object of type basis and returns a #' [tibble::tibble()] with #' one row of summaries for each basis value. #' #' Glance does not do any calculations: it just gathers the results in a #' tibble. #' #' @param x a basis object #' @param include_diagnostics a logical value indicating whether to include #' columns for diagnostic tests. Default FALSE. #' @param ... Additional arguments. Not used. Included only to match generic #' signature. #' #' #' @return #' A [tibble::tibble()] with the following #' columns: #' #' - p the the content of the tolerance bound. Normally 0.90 or 0.99 #' - conf the confidence level. Normally 0.95 #' - distribution a string representing the distribution assumed #' when calculating the basis value #' - modcv a logical value indicating whether the modified #' CV approach was used. Only applicable to pooling methods. #' - n the sample size #' - r the number of groups used in the calculation. This will #' be NA for single-point basis values #' - basis the basis value #' #' @details #' For the pooled basis methods (basis_pooled_cv and #' basis_pooled_sd), the [tibble::tibble()] #' returned by glance will have one row for each group included in #' the pooling. For all other basis methods, the resulting tibble #' will have a single row. #' #' If include_diagnostics=TRUE, there will be additional columns #' corresponding with the diagnostic tests performed. These column(s) will #' be of type character and will contain a "P" if the diagnostic test #' passed, a "F" if the diagnostic test failed, an "O" if the diagnostic #' test was overridden or NA if the test was not run (typically #' because an optional argument was not passed to the function that #' computed the basis value). #' #' #' @seealso #' [basis()] #' #' @examples #' set.seed(10) #' x <- rnorm(20, 100, 5) #' b <- basis_normal(x = x) #' glance(b) #' #' ## # A tibble: 1 x 7 #' ## p conf distribution modcv n r basis #' ## <dbl> <dbl> <chr> <lgl> <int> <lgl> <dbl> #' ## 1 0.9 0.95 Normal FALSE 20 NA 92.0 #' #' #' glance(b, include_diagnostics = TRUE) #' #' ## # A tibble: 1 x 11 #' ## p conf distribution modcv n r basis outliers_within… #' ## <dbl> <dbl> <chr> <lgl> <int> <lgl> <dbl> <chr> #' ## 1 0.9 0.95 Normal FALSE 20 NA 92.0 NA #' ## # … with 3 more variables: between_batch_variability <chr>, #' ## # outliers <chr>, anderson_darling_normal <chr> #' #' @method glance basis #' @importFrom tibble tibble #' #' @export glance.basis <- function(x, include_diagnostics = FALSE, ...) { # nolint res <- tibble::tibble( p = x$p,
conf = x$conf, distribution = x$distribution,
modcv = x$modcv, n = x$n,
r = x$r, basis = x$basis
)

if (include_diagnostics) {
for (dn in names(x$diagnostic_results)) { res[[dn]] <- x$diagnostic_results[[dn]]
}
}

res
}

if (!is.null(vec) & length(vec) > 0) {
return(paste0(
"    ",
paste(vec, collapse = ",\n    "),
"\n"
))
}
return("")
}

#' @export
print.basis <- function(x, ...) {
cat("\nCall:\n",
paste(deparse(x$call), sep = "\n", collapse = "\n"), "\n\n", sep = "") cat("Distribution: ", x$distribution, "\t")

cat("( n =", x$n) if (!is.null(x$r) & !all(is.na(x$r))) { cat(", r =", x$r)
}
cat(" )\n")

if (x$modcv == TRUE) { cat("Modified CV Approach Used", "\n") } cat(format_diagnostic_helper( "The following diagnostic tests failed:", x$diagnostic_failures)
)
cat(format_diagnostic_helper(
"The following diagnostic tests were overridden:",
x$override) ) if (x$conf == 0.95 & x$p == 0.9) { cat("B-Basis: ", " ( p =", x$p, ", conf =", x$conf, ")\n") } else if (x$conf == 0.95 & x$p == 0.99) { cat("A-Basis: ", " ( p =", x$p, ", conf =", x$conf, ")\n") } else { cat("Basis: ", " ( p =", x$p, ", conf =", x$conf, ")\n") } if (is.numeric(x$basis)) {
cat(x$basis, "\n") } else if (is.data.frame(x$basis)) {
col_width <- max(nchar(as.character(x$basis[["group"]]))) + 2 for (j in seq(along.with = x$basis$group)) { cat(format(x$basis[["group"]][j], width = col_width))
cat(x$basis[["value"]][j], "\n") } } else { stop("basis is an unexpected data type") # nocov } cat("\n") } single_point_rules <- list( outliers_within_batch = function(x, batch, ...) { group_mnr <- vapply(unique(batch), function(b) { x_group <- x[batch == b] mnr <- maximum_normed_residual(x = x_group) mnr$n_outliers == 0
}, FUN.VALUE = logical(1L))
ifelse(all(group_mnr), "",
paste0("Maximum normed residual test detected ",
"outliers within one or more batch"))
},
between_batch_variability = function(x, batch, ...) {
ifelse(!adk$reject_same_dist, "", paste0("Anderson-Darling k-Sample test indicates that ", "batches are drawn from different distributions")) }, outliers = function(x, ...) { mnr <- maximum_normed_residual(x = x) ifelse(mnr$n_outliers == 0, "",
paste0("Maximum normed residual test detected outliers ",
"within data"))
}
)

basis_normal_rules <- single_point_rules
basis_normal_rules[["anderson_darling_normal"]] <-
function(x, ...) {
ifelse(!ad$reject_distribution, "", paste0("Anderson-Darling test rejects hypothesis that data ", "is drawn from a normal distribution")) } #' @rdname basis #' @importFrom rlang enquo eval_tidy #' @importFrom stats sd #' @export basis_normal <- function(data = NULL, x, batch = NULL, p = 0.90, conf = 0.95, override = c()) { verify_tidy_input( df = data, x = x, c = match.call(), arg_name = "x") verify_tidy_input( df = data, x = batch, c = match.call(), arg_name = "batch") override <- process_overrides(override, basis_normal_rules) res <- new_basis( call = match.call(), distribution = "Normal", modcv = FALSE, p = p, conf = conf, override = override, data = eval_tidy(enquo(x), data), groups = NA, batch = eval_tidy(enquo(batch), data) ) res$diagnostic_results <- perform_checks(
basis_normal_rules, x = res$data, batch = res$batch, override = override
)
res$diagnostic_failures <- get_check_failure_names(res$diagnostic_results)

k <- k_factor_normal(n = res$n, p = p, conf = conf) cv <- sd(res$data) / mean(res$data) res$basis <- mean(res$data) * (1 - k * cv) return(res) } basis_lognormal_rules <- single_point_rules basis_lognormal_rules[["anderson_darling_lognormal"]] <- function(x, ...) { ad <- anderson_darling_lognormal(x = x) ifelse(!ad$reject_distribution, "",
paste0("Anderson-Darling test rejects hypothesis that data ",
"is drawn from a log-normal distribution"))
}

#' @rdname basis
#' @importFrom rlang enquo eval_tidy
#' @importFrom stats sd
#' @export
basis_lognormal <- function(data = NULL, x, batch = NULL, p = 0.90,
conf = 0.95, override = c()) {
verify_tidy_input(
df = data,
x = x,
c = match.call(),
arg_name = "x")

verify_tidy_input(
df = data,
x = batch,
c = match.call(),
arg_name = "batch")

override <- process_overrides(override, basis_lognormal_rules)

res <- new_basis(
call = match.call(),
distribution = "Lognormal",
modcv = FALSE,
p = p,
conf = conf,
override = override,
data = eval_tidy(enquo(x), data),
groups = NA,
batch = eval_tidy(enquo(batch), data)
)

res$diagnostic_results <- perform_checks( basis_lognormal_rules, x = res$data, batch = res$batch, override = override ) res$diagnostic_failures <- get_check_failure_names(res$diagnostic_results) k <- k_factor_normal(n = res$n, p = p, conf = conf)
res$basis <- exp(mean(log(res$data)) - k * sd(log(res$data))) return(res) } basis_weibull_rules <- single_point_rules basis_weibull_rules[["anderson_darling_weibull"]] <- function(x, ...) { ad <- anderson_darling_weibull(x = x) ifelse(!ad$reject_distribution, "",
paste0("Anderson-Darling test rejects hypothesis that data ",
"is drawn from a Weibull distribution"))
}

#' @rdname basis
#' @importFrom rlang enquo eval_tidy
#' @importFrom stats qweibull integrate pchisq uniroot
#' @importFrom MASS fitdistr
#'
#' @export
basis_weibull <- function(data = NULL, x, batch = NULL, p = 0.90,
conf = 0.95, override = c()) {
verify_tidy_input(
df = data,
x = x,
c = match.call(),
arg_name = "x")

verify_tidy_input(
df = data,
x = batch,
c = match.call(),
arg_name = "batch")

override <- process_overrides(override, basis_weibull_rules)

res <- new_basis(
call = match.call(),
distribution = "Weibull",
modcv = FALSE,
p = p,
conf = conf,
override = override,
data = eval_tidy(enquo(x), data),
groups = NA,
batch = eval_tidy(enquo(batch), data)
)

res$diagnostic_results <- perform_checks( basis_weibull_rules, x = res$data, batch = res$batch, override = override ) res$diagnostic_failures <- get_check_failure_names(res$diagnostic_results) dist <- fitdistr(res$data, "weibull")

alpha_hat <- dist$estimate[["scale"]] beta_hat <- dist$estimate[["shape"]]

# The data must be transformed to fit an extreme value distribution
data_evd <- log(res$data) u_hat <- log(alpha_hat) b_hat <- 1 / beta_hat # Next, find the ancillary statistic for the data a <- (data_evd - u_hat) / b_hat k_integrand <- function(z) { return( z ^ (res$n - 2) * exp((z - 1) * sum(a)) /
((1 / res$n) * sum(exp(a * z))) ^ res$n
)
}

k_inv <- integrate(Vectorize(k_integrand), lower = 0, upper = Inf)
k <- 1 / k_inv$value incomplete_gamma <- function(ki, xi) { pchisq(xi * 2, ki * 2) } h2 <- function(z) { return( k * z ^ (res$n - 2) * exp((z - 1) * sum(a)) /
((1 / res$n) * sum(exp(a * z))) ^ res$n
)
}

wp <- log(-log(p))

pr_zp_integrand <- function(z, t) {
h2(z) * incomplete_gamma(res$n, exp(wp + t * z) * sum(exp(a * z))) } pr_fcn <- function(t) { int_res <- integrate(Vectorize(function(z) pr_zp_integrand(z, t)), 0, Inf) return(int_res$value - conf)
}

res_root <- uniroot(pr_fcn, c(0, 10), extendInt = "yes")

res$basis <- exp(u_hat - res_root$root * b_hat)

return(res)
}

pooled_rules <- list(
outliers_within_batch = function(x, groups, batch, ...) {
group_batch_mnr <- vapply(unique(groups), function(g) {
batch_mnr <- vapply(unique(batch), function(b) {
x_group <- x[batch == b & groups == g]
if (length(x_group) == 0) {
return(TRUE)
}
mnr <- maximum_normed_residual(x = x_group)
return(mnr$n_outliers == 0) }, FUN.VALUE = logical(1L)) all(batch_mnr) }, FUN.VALUE = logical(1L)) ifelse(all(group_batch_mnr), "", paste0("Maximum normed residual test detected ", "outliers within one or more batch and group")) }, between_group_variability = function(x_ad, groups, batch, ...) { group_adk <- vapply(unique(groups), function(g) { x_group <- x_ad[groups == g] batch_group <- batch[groups == g] adk <- ad_ksample(x = x_group, groups = batch_group) !adk$reject_same_dist
}, FUN.VALUE = logical(1L))
paste0("Anderson-Darling k-Sample test indicates that ",
"batches are drawn from different distributions"))
},
outliers_within_group = function(x, groups, ...) {
group_mnr <- vapply(unique(groups), function(g) {
x_group <- x[groups == g]
mnr <- maximum_normed_residual(x = x_group)
return(mnr$n_outliers == 0) }, FUN.VALUE = logical(1L)) ifelse(all(group_mnr), "", paste0("Maximum normed residual test detected ", "outliers within one or more group")) }, pooled_data_normal = function(x_ad, groups, ...) { norm_x <- normalize_group_mean(x = x_ad, group = groups) ad <- anderson_darling_normal(x = norm_x) ifelse(!ad$reject_distribution, "",
paste0("Anderson-Darling test rejects hypothesis that pooled ",
"data is drawn from a normal distribution"))
}
)

pooled_rules_cv <- pooled_rules
pooled_rules_cv[["normalized_variance_equal"]] <- function(x, groups, ...) {
norm_x <- normalize_group_mean(x = x, group = groups)
lev <- levene_test(x = norm_x, groups = groups)
return(ifelse(!lev$reject_equal_variance, "", paste0("Levene's test rejected the hypothesis that the ", "variance of all groups are equal"))) } #' @rdname basis #' @importFrom rlang enquo eval_tidy #' @export basis_pooled_cv <- function(data = NULL, x, groups, batch = NULL, p = 0.90, conf = 0.95, modcv = FALSE, override = c()) { verify_tidy_input( df = data, x = x, c = match.call(), arg_name = "x") verify_tidy_input( df = data, x = groups, c = match.call(), arg_name = "groups") verify_tidy_input( df = data, x = batch, c = match.call(), arg_name = "batch") override <- process_overrides(override, pooled_rules_cv) res <- new_basis( call = match.call(), distribution = "Normal - Pooled CV", modcv = modcv, p = p, conf = conf, override = override, data = eval_tidy(enquo(x), data), groups = eval_tidy(enquo(groups), data), batch = eval_tidy(enquo(batch), data) ) if (modcv == TRUE) { res$modcv <- TRUE
res$modcv_transformed_data <- transform_mod_cv_grouped(res$data, res$groups) data_to_use <- res$modcv_transformed_data
x_ad <- transform_mod_cv_ad(res$data, res$groups, res$batch) } else { res$modcv <- FALSE
data_to_use <- res$data x_ad <- data_to_use } res$diagnostic_results <- perform_checks(
groups = res$groups, batch = res$batch, override = override
)
res$diagnostic_failures <- get_check_failure_names(res$diagnostic_results)

norm_data <- normalize_group_mean(data_to_use, res$groups) pooled_sd <- sqrt(sum((norm_data - 1) ^ 2) / (res$n - res$r)) basis <- vapply(levels(as.factor(res$groups)), function(g) {
nj <- length(data_to_use[res$groups == g]) z <- qnorm(p) suppressWarnings( kj <- qt(conf, df = res$n - res$r, ncp = z * sqrt(nj)) / sqrt(nj) ) xj_bar <- mean(data_to_use[res$groups == g])
xj_bar * (1 - kj * pooled_sd)
}, FUN.VALUE = numeric(1L))

res$basis <- data.frame(group = names(basis), value = basis) return(res) } pooled_rules_sd <- pooled_rules pooled_rules_sd[["pooled_variance_equal"]] <- function(x, groups, ...) { lev <- levene_test(x = x, groups = groups) return(ifelse(!lev$reject_equal_variance, "",
paste0("Levene's test rejected the hypothesis that the ",
"variance of all conditions are equal")))
}

#' @rdname basis
#' @importFrom rlang enquo eval_tidy
#' @export
basis_pooled_sd <- function(data = NULL, x, groups, batch = NULL,
p = 0.90, conf = 0.95, modcv = FALSE,
override = c()) {
verify_tidy_input(
df = data,
x = x,
c = match.call(),
arg_name = "x")

verify_tidy_input(
df = data,
x = groups,
c = match.call(),
arg_name = "groups")

verify_tidy_input(
df = data,
x = batch,
c = match.call(),
arg_name = "batch")

override <- process_overrides(override, pooled_rules_sd)

res <- new_basis(
call = match.call(),
distribution = "Normal - Pooled Standard Deviation",
modcv = modcv,
p = p,
conf = conf,
override = override,
data = eval_tidy(enquo(x), data),
groups = eval_tidy(enquo(groups), data),
batch = eval_tidy(enquo(batch), data)
)

if (modcv == TRUE) {
res$modcv <- TRUE res$modcv_transformed_data <- transform_mod_cv_grouped(res$data, res$groups)
data_to_use <- res$modcv_transformed_data x_ad <- transform_mod_cv_ad(res$data, res$groups, res$batch)
} else {
res$modcv <- FALSE data_to_use <- res$data
}

res$diagnostic_results <- perform_checks( pooled_rules_sd, x = data_to_use, x_ad = x_ad, groups = res$groups, batch = res$batch, override = override ) res$diagnostic_failures <- get_check_failure_names(res$diagnostic_results) pooled_sd <- sqrt( sum( vapply(levels(as.factor(res$groups)), function(g) {
xj_bar <- mean(data_to_use[res$groups == g]) sum((data_to_use[res$groups == g] - xj_bar) ^ 2)
}, FUN.VALUE = numeric(1L))
) / (res$n - res$r))

basis <- vapply(levels(as.factor(res$groups)), function(g) { nj <- length(data_to_use[res$groups == g])
z <- qnorm(p)
suppressWarnings(
kj <- qt(conf, df = res$n - res$r, ncp = z * sqrt(nj)) / sqrt(nj)
)
xj_bar <- mean(data_to_use[res$groups == g]) xj_bar - kj * pooled_sd }, FUN.VALUE = numeric(1L)) res$basis <- data.frame(group = names(basis), value = basis)

return(res)
}

#' @importFrom stats pbeta dbeta
hk_ext_h <- function(z, n, i, j, p) {
if (!(1 <= i && i < j && j <= n)) {
# This function is called internally, so i, j and n should always be valid
stop("Error: The condition 1 <= i < j <= n must be true.")  # nocov
}

# for z >= 1
qb <- pbeta(1 - p, j, n - j + 1)
int <- integrate(function(t) {
pbeta(((1 - p) / t) ^ (1 / z), i, j - i) * dbeta(t, j, n - j + 1)
},
lower = 1 - p, upper = 1)
if (int$message != "OK") { warning(int$message)  # nocov
}
qb + int$value } #' Calculate values related to Extended Hanson--Koopmans tolerance bounds #' #' @description #' Calculates values related to Extended Hanson--Koopmans tolerance bounds #' as described by Vangel (1994). #' #' @param n the sample size #' @param i the first order statistic (1 <= i < j) #' @param j the second order statistic (i < j <= n) #' @param p the content of the tolerance bound (normally 0.90 or 0.99) #' @param conf the confidence level (normally 0.95) #' #' @return #' For hk_ext_z, the return value is a numeric value representing #' the parameter z (denoted as k in CMH-17-1G). #' #' For hk_ext_z_j_opt, the return value is named list containing #' z and k. The former is the value of z, as defined by #' Vangel (1994), and the latter is the corresponding order statistic. #' #' @details #' Hanson (1964) presents a nonparametric method for determining #' tolerance bounds based on consecutive order statistics. #' Vangel (1994) extends this method using non-consecutive order statistics. #' #' The extended Hanson--Koopmans method calculates a tolerance bound #' (basis value) based on two order statistics and a weighting value #' z. The value of z is based on the sample size, which #' order statistics are selected, the desired content of the tolerance #' bond and the desired confidence level. #' #' The function hk_ext_z calculates the weighting variable z #' based on selected order statistics i and j. Based on this #' value z, the tolerance bound can be calculated as: #' #' \deqn{S = z X_{(i)} + (1 - z) X_{(j)}}{S = z X(i) + (1 - z) X(j)} #' #' Where \eqn{X_{(i)}}{X(i)} and \eqn{X_{(j)}}{X(j)} are the i-th #' and j-th ordered observation. #' #' The function hk_ext_z_j_opt determines the value of j and #' the corresponding value of z, assuming i=1. The value #' of j is selected such that the computed tolerance limit is #' nearest to the desired population quantile for a standard normal #' distribution when the order statistics are equal to the expected #' value of the order statistics for the standard normal distribution. #' #' @references #' M. Vangel, “One-Sided Nonparametric Tolerance Limits,” #' Communications in Statistics - Simulation and Computation, #' vol. 23, no. 4. pp. 1137–1154, 1994. #' #' D. L. Hanson and L. H. Koopmans, #' “Tolerance Limits for the Class of Distributions with Increasing #' Hazard Rates,” The Annals of Mathematical Statistics, #' vol. 35, no. 4. pp. 1561–1570, 1964. #' #' @examples #' # The factors from Table 1 of Vangel (1994) can be recreated #' # using the hk_ext_z function. For the sample size n=21, #' # the median is the 11th ordered observation. The factor #' # required for calculating the tolerance bound with a content #' # of 0.9 and a confidence level of 0.95 based on the median #' # and first ordered observation can be calculated as follows. #' hk_ext_z(n = 21, i = 1, j = 11, p = 0.9, conf = 0.95) #' #' ##  1.204806 #' #' # The hk_ext_z_j_opt function can be used to refine this value #' # of z by finding an optimum value of j, rather than simply #' # using the median. Here, we find that the optimal observation #' # to use is the 10th, not the 11th (which is the median). #' hk_ext_z_j_opt(n = 21, p = 0.9, conf = 0.95) #' #' ##$z
#' ##  1.217717
#' ##
#' ## $j #' ##  10 #' #' @seealso [basis_hk_ext()] #' #' @name hk_ext #' #' @rdname hk_ext #' @export hk_ext_z <- function(n, i, j, p, conf) { res <- uniroot( function(z) { hk_ext_h(z, n, i, j, p) - conf }, lower = 1, upper = 10, extendInt = "upX") z <- res$root
z
}

#' @rdname hk_ext
#' @export
hk_ext_z_j_opt <- function(n, p, conf) {
i <- 1  # i is always 1

if (n < 2) {
stop("n must be >= 2")
}

expected_order_statistic <- function(i, n) {
# ref: https://www.gwern.net/docs/statistics/order/1961-harter.pdf
int <- function(x) {
x * pnorm(-x) ^ (i - 1) * pnorm(x) ^ (n - i) * dnorm(x)
}
integral <- integrate(int, -Inf, Inf)
stopifnot(integral$message == "OK") factorial(n) / (factorial(n - i) * factorial(i - 1)) * integral$value
}

# Try all the allowable values of j to find the value of T
# that is closest to the population quantile for a standard
# normal distribution
j <- (i + 1):n
z_vals <- vapply(j, function(ji) {
hk_ext_z(n, i, ji, p, conf)
}, FUN.VALUE = numeric(1L))

err_vals <- vapply(seq(along.with = j), function(index) {
ji <- j[index]
zi <- z_vals[index]
e1 <- expected_order_statistic(i, n)
e2 <- expected_order_statistic(ji, n)
abs(zi * e1 + (1 - zi) * e2 - qnorm(p))
}, FUN.VALUE = numeric(1L))

list(
z = z_vals[err_vals == min(err_vals)],
j = j[err_vals == min(err_vals)]
)
}

basis_hk_ext_rules <- single_point_rules
basis_hk_ext_rules[["correct_method_used"]] <-
function(method, p, conf, ...) {
if (p == 0.90 & conf == 0.95) {
# B-Basis
return(ifelse(method == "optimum-order", "",
paste0("For B-Basis, the optimum order method ",
"should be used")))
} else if (p == 0.99 & conf == 0.95) {
# A-Basis
return(ifelse(method == "woodward-frawley", "",
paste0("For A-Basis, the Woodward-Frawley method ",
"should be used")))
} else {
return("")
}
}
basis_hk_ext_rules[["sample_size"]] <-
function(n, p, conf, ...) {
if (p == 0.90 & conf == 0.95) {
# B-Basis
return(ifelse(n <= 28, "",
paste0("For B-Basis, Hanson-Koopmans should only be ",
"used for samples of 28 or fewer observations")))
} else if (p == 0.99 & conf == 0.95) {
# A-Basis
return(ifelse(n <= 299, "",
paste0("For A-Basis, Hanson-Koopmans should only be ",
"used for samples of 299 or fewer observations")))
} else {
return("")
}
}

#' @rdname basis
#' @importFrom rlang enquo eval_tidy
#'
#' @export
basis_hk_ext <- function(data = NULL, x, batch = NULL, p = 0.90, conf = 0.95,
method = c("optimum-order", "woodward-frawley"),
override = c()) {
method <- match.arg(method)

verify_tidy_input(
df = data,
x = x,
c = match.call(),
arg_name = "x")

verify_tidy_input(
df = data,
x = batch,
c = match.call(),
arg_name = "batch")

override <- process_overrides(override, basis_hk_ext_rules)

res <- new_basis(
call = match.call(),
distribution = paste0(
"Nonparametric (Extended Hanson-Koopmans, ",
ifelse(method == "optimum-order", "optimum two-order-statistic method",
"Woodward-Frawley method"),
")"),
modcv = FALSE,
p = p,
conf = conf,
override = override,
data = eval_tidy(enquo(x), data),
groups = NA,
batch = eval_tidy(enquo(batch), data)
)

res$diagnostic_results <- perform_checks( basis_hk_ext_rules, x = res$data, batch = res$batch, n = res$n,
p = res$p, conf = res$conf, method = method, override = override
)
res$diagnostic_failures <- get_check_failure_names(res$diagnostic_results)

if (method == "optimum-order") {
zj <- hk_ext_z_j_opt(res$n, p, conf) z <- zj$z
j <- zj$j } else if (method == "woodward-frawley") { j <- res$n
z <- hk_ext_z(res$n, 1, j, p, conf) } else { stop("Invalid value for method.") # nocov } x_ordered <- sort(res$data)
res$basis <- x_ordered[j] * (x_ordered / x_ordered[j]) ^ z return(res) } #' Rank for distribution-free tolerance bound #' #' @description #' Calculates the rank order for finding distribution-free tolerance #' bounds for large samples. This function should only be used for #' computing B-Basis for samples larger than 28 or A-Basis for samples #' larger than 298. This function is used by #' [basis_nonpara_large_sample()]. #' #' @param n the sample size #' @param p the desired content for the tolerance bound #' @param conf the confidence level for the desired tolerance bound #' #' @return #' The rank corresponding with the desired tolerance bound #' #' @details #' This function uses the sum of binomial terms to determine the rank #' of the ordered statistic that corresponds with the desired tolerance #' limit. This approach does not assume any particular distribution. This #' approach is described by Guenther (1969) and by CMH-17-1G. #' #' The results of this function have been verified against the tables in #' CMH-17-1G and agreement was found for all sample sizes published in #' CMH-17-1G for both A- and B-Basis, as well as the sample sizes #' n+1 and n-1, where #' n is the sample size published in CMH-17-1G. #' #' The tables in CMH-17-1G purportedly list the smallest sample sizes #' for which a particular rank can be used. That is, for a sample size #' one less than the n published in the table, the next lowest rank #' would be used. In some cases, the results of this function disagree by a #' rank of one for sample sizes one less than the n published in the #' table. This indicates a disagreement in that sample size at which #' the rank should change. This is likely due to numerical #' differences in this function and the procedure used to generate the tables. #' However, the disagreement is limited to sample 6500 for A-Basis; no #' discrepancies have been identified for B-Basis. Since these sample sizes are #' uncommon for composite materials #' testing, and the difference between subsequent order statistics will be #' very small for samples this large, this difference will have no practical #' effect on computed tolerance bounds. #' #' @references #' W. Guenther, “Determination of Sample Size for Distribution-Free #' Tolerance Limits,” Jan. 1969. #' Available online: <https://www.duo.uio.no/handle/10852/48686> #' #' “Composite Materials Handbook, Volume 1. Polymer Matrix Composites #' Guideline for Characterization of Structural Materials,” SAE International, #' CMH-17-1G, Mar. 2012. #' #' @seealso [basis_nonpara_large_sample()] #' #' @examples #' nonpara_binomial_rank(n = 1693, p = 0.99, conf = 0.95) #' ##  11 #' #' # The above example indicates that for a sample of 1693 observations, #' # the A-Basis is best approximated as the 11th ordered observation. #' # In the example below, the same ordered observation would also be used #' # for a sample of size 1702. #' #' nonpara_binomial_rank(n = 1702, p = 0.99, conf = 0.95) #' ##  11 #' #' @export nonpara_binomial_rank <- function(n, p, conf) { p_orig <- p p <- 1 - p e_fcn <- function(r) { sum(vapply(r:n, function(w) { exp(lchoose(n, w) + w * log(p) + (n - w) * log(1 - p)) }, FUN.VALUE = numeric(1L))) } r1 <- 1 e1 <- e_fcn(r1) if (e1 < conf) { stop(paste0( "Sample size ", n, " is too small to compute a non-parametric ", "tolerance limit for p=", p_orig, " and conf=", conf)) } r2 <- n e2 <- e_fcn(r2) if (e2 > conf) { stop(paste0( "No rank found for n=", n, ", p=", p_orig, " conf=", conf)) } for (i in 1:n) { # use a for loop just to give it a limit to prevent infinite loope if (abs(r2 - r1) == 1) { break } rm <- round((r1 + r2) / 2, digits = 0) em <- e_fcn(rm) # nolint start # We know that the following holds, and we want this to continue to hold: # E1 > conf # E2 < conf # nolint end if (em > conf) { r1 <- rm e1 <- em } else { r2 <- rm e2 <- em } } r1 } nonpara_large_sample_rules <- single_point_rules nonpara_large_sample_rules[["sample_size"]] <- function(n, p, conf, ...) { if (p == 0.90 & conf == 0.95) { # B-Basis return(ifelse(n >= 28, "", paste0("This method should only be used for ", "B-Basis for sample sizes larger than 28"))) } else if (p == 0.99 & conf == 0.95) { # A-Basis return(ifelse(n >= 299, "", paste0("This method should only be used for ", "A-Basis for sample sizes larger than 299"))) } else { return(TRUE) } } #' @rdname basis #' @importFrom rlang enquo eval_tidy #' #' @export basis_nonpara_large_sample <- function(data = NULL, x, batch = NULL, p = 0.90, conf = 0.95, override = c()) { verify_tidy_input( df = data, x = x, c = match.call(), arg_name = "x") verify_tidy_input( df = data, x = batch, c = match.call(), arg_name = "batch") override <- process_overrides(override, nonpara_large_sample_rules) res <- new_basis( call = match.call(), distribution = "Nonparametric (large sample)", modcv = FALSE, p = p, conf = conf, override = override, data = eval_tidy(enquo(x), data), groups = NA, batch = eval_tidy(enquo(batch), data) ) res$diagnostic_results <- perform_checks(
nonpara_large_sample_rules, x = res$data, batch = res$batch, n = res$n, p = res$p, conf = res$conf, override = override ) res$diagnostic_failures <- get_check_failure_names(res$diagnostic_results) x_ordered <- sort(res$data)
r <- nonpara_binomial_rank(res$n, p, conf) res$basis <- x_ordered[r]

return(res)
}

anova_rules <- list(
outliers_within_group = function(x, groups, ...) {
group_mnr <- vapply(unique(groups), function(b) {
x_group <- x[groups == b]
mnr <- maximum_normed_residual(x = x_group)
mnr$n_outliers == 0 }, FUN.VALUE = logical(1L)) ifelse(all(group_mnr), "", paste0("Maximum normed residual test detected ", "outliers within one or more batch")) }, equality_of_variance = function(x, groups, ...) { lt <- levene_test(x = x, groups = groups) ifelse(!lt$reject_equal_variance, "",
paste0("Levene's test rejected the hypothesis that the ",
"variance of all groups is equal"))
},
number_of_groups = function(r, ...) {
ifelse(r >= 5, "",
"ANOVA should only be used for 5 or more groups")
}
)

#' @rdname basis
#' @importFrom rlang enquo eval_tidy
#' @export
basis_anova <- function(data = NULL, x, groups, p = 0.90, conf = 0.95,
override = c()) {
verify_tidy_input(
df = data,
x = x,
c = match.call(),
arg_name = "x")

verify_tidy_input(
df = data,
x = groups,
c = match.call(),
arg_name = "groups")

override <- process_overrides(override, anova_rules)

res <- new_basis(
call = match.call(),
distribution = "ANOVA",
modcv = FALSE,
p = p,
conf = conf,
override = override,
data = eval_tidy(enquo(x), data),
groups = eval_tidy(enquo(groups), data),
batch = NA
)

if (res$r < 2) { stop("ANOVA cannot be computed with fewer than 2 groups") } res$diagnostic_results <- perform_checks(
rules = anova_rules, x = res$data, groups = res$groups,
r = res$r, override = override ) res$diagnostic_failures <- get_check_failure_names(res$diagnostic_results) grand_mean <- mean(res$data)

ssb <- sum(vapply(
levels(as.factor(res$groups)), function(g) { group_data <- res$data[res$groups == g] length(group_data) * mean(group_data) ^ 2 }, FUN.VALUE = numeric(1L) )) - res$n * grand_mean ^ 2

sst <- sum(vapply(
res$data, function(xi) { xi ^ 2 }, FUN.VALUE = numeric(1L) )) - res$n * grand_mean ^ 2

sse <- sst - ssb

msb <- ssb / (res$r - 1) mse <- sse / (res$n - res$r) n_star <- sum(vapply( levels(as.factor(res$groups)),
function(g) {
group_data <- res$data[res$group == g]
length(group_data) ^ 2 / res$n }, FUN.VALUE = numeric(1L) )) effective_batch <- (res$n - n_star) / (res$r - 1) pop_sd <- sqrt( msb / effective_batch + (effective_batch - 1) / effective_batch * mse ) u <- msb / mse k0 <- k_factor_normal(res$n, p, conf)
k1 <- k_factor_normal(res$r, p, conf) w <- sqrt(u / (u + effective_batch - 1)) tol_factor <- (k0 - k1 / sqrt(effective_batch) + (k1 - k0) * w) / (1 - 1 / sqrt(effective_batch)) if (msb / mse <= 1) { tol_factor <- k0 } res$basis <- grand_mean - tol_factor * pop_sd

return(res)
}


## Try the cmstatr package in your browser

Any scripts or data that you put into this service are public.

cmstatr documentation built on Sept. 30, 2021, 5:08 p.m.