R/mcgoftest.R

Defines functions jackknife_index sw_sta_pval shapiro GoFtest DoIt stat_fun hdiv distfn ks_stat chisq rmse FREQs freqnd freqs ad_stat

## ########################################################################### #
##
## Copyright (C) 2019 Robersy Sanchez <https://genomaths.com/>
##
## Author: Robersy Sanchez
##
## This file is part of the R package "usefr".
##
## 'usefr' is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License along with
## this program; if not, see <http://www.gnu.org/licenses/>.
##
## ########################################################################### #
#
#' @rdname mcgoftest
#' @title Bootstrap test for Goodness of fit (GoF)
#' @description To accomplish the nonlinear fit of a probability distribution
#' function \emph{(PDF)}, different optimization algorithms can be used. Each
#' algorithm will return a different set of estimated parameter values. AIC and
#' BIC are not useful (in this case) to decide which parameter set of values is
#' the best. The goodness-of-fit tests \emph{(GOF)} can help in this case.
#' Please, see below the examples on how to use this function.
#'
#' @details The test is intended mostly for continuous distributions. Basically,
#' given the set of parameter values \strong{\emph{pars}} from distribution
#' \strong{\emph{distr}}, \strong{\emph{num.sampl}} sets of random samples will
#' be generated, each one of them with \strong{\emph{sample.size}} element. The
#' selected statistic \strong{\emph{pars}} will be computed for each randomly
#' generated set (\emph{b_stats}) and for sample \strong{\emph{varobj}}
#' (\emph{s_stat}). Next, the bootstrap \emph{p-value} will be computed as:
#' \eqn{mean(c(s_stat, b_stats) >= s_stat)}.
#'
#' If both variables, \strong{\emph{varobj}} and \strong{\emph{distr}}, are
#' numerical vectors, then \code{\link{tableBoots}} function is applied. That
#' is, the problem is confronted as a \eqn{n x m} contingency independence test,
#' since there is no way to prove that two arbitrary sequences of integer
#' numbers would follow the same probability distribution without provide
#' additional information/knowledge.
#'
#' If sampling size is lesser the size of the sample, then the test becomes a
#' Monte Carlo test. The test is based on the use of measures of goodness of
#' fit, statistics. The following statistics are available (and some
#' limitations for their application to continuous variables are given):
#'
#'     \itemize{
#'         \item Kolmogorov- Smirnov statistic ('ks'). Limitations: sensitive to
#'               ties [1]. Only the parametric Monte Carlo resampling
#'               (provided that there is not ties in the data) can be used.
#'
#'         \item Anderson–Darling statistic ('ad') [2]. Limitation: by
#'               construction, it depends on the sample size. So, the size of
#'               the sampling must be close to the sample size if Monte Carlo
#'               resampling is used, which could be a limitation if the sample
#'               size is too large [2]. In particular, could be an issue in some
#'               genomic applications. It is worth highlighting that, for the
#'               current application, the Anderson–Darling statistic is not
#'               standardized as typically done in testing GoF for normal
#'               distribution with Anderson–Darling test. It is not required
#'               since, the statistic is not compared with a corresponding
#'               theoretical value. In addition, since the computation of this
#'               statistic requires for the data to be put in order [2], it does
#'               not make sense to perform a permutation test. That is, the
#'               maximum sampling size is the sample size less 1.
#'
#'         \item Shapiro-Wilk statistic ('sw') [5]. A Jackknife resampling is
#'              applied, instead of a Monte Carlo resampling. Simulation studies
#'              suggest that Shapiro-Wilk test is the most powerful normality
#'              test, followed by Anderson-Darling test, Lillie/ors test and
#'              Kolmogorov-Smirnov test [6]. In this case, a jackknife
#'              resampling is applied (leave-one-out) and the
#'              \emph{\strong{p-value}} for the mean of Shapiro-Wilk statistic
#'              is returned.
#'
#'         \item Pearson's Chi-squared statistic ('chisq'). Limitation: the
#'               sample must be discretized (partitioned into bins), which is
#'               could be a source of bias that leads to the rejection of the
#'               null hypothesis. Here, the discretization is done using
#'               function the resources from function
#'               \code{\link[graphics]{hist}}.
#'
#'         \item Root Mean Square statistic ('rmse'). Limitation: the same
#'               as 'chisq'.
#'         \item Hellinger Divergence statistic ('hd'). Limitation: the same
#'               as 'chisq'.
#'        }
#' If the argument \strong{\emph{distr}} must be defined in
#' environment-namespace from any package or the environment defined by the
#' user. if  \eqn{missing( sample.size )}, then
#' \eqn{sample.size <- length(varobj) - 1}.
#'
#' Notice that 'chisq', 'rmse', and 'hd' tests can be applied to testing two
#' discrete probability distributions as well. However, here,
#' \strong{\emph{mcgoftest}} function is limited to continuous probability
#' distributions.
#'
#' Additionally, the only supported n-dimensional probability distribution is
#' Dirichlet Distribution (\emph{Dir}). The GOF for Dir is based on the fact
#' that if a variable \eqn{x = (x_1, x_2, ...x_n)} follows Dirichlet
#' Distribution with parameters \eqn{\alpha = \alpha_1, ... , \alpha_n} (all
#' positive reals), in short, \eqn{x ~ Dir(\alpha)}, then \eqn{x_i ~
#' Beta(\alpha_i, \alpha_0 - \alpha_i)}, where Beta(.) stands for the Beta
#' distribution and \eqn{\alpha_0 = \sum \alpha_i} (see Detail section,
#' \code{\link{dirichlet}} function, and example 5).
#'
#' @param varobj A a vector containing observations, the variable for which the
#' CDF parameters was estimated or the discrete absolute frequencies of each
#' observation category.
#' @param model A nonlinear regression model from one of the following classes:
#' "CDFmodel", "CDFmodelList", "NLM", and "nls.lm".
#' @param distr The possible options are:
#' \enumerate{
#'    \item The name of the cumulative distribution function (CDF).
#'    \item A concrete CDF, defined by the user,
#'          from where to estimate the cumulative probabilities.
#'    \item A numerical vector carrying discrete absolute frequencies for the
#'          same categories provided in \strong{\emph{varobj}}.
#'    \item NULL value (default). In this cases the Shapiro-Wilk test of
#'          normality is applied. The jackknife estimator of a parameter of
#'          Shapiro-Wilk statistic is accomplished.
#' }
#'
#' @param pars CDF model parameters. A list of parameters to evaluate the CDF.
#' @param stat One string denoting the statistic to used in the testing:
#' \enumerate{
#'    \item "ks": Kolmogorov–Smirnov.
#'    \item "ad": Anderson–Darling statistic.
#'    \item "sw": Shapiro-Wilk test of normality.
#'    \item "chisq: Pearson's Chi-squared.
#'    \item "rmse": Root Mean Square of the error.
#'    \item "hd": Hellinger Divergence statistics.
#' }
#' @param min.val A number denoting the lower bound of the domain where CDF
#' is defined.
#' @param breaks Default is NULL. Basically, the it is same as in function
#' \code{\link[graphics]{hist}}. If \emph{breaks} = NULL, then function
#' 'nclass.FD' (see \code{\link[grDevices]{nclass}} is applied to estimate
#' the breaks.
#' @param par.names (Optional) The names of the parameters from
#' \strong{\emph{distr}} function. Some distribution functions would require to
#' provide the names of their parameters.
#' @param num.sampl Number of resamplings.
#' @param sample.size Size of the samples used for each sampling.
#' @param seed An integer used to set a 'seed' for random number generation.
#' @param num.cores,tasks Parameters for parallel computation using package
#' \code{\link[BiocParallel]{BiocParallel-package}}: the number of cores to use,
#' i.e. at most how many child processes will be run simultaneously (see
#' \code{\link[BiocParallel]{bplapply}} and the number of tasks per job (only
#' for Linux OS).
#' @param verbose if verbose, comments and progress bar will be printed.
#' @importFrom BiocParallel MulticoreParam SnowParam bplapply
#' @importFrom stats ks.test rmultinom
#' @return A numeric vector with the following data:
#'     \enumerate{
#'
#'         \item Statistic value.
#'
#'         \item mc_p.value: the probability of finding the observed, or more
#'               extreme, results when the null hypothesis \eqn{H_0} of a study
#'               question is true obtained Monte Carlo resampling approach.
#'     }
#' @export
#' @author Robersy Sanchez (\url{https://genomaths.com}).
#' @references
#'     \enumerate{
#'         \item Feller, W. On the Kolmogorov-Smirnov Limit Theorems for
#'             Empirical Distributions. Ann. Math. Stat. 19, 177–189 (1948).
#'         \item Anderson, T. . & Darling, D. A. A Test Of Goodness Of Fit. J.
#'             Am. Stat. Assoc. 49, 765–769 (1954).
#'         \item Watson, G. S. On Chi-Square Goodness-Of-Fit Tests for
#'             Continuous Distributions. J. R. Stat. Soc. Ser. B Stat.
#'             Methodol. 20, 44–72 (1958).
#'         \item A. Basu, A. Mandal, L. Pardo, Hypothesis testing for two
#'             discrete populations based on the Hellinger distance. Stat.
#'             Probab. Lett. 80, 206–214 (2010).
#'         \item Patrick Royston (1982). An extension of Shapiro and Wilk's W
#'             test for normality to large samples. Applied Statistics, 31,
#'             115–124. doi: 10.2307/2347973.
#'         \item Y. Bee Wah, N. Mohd Razali, Power comparisons of Shapiro-Wilk,
#'             Kolmogorov-Smirnov, Lilliefors and Anderson-Darling tests.
#'             J. Stat. Model. Anal. 2, 21–33 (2011).
#'     }
#' @seealso Distribution fitting: \code{\link{fitMixDist}},
#'     \code{\link[MASS]{fitdistr}}, \code{\link{fitCDF}}, and
#'     \code{\link{bicopulaGOF}}.
#' @examples
#' ## ======== Example 1 =======
#' # Let us generate a random sample a from a specified Weibull distribution:
#' # Set a seed
#' set.seed(1)
#' # Random sample from Weibull( x | shape = 0.5, scale = 1.2 )
#' x <- rweibull(10000, shape = 0.5, scale = 1.2)
#'
#' # MC KS test accept the null hypothesis that variable x comes
#' # from Weibull(x | shape = 0.5, scale = 1.2), while the standard
#' # Kolmogorov-Smirnov test reject the Null Hypothesis.
#' mcgoftest(x,
#'     distr = "weibull", pars = c(0.5, 1.2), num.sampl = 500,
#'     sample.size = 1000, num.cores = 4
#' )
#'
#' ## ========= Example 2 ======
#' # Let us generate a random sample a random sample from a specified Normal
#' # distribution:
#' # Set a seed
#' set.seed(1)
#' x <- rnorm(10000, mean = 1.5, sd = 2)
#'
#' # MC KS test accept the null hypothesis that variable x comes
#' # from N(x | mean = 0.5, sd = 1.2), while the standard
#' # Kolmogorov-Smirnov test reject the Null Hypothesis. This an old KS issue,
#' # well known by statisticians since the 1970s.
#' mcgoftest(x,
#'     distr = "norm", pars = c(1.5, 2), num.sampl = 500,
#'     sample.size = 1000, num.cores = 1
#' )
#'
#' ## ========= Example 3 ======
#' ## Define a Weibull 3-parameter distribution function.
#' ## Functions must be defined out of the example section.
#' \dontrun{
#' pwdist <- function(x, pars) {
#'     pweibull(x - pars[1],
#'         shape = pars[2],
#'         scale = pars[3]
#'     )
#' }
#' rwdist <- function(n, pars) {
#'     rweibull(n,
#'         shape = pars[2],
#'         scale = pars[3]
#'     ) + pars[1]
#' }
#'
#' ## A random generation from Weibull-3P
#' set.seed(123)
#' pars <- c(mu = 0.9, shape = 1.4, scale = 3.7)
#' w <- rwdist(200, pars = pars)
#'
#' ## Testing GoF
#' mcgoftest(
#'     varobj = w, distr = "wdist", pars = list(pars), num.sampl = 100,
#'     sample.size = 199, stat = "chisq", num.cores = 4, breaks = 100,
#'     seed = 123
#' )
#' }
#'
#' ## ========= Example 4 ======
#' ## ----- Testing GoF of a mixture distribution. ----
#' ## Define a mixture distribution to be evaluated with functions 'mixtdistr'
#' ## (see ?mixtdistr). In the current case, it will be mixture of a Log-Normal
#' ## and a Weibull distributions:
#'
#' phi <- c(0.37, 0.63) # Mixture proportions
#' args <- list(
#'     lnorm = c(meanlog = 0.837, sdlog = 0.385),
#'     weibull = c(shape = 2.7, scale = 5.8)
#' )
#'
#' ##  Sampling from the specified mixture distribution
#' set.seed(123)
#' x <- rmixtdistr(n = 1e5, phi = phi, arg = args)
#' hist(x, 100, freq = FALSE)
#' x1 <- seq(0, 10, by = 0.001)
#' lines(x1, dmixtdistr(x1, phi = phi, arg = args), col = "red")
#'
#' ## The GoF for the simulated sample
#' pars <- c(list(phi = phi), arg = list(args))
#' mcgoftest(
#'     varobj = x, distr = "mixtdistr", pars = pars, num.sampl = 999,
#'     sample.size = 999, stat = "chisq", num.cores = 4, breaks = 200,
#'     seed = 123
#' )
#'
##' #' ## ========= Example 5 ======
#' ## Shapiro-Wilk test of normality.
#'
#' set.seed(151)
#' r <- rnorm(21, mean = 5, sd = 1)
#' shapiro.test(r) # Classical test
#'
#' mcgoftest(r, stat = "sw")
#'
#' ## ========= Example 6 ======
#' ## GoF for Dirichlet Distribution (these examples can be run,
#' ## the 'do not-run' is only to prevent time consuming in R checking)
#' \dontrun{
#' set.seed(1)
#' alpha <- c(2.1, 3.2, 3.3)
#' x <- rdirichlet(n = 100, alpha = alpha)
#'
#' mcgoftest(
#'     varobj = x, distr = "dirichlet",
#'     pars = alpha, num.sampl = 999,
#'     sample.size = 100, stat = "chisq",
#'     par.names = "alpha",
#'     num.cores = 4, breaks = 50,
#'     seed = 1
#' )
#'
#' ## Now, adding some noise to the sample
#' set.seed(1)
#' x <- x + replicate(3, runif(100, max = 0.1))
#' x <- x / rowSums(x)
#'
#' mcgoftest(
#'     varobj = x, distr = "dirichlet",
#'     pars = alpha, num.sampl = 999,
#'     sample.size = 100, stat = "chisq",
#'     par.names = "alpha",
#'     num.cores = 4, breaks = 50,
#'     seed = 1
#' )
#' }
#'
#' ## ========= Example 7 ======
#' ## Testing multinomially distributed vectors
#'
#' ## A vector of probability parameters
#' set.seed(123)
#' prob <- round(runif(12, min = 0.1, max = 0.8), 2)
#' prob <- prob / sum(prob) # To normalize the probability vector
#'
#' ## Generate multinomially distributed random numeric vectors
#' r <- rmultinom(12, size = 120, prob = prob)
#'
#' ## A list the parameter values must be provided
#' pars <- list(size = 120, prob = prob)
#'
#' mcgoftest(r,
#'     distr = "multinom", stat = "chisq",
#'     pars = pars, sample.size = 20,
#'     num.sampl = 50
#' )
#'
#' ## ========= Example 8 ======
#' ##  Testing whether two discrete probabilities vectors
#' ##  would come from different probability distributions.
#' set.seed(1)
#' ## Vector of absolute frequencies
#' n <- 12
#' alpha <- round(runif(n, min = 0.1, max = 0.8) * 10, 1)
#' x1 <- round(runif(n = n) * 100)
#'
#' ## Add some little noise
#' x2 <- x1 + round(runif(n, min = 0.1, max = 0.2) * 10)
#'
#' mcgoftest(
#'     varobj = x1, distr = x2,
#'     num.sampl = 999,
#'     stat = "chisq",
#'     seed = 1
#' )
#'
#' ## Compare it with Chi-square test from R package 'stat'
#' chisq.test(rbind(x1, x2), simulate.p.value = TRUE, B = 2e3)$p.value
#'
#' ## Add bigger noise to 'x1'
#' x3 <- x1 + round(rnorm(n, mean = 5, sd = 1) * 10)
#' mcgoftest(
#'     varobj = x1, distr = x3,
#'     num.sampl = 999,
#'     stat = "chisq",
#'     seed = 1
#' )
#'
#' ## Chi-square test from R package 'stat' is consistent with 'mcgoftest'
#' chisq.test(rbind(x1, x3), simulate.p.value = TRUE, B = 2e3)$p.value
#'
#' ## We will always fail in to detect differences between 'crude' probability
#' ## vectors without additional information.
#' chisq.test(x = x1, y = x3, simulate.p.value = TRUE, B = 2e3)$p.value
#'
#' @aliases mcgoftest
setGeneric(
    "mcgoftest",
    def = function(
        varobj,
        model,
    ...) {
        standardGeneric("mcgoftest")
    }
)


setClassUnion("numeric_OR_matrix", c("numeric", "matrix", "data.frame"))
setOldClass(c("CDFmodel", "CDFmodelList"))

#' @rdname mcgoftest
#' @aliases mcgoftest
#' @export
setMethod(
    "mcgoftest", signature(
        varobj = "numeric_OR_matrix",
        model = "missing"
    ),
    function(varobj,
    model = NULL,
    distr = NULL,
    pars = NULL,
    num.sampl = 999,
    sample.size = NULL,
    stat = c("ks", "ad", "sw", "rmse", "chisq", "hd"),
    min.val = NULL,
    breaks = NULL,
    par.names = NULL,
    seed = 1,
    num.cores = 1L,
    tasks = 0L,
    verbose = TRUE) {
        ## A starting permutation script for permutation test used the idea
        ## published on \href{https://goo.gl/hfnNy8}{Alastair Sanderson} An idea
        ## presented in his post: \emph{Using R to analyse data statistical and
        ## numerical data analysis with R}. Herein, it is modified and extended.
        set.seed(seed)
        stat <- match.arg(stat)
        require_pars <- is.element(stat, c("ks", "ad", "rmse", "chisq", "hd"))
        if (require_pars && is.null(pars) && is.character(distr)) {
            stop(
                "\n*** If model = NULL, a list of parameter values is ",
                "required for the application of '",
                c("ks", "ad", "rmse", "chisq", "hd")[require_pars],
                "' approach.",
                "\nYou can see the examples."
            )
        }

        ## Remove NA values
        varobj <- na.omit(varobj)

        if (!is.null(min.val)) {
            varobj <- varobj[ which(varobj > min.val) ]
        }

        if (is.null(distr) && stat != "sw") {
            if (length(varobj) < 7) {
                stop("\n*** At least 7 observation/values must be provided")
            }
            stat <- "sw"
            message("--> Applying Shapiro-Wilk test of normality.")
        }

        if (stat == "sw") {
            parametric <- TRUE
            distr <- NULL
            num.sampl <- length(varobj)
            if (is.null(sample.size)) {
                sample.size <- length(varobj) - 1
            }
            if (sample.size > 2000) {
                stop(
                    "\n*** For Shapiro-Wilk test sample size  must be",
                    " between 7 and 2000"
                )
            }
        }

        if (stat == "sw" && !is.null(distr)) {
            distr <- NULL
            warning(
                "*** 'sw' approach does not require for ",
                "a 'distr' value, since it is a test of normality without ",
                "further assumptions.\n"
            )
        }

        if (is.character(distr)) {
            if (distr == "dirichlet" && is.element(stat, c("ks", "ad"))) {
                stop(
                    "\n*** 'ks' and 'ad' approaches are not available ",
                    "for n-dimensional distributions"
                )
            }
        }

        if (is.numeric(distr)) {
            if (is.element(stat, c("ks", "ad"))) {
                distr <- "chisq"
                warning(
                    "*** 'ks' and 'ad' approaches are not",
                    " available for discrete distributions",
                    "\n'chisq' approach was applied"
                )
            }

            if (!is.null(dim(distr))) {
                stop(
                    "*** 'distr' must be a character string naming a ",
                    "distribution function e.g, 'norm' will permit accessing ",
                    "functions 'pnorm' and 'rnorm', or a numerical vector"
                )
            }
            if (!is.null(dim(varobj))) {
                stop(
                    "\n*** if 'distr' is a numerical vector,",
                    " then 'varobj' must be a numerical vector as well"
                )
            }
            if (length(distr) != length(varobj)) {
                stop(
                    "\n*** if 'distr' is a numeric vector,",
                    " then length(distr) == length(varobj)"
                )
            }
            parametric <- FALSE
        }

        if (is.character(distr)) {
            is.discr <- is.element(distr, c("dirichlet", "multinom"))
            pdistr <- paste0("p", distr)
            pdistr <- try(get(pdistr,
                mode = "function",
                envir = parent.frame()
            ), silent = TRUE)

            if (inherits(pdistr, "try-error") && !is.discr) {
                pdistr <- distr
                pdistr <- try(get(pdistr,
                    mode = "function",
                    envir = parent.frame()
                ), silent = TRUE)

                if (inherits(pdistr, "try-error")) {
                    stop(
                        "*** 'distr' must be a character string naming a ",
                        "distribution function e.g, 'norm' will permit ",
                        "accessing functions 'pnorm' and 'rnorm', or a",
                        " numerical vector."
                    )
                }
            }
            rdistr <- paste0("r", distr)
            rdistr <- try(get(rdistr,
                mode = "function",
                envir = parent.frame()
            ), silent = TRUE)

            if (inherits(rdistr, "try-error") && !is.discr) {
                stop(
                    "*** 'distr' must be a character string naming a ",
                    "distribution function e.g, 'norm' will permit accessing ",
                    "functions 'pnorm' and 'rnorm'"
                )
            }
            parametric <- TRUE
        }

        if (inherits(varobj, c("matrix", "data.frame"))) {
            l <- nrow(varobj)
        } else {
            l <- length(varobj)
        }

        if (is.null(sample.size)) {
            sample.size <- l
        }

        if (sample.size > l && !parametric) {
            sample.size <- l
        }
        statis <- switch(stat,
            ks = "Kolmogorov-Smirnov",
            ad = "Anderson–Darling",
            sw = "Shapiro-Wilk",
            rmse = "Root Mean Square",
            chisq = "Pearson's Chi-squared",
            hd = "Hellinger test"
        )

        if (verbose) {
            method <- ifelse(sample.size < num.sampl,
                "Monte Carlo GoF testing",
                "Permutation GoF testing"
            )

            approach <- ifelse(parametric, "parametric approach",
                "non-parametric approach"
            )

            cat(
                "***", method, "based on",
                statis, "statistic",
                "(", approach, ")",
                " ...\n"
            )
        }

        if (stat == "ks" && !parametric && sample.size < length(varobj)) {
            warning(
                "*** ", method, " based on ", statis, " statistic ",
                "(", approach, ") ", "is not reliable. \n",
                "Use the parametric approach or permutation instead."
            )
        }

        if (parametric) {
            res <- GoFtest(
                x = varobj,
                R = num.sampl,
                distr = distr,
                szise = sample.size,
                pars = pars,
                stat = stat,
                breaks = breaks,
                par.names = par.names,
                num.cores = num.cores,
                tasks = tasks,
                verbose = verbose
            )
        } else {
            if (is.numeric(distr)) {
                if (stat == "rmse") stat <- "rmst"
                res <- tableBoots(
                    rbind(varobj, distr),
                    stat = stat,
                    num.permut = num.sampl
                )
            }
        }
        return(res)
    }
)


#' @rdname mcgoftest
#' @aliases mcgoftest
#' @export
setMethod(
    "mcgoftest", signature(varobj = "numeric", model = "CDFmodel"),
    function(varobj,
    model,
    num.sampl = 999,
    sample.size = NULL,
    stat = c("ks", "ad", "sw", "rmse", "chisq", "hd"),
    min.val = NULL,
    breaks = NULL,
    par.names = NULL,
    seed = 1,
    num.cores = 1L,
    tasks = 0L,
    verbose = TRUE) {
        res <- mcgoftest(
            varobj = varobj,
            distr = model$cdf,
            pars = coef(model$bestfit),
            num.sampl = num.sampl,
            sample.size = sample.size,
            stat = stat,
            min.val = min.val,
            breaks = breaks,
            par.names = par.names,
            seed = seed,
            num.cores = num.cores,
            tasks = tasks,
            verbose = verbose
        )


        return(res)
    }
)


#' @rdname mcgoftest
#' @aliases mcgoftest
#' @export
setMethod(
    "mcgoftest",
    signature(varobj = "numeric_OR_matrix", model = "CDFmodelList"),
    function(varobj,
    model,
    num.sampl = 999,
    sample.size = NULL,
    stat = c("ks", "ad", "sw", "rmse", "chisq", "hd"),
    min.val = NULL,
    breaks = NULL,
    par.names = NULL,
    seed = 1,
    num.cores = 1L,
    tasks = 0L,
    verbose = TRUE) {
        l <- seq_len(length(model) - 1)
        nms <- names(model[l])

        if (inherits(varobj, c("matrix", "data.frame"))) {
            res <- lapply(
                l,
                function(k) {
                    mcgoftest(
                        model = model[[k]],
                        varobj = varobj[, k],
                        num.sampl = num.sampl,
                        sample.size = sample.size,
                        stat = stat,
                        min.val = min.val,
                        breaks = breaks,
                        par.names = par.names,
                        seed = seed,
                        num.cores = num.cores,
                        tasks = tasks,
                        verbose = verbose
                    )
                }
            )
        }

        if (inherits(varobj, "list")) {
            res <- lapply(
                l,
                function(k) {
                    mcgoftest(
                        model = model[[k]],
                        varobj = varobj[[k]],
                        num.sampl = num.sampl,
                        sample.size = sample.size,
                        stat = stat,
                        min.val = min.val,
                        breaks = breaks,
                        par.names = par.names,
                        seed = seed,
                        num.cores = num.cores,
                        tasks = tasks,
                        verbose = verbose
                    )
                }
            )
        }

        names(res) <- nms
        res <- do.call(rbind, res)
        return(res)
    }
)


#' @rdname mcgoftest
#' @aliases mcgoftest
#' @export
setMethod(
    "mcgoftest", signature(model = "nls"),
    function(
        varobj,
        model,
        num.sampl = 999,
        sample.size = NULL,
        stat = c("ks", "ad", "sw", "rmse", "chisq", "hd"),
        min.val = NULL,
        breaks = NULL,
        par.names = NULL,
        seed = 1,
        num.cores = 1L,
        tasks = 0L,
        verbose = TRUE) {

        form <- model$m$formula()

        res <- mcgoftest(
            varobj = varobj,
            distr = form[[3L]][[1L]],
            pars = coef(model),
            num.sampl = num.sampl,
            sample.size = sample.size,
            stat = stat,
            min.val = min.val,
            breaks = breaks,
            par.names = par.names,
            seed = seed,
            num.cores = num.cores,
            tasks = tasks,
            verbose = verbose
        )

        return(res)
    }
)

#' @rdname mcgoftest
#' @aliases mcgoftest
#' @export
setMethod(
    "mcgoftest", signature(model = "nlsModel"),
    function(
        varobj,
        model,
        num.sampl = 999,
        sample.size = NULL,
        stat = c("ks", "ad", "sw", "rmse", "chisq", "hd"),
        min.val = NULL,
        breaks = NULL,
        par.names = NULL,
        seed = 1,
        num.cores = 1L,
        tasks = 0L,
        verbose = TRUE) {

        form <- model$m$formula()

        res <- mcgoftest(
            varobj = varobj,
            distr = form[[3L]][[1L]],
            pars = coef(model),
            num.sampl = num.sampl,
            sample.size = sample.size,
            stat = stat,
            min.val = min.val,
            breaks = breaks,
            par.names = par.names,
            seed = seed,
            num.cores = num.cores,
            tasks = tasks,
            verbose = verbose
        )

        return(res)
    }
)


#' @rdname mcgoftest
#' @aliases mcgoftest
#' @export
setMethod(
    "mcgoftest", signature(model = "nls.lm"),
    function(
        varobj,
        model,
        distr,
        num.sampl = 999,
        sample.size = NULL,
        stat = c("ks", "ad", "sw", "rmse", "chisq", "hd"),
        min.val = NULL,
        breaks = NULL,
        par.names = NULL,
        seed = 1,
        num.cores = 1L,
        tasks = 0L,
        verbose = TRUE) {

        res <- mcgoftest(
            varobj = varobj,
            distr = distr,
            pars = coef(model),
            num.sampl = num.sampl,
            sample.size = sample.size,
            stat = stat,
            min.val = min.val,
            breaks = breaks,
            par.names = par.names,
            seed = seed,
            num.cores = num.cores,
            tasks = tasks,
            verbose = verbose
        )

        return(res)
    }
)


# ======================= Anderson–Darling statistic ========================= #

ad_stat <- function(x, distr, pars = NULL, par.names = NULL) {
    x <- sort(x[complete.cases(x)])
    if (!missing(distr)) {
        x <- distfn(
            x = x,
            dfn = distr,
            type = "p",
            arg = pars,
            par.names = par.names
        )
    }

    x <- x[x > 0]
    n <- length(x)
    if (n < 8) {
        stop("AD statistic the sample size must be greater than 7")
    }

    h <- x * (1 - rev(x))
    h <- (2 * seq(x) - 1) * log(h)
    return(-n - mean(h, na.rm = TRUE))
}


# ============================================================================ #
# ======================= Auxiliary functions ================================ #
# ============================================================================ #

# ================ Auxiliary function to compute the frequencies ============= #

freqs <- function(x, distr, pars = NULL, breaks = NULL,
    par.names = NULL) {
    n <- length(x)

    if (is.null(breaks)) {
        breaks <- nclass.FD(x)
    }
    DENS <- hist(x, breaks = breaks, plot = FALSE)
    freq0 <- DENS$counts
    bounds <- data.frame(
        lower = DENS$breaks[-length(DENS$breaks)],
        upper = DENS$breaks[-1]
    )
    up_val <- distfn(
        x = bounds$upper, dfn = distr,
        type = "p", arg = pars,
        par.names = par.names
    )
    low_val <- distfn(
        x = bounds$lower, dfn = distr,
        type = "p", arg = pars,
        par.names = par.names
    )
    freq <- round(n * (up_val - low_val))

    return(data.frame(obsf = freq0, expf = freq))
}

# ------------- n-dimensional frequency based on marginals
# So far, only for Dirichlet distribution

freqnd <- function(x, distr, breaks, pars,
    par.names = NULL) {
    fq <- switch(distr,
        beta = {
            alfa <- sum(pars)
            lapply(
                seq_len(ncol(x)),
                function(k) {
                    freqs(
                        x = x[, k],
                        distr = distr,
                        pars = list(
                            shape1 = pars[k],
                            shape2 = alfa - pars[k]
                        ),
                        breaks = breaks,
                        par.names = par.names
                    )
                }
            )
        },
        binom = {
            lapply(
                seq_len(ncol(x)),
                function(k) {
                    freqs(
                        x = x[, k],
                        distr = distr,
                        pars = list(
                            size = pars[[1]],
                            prob = pars[[2]][k]
                        ),
                        breaks = breaks,
                        par.names = par.names
                    )
                }
            )
        }
    )

    fq <- do.call(rbind, fq)
    rownames(fq) <- NULL
    return(fq)
}

FREQs <- function(x, distr, pars = NULL, breaks = NULL,
    par.names = NULL) {
    if (is.element(distr, c("dirichlet", "multinom"))) {
        distr <- switch(distr,
            dirichlet = "beta",
            multinom = "binom"
        )
        fq <- freqnd(
            x = x, distr = distr,
            breaks = breaks, pars = pars,
            par.names = NULL
        )
    } else {
        fq <- freqs(
            x = x, distr, breaks = breaks,
            pars = pars, par.names = NULL
        )
    }
    return(fq)
}

# ====================== Root-Mean-Square statistic ========================== #

rmse <- function(x, distr, pars, breaks = NULL, par.names) {
    freq <- FREQs(
        x = x, distr = distr, pars = pars,
        breaks = breaks, par.names = par.names
    )
    return(sqrt(sum((freq$obsf - freq$expf)^2, na.rm = TRUE)) / length(freq$obsf))
}

# =================== Pearson's Chi-squared  statistic ======================= #

chisq <- function(x, distr, pars, breaks = NULL, par.names) {
    freq <- FREQs(
        x = x, distr = distr, pars = pars,
        breaks = breaks, par.names = par.names
    )
    if (any(freq$expf == 0)) {
        freq$expf <- freq$expf + 0.5
        freq$obsf <- freq$obsf + 0.5
    }
    return(sum((freq$obsf - freq$expf)^2 / freq$expf))
}

# ====================== Kolmogorov-Smirnov  statistic ======================= #

ks_stat <- function(x, distr, pars, par.names) {
    cdf <- function(x, pars) {
        distfn(
            x = x, dfn = distr, type = "p",
            arg = pars, par.names = par.names
        )
    }
    res <- suppressWarnings(ks.test(x = x, y = cdf, pars))
    return(list(stat = unname(res$statistic), p.value = res$p.value))
}

# ================== Auxiliary function to get distribution ================== #

distfn <- function(x, dfn, type = "r", arg, par.names = NULL) {
    if (dfn == "dirichlet") {
        if (type == "r")
            res <- rdirichlet(n = x, arg)
        else
            res <- pdirichlet(q = x, arg)
    }
    else {
        if (!is.null(par.names)) {
            arg <- as.list(arg)
            args <- c(list(x), arg)
            if (type == "r") {
                names(args) <- c("n", par.names)
            } else {
                names(args) <- c("q", par.names)
            }
        } else {
            args <- c(list(x), arg)
        }
        res <- do.call(paste0(type, dfn), args)
    }
    return(res)
}

# ==================== Hellinger divergence  statistic ======================= #

hdiv <- function(x, distr, pars, breaks = NULL, par.names) {
    p <- FREQs(
        x = x, distr = distr, pars = pars,
        breaks = breaks, par.names = par.names
    )
    n1 <- sum(p$obsf, na.rm = TRUE)
    n2 <- sum(p$expf, na.rm = TRUE)
    n <- c(n1, n2)

    if (any(p$expf == 0)) {
        p$obsf <- (p$obsf + 1) / (n1 + length(p$obsf))
        p$expf <- (p$expf + 1) / (n2 + length(p$expf))
    } else {
        p$expf <- p$expf / n1
        p$obsf <- p$obsf / n2
    }

    w <- (2 * n[1] * n[2]) / (n[1] + n[2])
    sum_hd <- sapply(
        seq_along(x),
        function(k) (sqrt(p[k, 1]) - sqrt(p[k, 2]))^2
    )

    return(w * sum(sum_hd, na.rm = TRUE))
}


# ======================= To compute the statistic ========================== #

# ------- To compute the statistic

stat_fun <- function(a, distr, pars, stat, breaks, par.names) {
    switch(stat,
        ks = ks_stat(
            x = a, distr = distr, pars = pars,
            par.names = par.names
        )$stat,
        ad = ad_stat(
            x = a, distr = distr, pars = pars,
            par.names = par.names
        ),
        rmse = rmse(
            x = a, distr = distr, pars = pars,
            par.names = par.names,
            breaks = breaks
        ),
        chisq = chisq(
            x = a, distr = distr, pars = pars,
            par.names = par.names, breaks = breaks
        ),
        hd = hdiv(
            x = a, distr = distr, pars = pars,
            par.names = par.names,
            breaks = breaks
        )
    )
}


# ------- To iterate the statistic computation

DoIt <- function(r, x, distr, pars, stat, breaks, par.names, szise) {

    if (distr == "multinom") {
        # to test empirical versus theoretical values
        stat_fun(
            a = x, distr = distr, pars = pars, stat = stat,
            breaks = breaks, par.names = par.names
        )
    }
    else {
        a <- distfn(
            x = szise, dfn = distr,
            type = "r", arg = pars,
            par.names = par.names
        )

        # to test empirical versus theoretical values
        stat_fun(
            a = a, distr = distr, pars = pars, stat = stat,
            breaks = breaks, par.names = par.names
        )
    }
}

# DoIt(1, distr= distr, pars=pars, stat=stat, breaks=breaks,
#      par.names= par.names)

# ======================= GoF test function caller =========================== #

GoFtest <- function(
    x,
    R,
    distr,
    szise,
    pars,
    stat,
    breaks,
    par.names,
    num.cores,
    tasks,
    verbose) {
    if (verbose) progressbar <- TRUE else progressbar <- FALSE
    if (num.cores > 1 && stat != "sw") {
        if (Sys.info()["sysname"] == "Linux") {
            bpparam <- MulticoreParam(
                workers = num.cores, tasks = tasks,
                progressbar = progressbar
            )
        } else {
            bpparam <- SnowParam(
                workers = num.cores, type = "SOCK",
                progressbar = progressbar
            )
        }

        bstats <- unlist(bplapply(1:R, DoIt, x,
            distr = distr,
            pars = pars, stat = stat,
            breaks = breaks,
            szise = szise,
            par.names = par.names,
            BPPARAM = bpparam
        ))
    }
    else {
        if (stat != "sw") {
            bstats <- c()
            for (k in seq_len(R)) {
                bstats <- c(
                    bstats,
                    DoIt(
                        1, x, distr, pars, stat, breaks,
                        par.names, szise
                    )
                )
            }
        }
    }

    if (stat == "sw") {
        bstats <- shapiro(x)
    }


    if (is.element(stat, c("ks", "ad")) && is.numeric(distr)) {
        stop(
            "\n*** 'ks' and 'ad' are only available for continuous",
            " distribution"
        )
    }

    res <- switch(stat,
        ks = {
            statis <- ks_stat(
                x = x, distr = distr, pars = pars,
                par.names = par.names
            )
            p.value <- statis$p.value
            statis <- statis$stat
            c(
                KS.stat.D = statis,
                mc_p.value = mean(c(statis, bstats) >= statis,
                    na.rm = TRUE
                ),
                KS.stat.p.value = p.value,
                sample.size = szise,
                num.sampl = R
            )
        },
        ad = {
            statis <- ad_stat(
                x = x, distr = distr, pars = pars,
                par.names = par.names
            )
            c(
                AD.stat = statis,
                mc_p.value = mean(c(statis, bstats) >= statis,
                    na.rm = TRUE
                ),
                sample.size = szise,
                num.sampl = R
            )
        },
        sw = {
            statis <- shapiro.test(x = x)
            w <- mean(statis$statistic, na.rm = TRUE)
            c(
                SW.stat.p.value = statis$p.value,
                jackknife_p.value = sw_sta_pval(w, szise),
                sample.size = szise,
                num.sampl = R
            )
        },
        rmse = {
            statis <- rmse(
                x = x, distr = distr,
                pars = pars,
                par.names = par.names,
                breaks = breaks
            )
            c(
                rmse = statis,
                mc_p.value = mean(c(statis, bstats) >= statis,
                    na.rm = TRUE
                ),
                sample.size = szise,
                num.sampl = R
            )
        },
        chisq = {
            statis <- chisq(
                x = x, distr = distr, pars = pars,
                par.names = par.names,
                breaks = breaks
            )
            c(
                Chisq = statis,
                mc_p.value = mean(c(statis, bstats) >= statis,
                    na.rm = TRUE
                ),
                sample.size = szise,
                num.sampl = R
            )
        },
        hd = {
            statis <- hdiv(
                x = x, distr = distr, pars = pars,
                par.names = par.names,
                breaks = breaks
            )
            c(
                hd = statis,
                mc_p.value = mean(c(statis, bstats) >= statis,
                    na.rm = TRUE
                ),
                sample.size = szise, num.sampl = R
            )
        }
    )
    return(res)
}

# ============== Shapiro-Wilk statistic & related functions ================== #

shapiro <- function(r) {
    ## A jackknife approach (live out one)
    idx <- jackknife_index(r)
    statis <- unlist(lapply(idx, function(i) {
        shapiro.test(r[i])$statistic
    }))
    return(statis)
}

## ------------ Calculate significance level for Shapiro-Wilk statistic

sw_sta_pval <- function(w, n) {
    ## polynomial coefficients
    lambda_7_20 <- c(0.118898, 0.133414, 0, 327907)
    lambda_21_2000 <- c(
        0.480385, 0.318828, 0, -0.0241665,
        0.00879701, 0.002989646
    )
    log_mu_7_20 <- c(-0.37542, -0.492145, -1.124332, -0.199422)
    log_mu_21_2000 <- c(
        -1.91487, -1.37888, -0.04183209,
        0.1066339, -0.03513666, -0.01504614
    )

    log_s_7_20 <- c(-3.15805, 0.729399, 3.01855, 1.558776)
    log_s_21_2000 <- c(
        -3.73538, -1.015807, -0.331885, 0.1773538,
        -0.01638782, -0.03215018, 0.003852646
    )

    nlog <- log(n)
    if (n > 6 && n < 21) {
        x <- nlog - 3
        lambda <- lambda_7_20[1] + lambda_7_20[2] * x + lambda_7_20[3] * x^2
        mu <- log_mu_7_20[1] + log_mu_7_20[2] * x + log_mu_7_20[3] * x^2 +
            log_mu_7_20[4] * x^3
        sd <- log_s_7_20[1] + log_s_7_20[2] * x + log_s_7_20[3] * x^2 +
            log_s_7_20[4] * x^3
        pval <- pnorm(((1 - w)^lambda - exp(mu)) / exp(sd),
            lower.tail = FALSE
        )
    }

    if (n > 20 && n < 2001) {
        x <- nlog - 5
        m <- seq_len(6)
        pars <- sapply(m, function(k) {
            lambda <- lambda_21_2000[k] * x^(k - 1)
            mu <- log_mu_21_2000[k] * x^(k - 1)
            sd <- log_s_21_2000[k] * x^(k - 1)
            return(c(lambda = lambda, mu = mu, sd = sd))
        })

        pars <- rowSums(pars)
        pval <- pnorm(((1 - w)^pars[1] - exp(pars[2])) / exp(pars[3]),
            lower.tail = FALSE
        )
    }
    return(unname(pval))
}


# ===================== Jackknife leave-out-one indexes ====================== #

jackknife_index <- function(r) {
    l <- length(r)
    f <- function(i) c(setdiff(i, l), l)
    idx <- combn(seq_along(r), l - 1,
        FUN = f, simplify = FALSE
    )
    idx[[1]] <- idx[[1]][-l]
    return(idx)
}



# --------------------------- End auxiliary function ------------------------- #
genomaths/usefr documentation built on April 18, 2023, 3:35 a.m.