usefr: Utility Functions for Statistical Analyses

## ########################################################################## #
##
## Copyright (C) 2019-2023 Robersy Sanchez <https://genomaths.com/>
##
## Author: Robersy Sanchez
#
## This file is part of the R package "usefr".
##
## 'usefr' is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License along with
## this program; if not, see <http://www.gnu.org/licenses/>.
##
## ########################################################################## #
#
#' @rdname fitCDF2
#' @title Nonlinear fit of a commutative distribution function
#' @description Usually the parameter estimation of a cumulative distribution
#' function (*CDF*) are accomplished using the corresponding probability
#' density function (*PDF*). Different optimization algorithms can be used to
#' accomplished this task and different algorithms can yield different
#' estimated parameters. Hence, why not try to fit the CDF directly?
#' @param varobj A a vector, a named list, a matrix or a data.frame, containing
#' the observations from the variable for which the CDF parameters will be
#' estimated. When the argument is a matrix or a data.frame, the columns must
#' be named, carrying the objective variables.
#' @param distNames a vector of distribution numbers to select from the listed
#' below in details section, e.g. c(1:10, 15). If 'distNames' is not any of
#' current 20 named distributions, then it can be any arbitrary character
#' string, but the argument 'distf' must be given (see below).
#' @param start A named numerical vector giving the parameters to be optimized
#' with initial values or a list of numerical vectors (only when
#' \emph{\strong{varobj}} is a list, a matrix or a data.frame). This can be
#' omitted for some of the named distributions (see Details). This argument
#' will be used if provided for only one distribution. The default parameter
#' values are:
#' \enumerate{
#'     \item norm = c( mean = MEAN, sd = SD )
#'     \item lnorm = c(meanlog = mean( log1p(X), na.rm = TRUE),
#'             sdlog = sd( log1p( X ), na.rm = TRUE))
#'     \item hnorm = c(theta = sqrt(pi)/(SD*sqrt(2)))
#'     \item gnorm = c( mean = MEAN, sigma = SD, beta = 2)
#'     \item tgnorm = c( mean = MEAN, sigma = SD, beta = 2)
#'     \item laplace = c( mean = MEAN, sigma = sqrt( VAR))
#'     \item gamma = c(shape_scale(X, gg = FALSE))
#'     \item gamma3p = c(shape_scale(X, gg = FALSE), mu = 0),
#'     \item ggamma = c(shape_scale(X, gg = TRUE), mu = MIN, psi = 1)
#'     \item ggamma = c(shape_scale(X, gg = TRUE), psi = 1)
#'     \item weibull = c( shape = log( 2 ), scale = Q)
#'     \item weibull3p = c( mu = MIN, shape = log( 2 ), scale = Q)
#'     \item beta = c(shape1 = 1, shape2 = 2)
#'     \item beta3 = c(shape1 = 1, shape2 = 2, a = MIN)
#'     \item beta4 = c(shape1 = 2, shape2 = 3, a=0.9 * MIN, b=1.1 * MAX)
#'     \item bweibull = c(alpha=1, beta=2, shape = log( 2 ), scale = Q)
#'     \item gbeta = c( shape1 = 1, shape2 = 2, lambda = 1)
#'     \item rayleigh = c( sigma = SD )
#'     \item exp = c( rate = 1)
#'     \item exp2 = c( rate = 1, mu = 0)
#'     \item geom = c(prob = ifelse(MEAN > 0, 1/(1 + MEAN), 1))
#'     \item bgeom = c(shape1 = 1, shape2 = 1)
#'     \item lgamma = shape_scale(log1p(X), gg = FALSE)
#'     \item lpgamma3p = c(shape_scale(log1p(X), gg = FALSE), mu = 0)
#' }
#' @param loss.fun Loss function(s) used in the regression (see
#' \href{https://en.wikipedia.org/wiki/Loss_function}{(Loss function)}). After
#' \eqn{z =  1/2 * sum((f(x) - y)^2)} we have:
#' \enumerate{
#'     \item "linear": linear function which gives a standard least squares:
#'           \eqn{loss(z) = z}.
#'     \item "huber": Huber loss, \eqn{loss(z) = ifelse(z <= 1, z, sqrt(z) -1)}.
#'     \item "smooth": Smooth approximation to the sum of residues absolute
#'           values: \eqn{loss(z) = 2*(sqrt(z + 1) - 1)}.
#'     \item "cauchy": Cauchy loss: \eqn{loss(z) = log(z + 1)}.
#'     \item "arctg": arc-tangent loss function: \eqn{loss(x) = atan(z)}.
#' }
#' @param min.val A number denoting the lower bound of the domain where CDF
#' is defined. For example, for Weibull and GGamma \strong{\emph{min.val = 0}}.
#' @param plot Logical. Default FALSE Whether to produce the plots for the best
#' fitted CDF.
#' @param plot.num The number of distributions to be plotted.
#' @param distf A character string naming a cumulative distribution function(s)
#' (CDF) present in the R session environment . For example, \strong{gamma} or
#' \strong{norm}, etc, from where, internally, we can get: density,
#' distribution function, quantile function and random generation as:
#' \strong{dnorm}, \strong{pnorm}, \strong{qnorm}, and \strong{rnorm},
#' respectively. If the function is not present in the environment, then an
#' error will be returned. It must given only if 'distNames' is not any of
#' current 20 named distributions (see  details below). Default is NULL.
#' @param only.info Logic. Default TRUE. If true, only information about the
#'   parameter estimation is returned.
#' @param maxiter,maxfev,ptol Parameters to control of various aspects of the
#' Levenberg-Marquardt algorithm through function
#' \code{\link[minpack.lm]{nls.lm.control}} from *minpack.lm* package.
#' @param nls.model Logical. Whether to return the best fitted model as an
#' object from \strong{nlsModel} class. Default is FALSE. If TRUE, then
#' the estimated parameters are used new fitting with \code{\link[stats]{nls}}
#' function.
#' @param algorithm Only if \strong{nls.model} = TRUE. The same as for
#' \code{\link[stats]{nls}} function.
#' @param xlabel (Optional) Label for variable \strong{\emph{varobj}}.
#' Default is \emph{xlabel = "x"}.
#' @param mar,mgp,las,cex.main (Optional) Graphical parameters (see
#' \code{\link[graphics]{par}}).
#' @param cex.text,cex.point Numerical value to scale text and points.
#' @param ... (Optional) Further graphical parameters (see
#' \code{\link[graphics]{par}}). Graphical parameter will simultaneously affect
#' all the plots.
#' @param verbose Logic. If TRUE, prints the function log to stdout
#' @details This function works as function \code{\link{fitCDF}}, but without
#' parallel computation. The reason is the parallelization fails in Windows OS.
#' CDFs defined out the package are not passed to the parallel processes on
#' Windows (so far).
#'
#' The nonlinear fit (NLF) problem for CDFs is addressed with
#' Levenberg-Marquardt algorithm implemented in function
#' \code{\link[minpack.lm]{nls.lm}} from package *minpack.lm*. The Stein's rho
#' for adjusted R squared (rho) is applied as an estimator of the average
#' cross-validation predictive power [1]. This function is inspired in a script
#' for the function \code{\link[propagate]{fitDistr}} from the package
#' propagate [2]. Some parts or script ideas from function
#' \code{\link[propagate]{fitDistr}} are used, but here we to estimate CDF and
#' not the PDF as in the case of "\code{\link[propagate]{fitDistr}}. More
#' informative results are given now. The studentized residuals are provided as
#' well. The list (so far) of possible CDFs is:
#'     \enumerate{
#'         \item Normal \href{https://goo.gl/xaEAdT}{(Wikipedia)}
#'         \item Log-normal \href{https://goo.gl/a7MtYq}{(Wikipedia)}. This
#'         This function is set to fit \eqn{log(1+x)}. Users can transform
#'         their variable by themself and then try the fitting to Normal
#'         distribution.
#'         \item Half-normal \href{https://goo.gl/yxMF6T}{(Wikipedia)}. An
#'             Alternatively using a scaled precision (inverse of the variance)
#'             parametrization (to avoid issues if \eqn{\sigma} is near zero),
#'             obtained by setting \eqn{\theta=sqrt(\pi)/\sigma*sqrt(2)}.
#'         \item Generalized Normal \href{https://goo.gl/EPk8mH}{(Wikipedia)}
#'         \item T-Generalized Normal [3].
#'         \item Laplace \href{https://goo.gl/fCykV9}{(Wikipedia)}
#'         \item Gamma \href{https://goo.gl/cYkvar}{(Wikipedia)}
#'         \item 3P Gamma [4].
#'         \item Generalized 4P Gamma [4]
#'                 \href{https://goo.gl/1n4kpW.}{(Wikipedia)}
#'         \item Generalized 3P Gamma [4].
#'         \item Weibull \href{https://goo.gl/WMXmQP}{(Wikipedia)}
#'         \item 3P Weibull \href{https://goo.gl/WMXmQP}{(Wikipedia)}
#'         \item Beta \href{https://goo.gl/893wzR}{(Wikipedia)}
#'         \item 3P Beta \href{https://goo.gl/893wzR}{(Wikipedia)}
#'         \item 4P Beta \href{https://goo.gl/893wzR}{(Wikipedia)}
#'         \item Beta-Weibull \href{https://goo.gl/dpaG8h}{ReliaWiki}
#'         \item Generalized Beta \href{https://goo.gl/UcVsct}{(Wikipedia)}
#'         \item Rayleigh \href{https://goo.gl/d9b3zv}{(Wikipedia)}
#'         \item Exponential \href{https://goo.gl/stVsi7}{(Wikipedia)}
#'         \item 2P Exponential \href{https://goo.gl/stVsi7}{(Wikipedia)}
#'         \item Geometric \href{https://is.gd/94HW4w}{(Wikipedia)}
#'         \item "Beta-Geometric"
#'               \url{https://www.statisticshowto.com/beta-geometric-distribution/}
#'         \item Log-Gamma \href{https://is.gd/kMQVxX}{(Mathematica)}
#'         \item Log-Gamma 3P \href{https://is.gd/kMQVxX}{(Mathematica)}
#'     }
#'
#' Where, shape_scale function is an internal function that can be
#' retrieve by typing: usefr:::shape_scale.
#' @return After return the plots, a list with following values is provided:
#'     \itemize{
#'         \item aic: Akaike information creterion
#'         \item fit: list of results of fitted distribution, with parameter
#'             values
#'         \item bestfit: the best fitted distribution according to AIC
#'         \item fitted: fitted values from the best fit
#'         \item rstudent: studentized residuals
#'         \item residuals: residuals
#'     }
#'     After cdf = fitCDF2( varobj, ...), attributes( cdf$bestfit ) shows the
#'     list of objects carry on cdf$bestfit:
#'      \itemize{
#'         \item names: "par" "hessian" "fvec" "info" "message" "diag" "niter"
#'                      "rsstrace"  "deviance"
#'         \item class: "nls.lm"
#'      }
#'
#' And fitting details can be retrieved with summary(cdf$bestfit)
#'
#' @importFrom numDeriv grad
#' @importFrom minpack.lm nls.lm nls.lm.control
#' @importFrom utils flush.console
#' @importFrom mixdist weibullpar
#' @import stats
#' @importFrom graphics par grid lines mtext abline
#' @export
#' @author Robersy Sanchez (\url{https://genomaths.com}).
#' @seealso \code{\link{fitCDF}}, \code{\link[MASS]{fitdistr}},
#' \code{\link{fitMixDist}} and for goodness-of-fit: \code{\link{mcgoftest}}.
#' @references
#'   \enumerate{
#'     \item Stevens JP. Applied Multivariate Statistics for the Social
#'           Sciences. Fifth Edit. Routledge Academic; 2009.
#'     \item Andrej-Nikolai Spiess (2014). propagate: Propagation of
#'           Uncertainty. R package version 1.0-4.
#'           http://CRAN.R-project.org/package=propagate
#'     \item Abramowitz, M. and Stegun, I. A. (1972) Handbook of Mathematical
#'           Functions. New York: Dover. Chapter 6: Gamma and Related
#'           Functions.
#'     \item Hand-book on STATISTICAL DISTRIBUTIONS for experimentalists
#'           (pag 73) by Christian Walck. Particle Physics Group Fysikum.
#'           University of Stockholm (e-mail: walck@physto.se).
#'   }
#' @examples
#' set.seed(1230)
#' x1 <- rnorm(10000, mean = 0.5, sd = 1)
#' cdfp <- fitCDF2(x1, distNames = "Normal", plot = FALSE)
#' summary(cdfp$bestfit)
#'
#' ## Add some cosmetics to the plots
#' cdfp <- fitCDF2(x1,
#'     distNames = "Normal", xlabel = "My Nice Variable Label",
#'     plot = T, font.lab = 3, font = 2, font.axis = 2, family = "serif",
#'     cex.lab = 1.3, cex.axis = 1.3
#' )
#'
#' ## Fitting a Weibull distribution with 3 paramaters
#' x1 <- rweibull3p(1000, shape = 0.5, scale = 1, mu = 0.1)
#' cdfp <- fitCDF2(x1,
#'     distNames = "3P Weibull",
#'     xlabel = "My Nice Variable Label",
#'     plot = T, font.lab = 3, font = 2, font.axis = 2, family = "serif",
#'     cex.lab = 1.3, cex.axis = 1.3, cex.main = 1.1,
#'     mgp = c(2.5, 1, 0)
#' )
#' @aliases fitCDF2
setGeneric(
    "fitCDF2",
    def = function(varobj,
    ...) {
        standardGeneric("fitCDF2")
    }
)

#' @rdname fitCDF2
#' @aliases fitCDF2
#' @export
setMethod(
    "fitCDF2", signature(varobj = "numeric"),
    function(varobj,
    distNames,
    plot = FALSE,
    plot.num = 1L,
    distf = NULL,
    start = NULL,
    loss.fun = c(
        "linear", "huber", "smooth",
        "cauchy", "arctg"
    ),
    min.val = NULL,
    only.info = FALSE,
    maxiter = 1024,
    maxfev = 1e+5,
    ptol = 1e-12,
    nls.model = FALSE,
    algorithm = "default",
    xlabel = "x",
    mar = c(4, 4, 3, 1),
    mgp = c(2.5, 0.6, 0),
    las = 1,
    cex.main = 1,
    cex.text = 0.8,
    cex.point = 0.5,
    verbose = TRUE,
    ...) {
        res <- fitCDF(
            varobj = varobj,
            distNames = distNames,
            plot = plot,
            plot.num = plot.num,
            distf = distf,
            start = start,
            loss.fun = loss.fun,
            min.val = min.val,
            only.info = only.info,
            maxiter = maxiter,
            maxfev = maxfev,
            ptol = ptol,
            nls.model = nls.model,
            algorithm = algorithm,
            xlabel = xlabel,
            mar = mar,
            mgp = mgp,
            las = las,
            cex.main = cex.main,
            cex.text = cex.text,
            cex.point = cex.point,
            verbose = verbose,
            ...
        )
        return(res)
    }
)

setClassUnion(
    "list_OR_matrix_OR_dataframe",
    c("list", "matrix", "data.frame")
)

#' @rdname fitCDF2
#' @aliases fitCDF2
#' @importFrom pbapply pboptions pblapply
#' @export
setMethod(
    "fitCDF2", signature(varobj = "list_OR_matrix_OR_dataframe"),
    function(
        varobj,
        distNames,
        plot = FALSE,
        plot.num = 1L,
        distf = NULL,
        start = NULL,
        loss.fun = c(
            "linear", "huber", "smooth",
            "cauchy", "arctg"
        ),
        min.val = NULL,
        only.info = FALSE,
        maxiter = 1024,
        maxfev = 1e+5,
        ptol = 1e-12,
        nls.model = FALSE,
        algorithm = "default",
        xlabel = "x",
        mar = c(4, 4, 3, 1),
        mgp = c(2.5, 0.6, 0),
        las = 1,
        cex.main = 1,
        cex.text = 0.8,
        cex.point = 0.5,
        num.cores = 1L,
        verbose = TRUE,
        ...) {

        var_nms <- names(varobj)
        if (is.list(varobj)) {
            num_samp <- length(varobj)
        }
        if (inherits(varobj, c("matrix", "data.frame"))) {
            num_samp <- ncol(varobj)
        }

        if (!is.numeric(distNames) && !is.character(distNames)) {
            stop(
                "*** 'distNames' argument must be a 'character'",
                " or numeric."
            )
        }

        if (is.character(distNames)) {
            distNames <- list(distNames)
            if (length(distNames) < num_samp) {
                distNames <- rep(distNames, num_samp)
            }
        }

        if (is.numeric(distNames)) {
            distNames <- as.integer(distNames)
            if (any(distNames > 24)) {
                stop("*** 'distNames' must be a string or a number < 24")
            }
            distNames <- list(distNames)
            if (length(distNames) < num_samp) {
                distNames <- rep(distNames, num_samp)
            }
        }

        if (!is.character(distf) && !is.null(distf)) {
            stop(
                "*** 'distf' argument must be a 'character' class",
                "object."
            )
        }

        if (is.character(distf) && length(distf) < num_samp) {
            distf <- list(distf)
            distf <- rep(distf, num_samp)
        }
        if (is.list(start)) {
            if (!all(sapply(start, is.numeric))) {
                stop("*** 'start' must be a list of numerical vectors")
            }
        }

        if (is.numeric(start)) {
            start <- list(start)
            if (length(start) < num_samp) {
                start <- rep(start, num_samp)
            }
        }

        if (is.list(start) && length(start) < num_samp) {
            start <- list(start)[rep(1, num_samp)]
        }

        nms <- (length(loss.fun) < num_samp)
        if (num_samp > 1 && nms) {
            loss.fun <- rep(loss.fun, num_samp)
        }

        if (inherits(varobj, c("matrix", "data.frame"))) {
            varobj <- as.matrix(varobj)
            varobj <- split(varobj,
                            rep(1:ncol(varobj), each = nrow(varobj)))
        }

        if (verbose)
            pboptions(type = "timer", style = 1, char = "=")
        else
            pboptions(type = "none")

        res <- pblapply(seq_along(varobj), function(k) {
                    suppressWarnings(fitCDF(
                            varobj = varobj[[k]],
                            distNames = distNames[[k]],
                            plot = plot,
                            plot.num = plot.num,
                            distf = distf[[k]],
                            start = start[[k]],
                            loss.fun = loss.fun[k],
                            min.val = min.val,
                            only.info = only.info,
                            maxiter = maxiter,
                            maxfev = maxfev,
                            ptol = ptol,
                            nls.model = nls.model,
                            algorithm = algorithm,
                            xlabel = xlabel,
                            mar = mar,
                            mgp = mgp,
                            las = las,
                            cex.main = cex.main,
                            cex.text = cex.text,
                            cex.point = cex.point,
                            verbose = FALSE))})

        names(res) <- var_nms
        res$AICs <- summary_aic(res)
        res <- structure(res, class = "CDFmodelList")
        return(res)
    }
)