# R/SDistribution_Categorical.R

# nolint start
#' @name Categorical
#' @template SDist
#' @templateVar ClassName Categorical
#' @templateVar DistName Categorical
#' @templateVar uses in classification supervised learning
#' @templateVar params a given support set, \eqn{x_1,...,x_k}, and respective probabilities, \eqn{p_1,...,p_k},
#' @templateVar pdfpmf pmf
#' @templateVar pdfpmfeq \deqn{f(x_i) = p_i}
#' @templateVar paramsupport \eqn{p_i, i = 1,\ldots,k; \sum p_i = 1}
#' @templateVar distsupport \eqn{x_1,...,x_k}
#' @templateVar default elements = 1, probs = 1
#' @details
#' Sampling from this distribution is performed with the [sample] function with the elements given
#' as the support set and the probabilities from the `probs` parameter. The cdf and quantile assume
#' that the elements are supplied in an indexed order (otherwise the results are meaningless).
#'
#' The number of points in the distribution cannot be changed after construction.
#'
# nolint end
#
#' @template class_distribution
#' @template field_alias
#' @template method_mode
#' @template method_entropy
#' @template method_kurtosis
#' @template method_pgf
#' @template method_mgfcf
#' @template method_setParameterValue
#' @template param_decorators
#'
#' @family discrete distributions
#' @family univariate distributions
#'
#' @export
Categorical <- R6Class("Categorical",
  inherit = SDistribution, lock_objects = FALSE,
  public = list(
    # Public fields
    name = "Categorical",
    short_name = "Cat",
    description = "Categorical Probability Distribution.",
    alias = "C",

    # Public methods
    # initialize

    #' @description
    #' Creates a new instance of this [R6][R6::R6Class] class.
    #' @param elements `list()`\cr
    #' Categories in the distribution, see examples.
    #' @param probs `numeric()`\cr
    #' Probabilities of respective categories occurring.
    #'
    #' @examples
    #' # Note probabilities are automatically normalised (if not vectorised)
    #' x <- Categorical$new(elements = list("Bapple", "Banana", 2), probs = c(0.2, 0.4, 1))
    #'
    #' # Length of elements and probabilities cannot be changed after construction
    #' x$setParameterValue(probs = c(0.1, 0.2, 0.7))
    #'
    #' # d/p/q/r
    #' x$pdf(c("Bapple", "Carrot", 1, 2))
    #' x$cdf("Banana") # Assumes ordered in construction
    #' x$quantile(0.42) # Assumes ordered in construction
    #' x$rand(10)
    #'
    #' # Statistics
    #' x$mode()
    #'
    #' summary(x)
    initialize = function(elements = NULL, probs = NULL, decorators = NULL) {
      # NOTE(review): `elements` and `probs` are not forwarded explicitly here;
      # presumably the parent initializer picks them up - confirm in SDistribution.
      # The support passed here is a placeholder; the `properties` active binding
      # below rebuilds the support from the current `elements` parameter.
      super$initialize(
        decorators = decorators,
        support = Set$new(1),
        type = Universal$new(),
        symmetry = "sym"
      )
    },

    # stats

    #' @description
    #' The arithmetic mean of a (discrete) probability distribution X is the expectation
    #' \deqn{E_X(X) = \sum p_X(x)*x}
    #' with an integration analogue for continuous distributions.
    #' @param ... Unused.
    mean = function(...) {
      # Not defined for this distribution: one NaN per vectorised distribution,
      # or a single NaN otherwise.
      p <- self$getParameterValue("probs")
      if (checkmate::testList(p)) {
        return(rep(NaN, length(p)))
      } else {
        return(NaN)
      }
    },

    #' @description
    #' The mode of a probability distribution is the point at which the pdf is
    #' a local maximum, a distribution can be unimodal (one maximum) or multimodal (several
    #' maxima).
    mode = function(which = "all") {
      probs <- self$getParameterValue("probs")
      els <- self$getParameterValue("elements")
      if (!checkmate::testList(probs)) {
        # Single distribution: every element attaining the maximum probability.
        modes <- unlist(els[probs == max(probs)])
        if (which == "all") {
          return(modes)
        } else {
          return(modes[which])
        }
      } else {
        if (which == "all") {
          stop("`which` cannot be `'all'` when vectorising.")
        } else {
          # Vectorised: pick the `which`-th mode of each distribution, falling
          # back to the last mode when fewer than `which` modes exist.
          modes <- c()
          for (i in seq_along(probs)) {
            m <- (els[[i]][probs[[i]] == max(probs[[i]])])
            if (which > length(m)) {
              m <- m[length(m)]
            } else {
              m <- m[which]
            }
            modes <- c(modes, m)
          }
          return(modes)
        }
      }

    },

    #' @description
    #' The variance of a distribution is defined by the formula
    #' \deqn{var_X = E[X^2] - E[X]^2}
    #' where \eqn{E_X} is the expectation of distribution X. If the distribution is multivariate the
    #' covariance matrix is returned.
    #' @param ... Unused.
    variance = function(...) {
      p <- self$getParameterValue("probs")
      if (checkmate::testList(p)) {
        return(rep(NaN, length(p)))
      } else {
        return(NaN)
      }
    },

    #' @description
    #' The skewness of a distribution is defined by the third standardised moment,
    #' \deqn{sk_X = E_X[\frac{x - \mu}{\sigma}^3]}{sk_X = E_X[((x - \mu)/\sigma)^3]}
    #' where \eqn{E_X} is the expectation of distribution X, \eqn{\mu} is the mean of the
    #' distribution and \eqn{\sigma} is the standard deviation of the distribution.
    #' @param ... Unused.
    skewness = function(...) {
      p <- self$getParameterValue("probs")
      if (checkmate::testList(p)) {
        return(rep(NaN, length(p)))
      } else {
        return(NaN)
      }
    },

    #' @description
    #' The kurtosis of a distribution is defined by the fourth standardised moment,
    #' \deqn{k_X = E_X[\frac{x - \mu}{\sigma}^4]}{k_X = E_X[((x - \mu)/\sigma)^4]}
    #' where \eqn{E_X} is the expectation of distribution X, \eqn{\mu} is the mean of the
    #' distribution and \eqn{\sigma} is the standard deviation of the distribution.
    #' Excess Kurtosis is Kurtosis - 3.
    #' @param ... Unused.
    kurtosis = function(excess = TRUE, ...) {
      # `excess` is accepted for interface compatibility; the result is NaN either way.
      p <- self$getParameterValue("probs")
      if (checkmate::testList(p)) {
        return(rep(NaN, length(p)))
      } else {
        return(NaN)
      }
    },

    #' @description
    #' The entropy of a (discrete) distribution is defined by
    #' \deqn{- \sum (f_X)log(f_X)}
    #' where \eqn{f_X} is the pdf of distribution X, with an integration analogue for
    #' continuous distributions.
    #' @param ... Unused.
    entropy = function(base = 2, ...) {
      # `base` is accepted for interface compatibility; the result is NaN either way.
      p <- self$getParameterValue("probs")
      if (checkmate::testList(p)) {
        return(rep(NaN, length(p)))
      } else {
        return(NaN)
      }
    },

    #' @description The moment generating function is defined by
    #' \deqn{mgf_X(t) = E_X[exp(xt)]}
    #' where X is the distribution and \eqn{E_X} is the expectation of the distribution X.
    #' @param ... Unused.
    mgf = function(t, ...) {
      return(NaN)
    },

    #' @description The characteristic function is defined by
    #' \deqn{cf_X(t) = E_X[exp(xti)]}
    #' where X is the distribution and \eqn{E_X} is the expectation of the distribution X.
    #' @param ... Unused.
    cf = function(t, ...) {
      return(NaN)
    },

    #' @description The probability generating function is defined by
    #' \deqn{pgf_X(z) = E_X[z^x]}
    #' where X is the distribution and \eqn{E_X} is the expectation of the distribution X.
    #' @param ... Unused.
    pgf = function(z, ...) {
      return(NaN)
    }
  ),

  active = list(
    #' @field properties
    #' Returns distribution properties, including skewness type and symmetry.
    properties = function() {
      prop <- super$properties
      # Reported as symmetric only when every probability is identical.
      prop$symmetry <- if (length(unique(self$getParameterValue("probs"))) == 1) {
        "symmetric"
      } else {
        "asymmetric"
      }
      # The support is rebuilt from the current `elements` parameter on each access.
      prop$support <- Set$new(elements = self$getParameterValue("elements"))
      prop
    }
  ),

  private = list(
    # dpqr

    # Probability mass at `x`. Elements are translated to their positional index
    # (via `match`) before being handed to the weighted-discrete helper `.wd_pdf`,
    # which is defined elsewhere. In the vectorised case (list of `probs`) a matrix
    # with one row per `x` and one column per distribution is returned.
    .pdf = function(x, log = FALSE) {
      probs <- self$getParameterValue("probs")
      els <- self$getParameterValue("elements")

      if (checkmate::testList(probs)) {
        # One column per distribution; assumes all distributions have the same
        # number of elements as the first one.
        probs <- matrix(unlist(probs), nrow = length(probs[[1]]), ncol = length(probs))
        els <- matrix(unlist(els), ncol = ncol(probs))
        pdf <- matrix(nrow = length(x), ncol = ncol(probs))
        for (i in seq_len(ncol(probs))) {
          els_ind <- seq_along(els[, i])
          new_x <- match(x, els[, i])
          pdf[, i] <- .wd_pdf(new_x, els_ind, probs[, i], log)
        }
        pdf
      } else {
        els_ind <- seq_along(els)
        new_x <- match(x, els)
        .wd_pdf(new_x, els_ind, probs, log)
      }
    },

    # Cumulative probability at `x`, accumulating `probs` in the order the
    # elements were supplied. Same index translation and return shape as `.pdf`.
    .cdf = function(x, lower.tail = TRUE, log.p = FALSE) {

      probs <- self$getParameterValue("probs")
      els <- self$getParameterValue("elements")

      if (checkmate::testList(probs)) {
        probs <- matrix(unlist(probs), nrow = length(probs[[1]]), ncol = length(probs))
        els <- matrix(unlist(els), ncol = ncol(probs))
        cdf <- matrix(nrow = length(x), ncol = ncol(probs))
        for (i in seq_len(ncol(probs))) {
          els_ind <- seq_along(els[, i])
          new_x <- match(x, els[, i])
          new_cdf <- cumsum(probs[, i])
          cdf[, i] <- .wd_cdf(new_x, els_ind, new_cdf, lower.tail, log.p)
        }
        cdf
      } else {
        els_ind <- seq_along(els)
        new_x <- match(x, els)
        cdf <- cumsum(probs)
        .wd_cdf(new_x, els_ind, cdf, lower.tail, log.p)
      }

    },

    # Quantile at probabilities `p`. The helper `C_WeightedDiscreteQuantile`
    # (defined elsewhere) is called on positional indices; its result is then
    # used to index back into the original elements.
    .quantile = function(p, lower.tail = TRUE, log.p = FALSE) {

      probs <- self$getParameterValue("probs")
      els <- self$getParameterValue("elements")

      if (checkmate::testList(probs)) {
        probs <- matrix(unlist(probs), nrow = length(probs[[1]]), ncol = length(probs))
        new_els <- matrix(unlist(els), ncol = ncol(probs))
        quantile <- matrix(nrow = length(p), ncol = ncol(probs))
        for (i in seq_len(ncol(probs))) {
          els_ind <- seq_along(new_els[, i])
          new_cdf <- cumsum(probs[, i])
          # Keep the positional indices in their own variable. Storing them in
          # the result matrix first would coerce them to character as soon as an
          # earlier column holds non-numeric elements, turning the lookup below
          # into a (failing) lookup by name.
          idx <- C_WeightedDiscreteQuantile(p, els_ind, new_cdf, lower.tail, log.p)
          quantile[, i] <- unlist(els[[i]][idx])
        }
        return(quantile)
      } else {
        els_ind <- seq_along(els)
        cdf <- cumsum(probs)
        idx <- C_WeightedDiscreteQuantile(p, els_ind, cdf, lower.tail, log.p)
        return(unlist(els[idx]))
      }
    },

    # Draws `n` samples with replacement from the elements, weighted by `probs`.
    # Vectorised: an n-row matrix with one column per distribution.
    .rand = function(n) {
      els <- self$getParameterValue("elements")
      probs <- self$getParameterValue("probs")

      if (checkmate::testList(probs)) {
        rand <- matrix(nrow = n, ncol = length(probs))
        for (i in seq_along(probs)) {
          rand[, i] <- unlist(sample(els[[i]], n, TRUE, probs[[i]]))
        }
      } else {
        rand <- sample(els, n, TRUE, probs)
      }
      return(rand)
    },

    # traits
    .traits = list(valueSupport = "discrete", variateForm = "univariate")
  )
)

# Append the Categorical entry as a new row of the `.distr6$distributions` table.
.distr6$distributions <- rbind(
  .distr6$distributions,
  data.table::data.table(
    ShortName = "Cat",
    ClassName = "Categorical",
    Type = "V",
    ValueSupport = "discrete",
    VariateForm = "univariate",
    Package = "-",
    Tags = "",
    Alias = "C"
  )
)
# alan-turing-institute/distr6 documentation built on Feb. 26, 2024, 11 a.m.