arules: Mining Association Rules and Frequent Itemsets

Documented in confint.rules

#######################################################################
# arules - Mining Association Rules and Frequent Itemsets
# Copyright (C) 2011-2015 Michael Hahsler, Christian Buchta,
#			Bettina Gruen and Kurt Hornik
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.


# NOTE: prop.test can test differences in proportions (could be used for DOC)


#' Confidence Intervals for Interest Measures for Association Rules
#'
#' Defines a method to compute confidence intervals for interest measures for association [rules].
#'
#' This method creates a contingency table for each rule and then constructs a
#' confidence interval for the specified measures.
#'
#' Fast confidence interval approximations are currently available for the
#' measures `"support"`, `"count"`, `"confidence"`, `"lift"`, `"oddsRatio"`, and `"phi"`.
#' For all other measures, bootstrap sampling from a multinomial distribution
#' is used.
#'
#' Haldan-Anscombe correction (Haldan, 1940; Anscombe, 1956) to avoids issues
#' with zero counts can be specified by `smoothCounts = 0.5`. Here .5 is
#' added to each count in the contingency table.
#'
#' @name confint
#' @family interest measures
#' 
#' @aliases confint confint.rules
#' @param object an object of class [rules].
#' @param parm,measure name of the interest measures (see [interestMeasure()]).
#' `measure` can be used instead of `parm`.
#' @param level the confidence level required.
#' @param side Should a two-sided confidence interval or a one-sided limit be
#' returned? Lower returns an interval with only a lower limit and upper
#' returns an interval with only an upper limit.
#' @param method method to construct the confidence interval. The available
#' methods depends on the measure and the most common method is used by
#' default.
#' @param smoothCounts pseudo count for addaptive smoothing (Laplace
#' smoothing). Often a pseudo counts of .5 is used for smoothing (see Detail
#' Section).
#' @param replications number of replications for method `"simulation"`. Ignored
#' for other methods.
#' @param transactions if the rules object does not contain sufficient quality
#' information, then a set of transactions to calculate the confidence interval
#' for can be specified.
#' @param ... Additional parameters are ignored with a warning.
#' @return Returns a matrix with with one row for each rule and the two columns
#' named `"LL"` and `"UL"` with the interval boundaries.
#' The matrix has the following additional attributes:
#'
#' \item{measure}{ the interest measure.} 
#' \item{level}{ the confidence level}
#' \item{side}{ the confidence level} 
#' \item{smoothCounts}{ used count
#' smoothing. } 
#' \item{method}{ name of the method to create the interval }
#' \item{desc}{ description of the used method to calculate the confidence
#' interval. The mentioned references can be found below. }
#' @author Michael Hahsler
#' @references Wilson, E. B. (1927). "Probable inference, the law of
#' succession, and statistical inference". 
#' _Journal of the American Statistical Association,_ 22 (158): 209-212.
#' \doi{10.1080/01621459.1927.10502953}
#'
#' Clopper, C.; Pearson, E. S. (1934). "The use of confidence or fiducial
#' limits illustrated in the case of the binomial". _Biometrika,_ 26 (4):
#' 404-413. 
#' \doi{10.1093/biomet/26.4.404}
#'
#' Doob, J. L. (1935). "The Limiting Distributions of Certain Statistics".
#' _Annals of Mathematical Statistics,_ 6: 160-169.
#' \doi{10.1214/aoms/1177732594}
#'
#' Fisher, R.A. (1962). "Confidence limits for a cross-product ratio".
#' _Australian Journal of Statistics,_ 4, 41.
#'
#' Woolf, B. (1955). "On estimating the relation between blood group and
#' diseases". _Annals of Human Genetics,_ 19, 251-253.
#'
#' Haldane, J.B.S. (1940). "The mean and variance of the moments of chi-squared
#' when used as a test of homogeneity, when expectations are small".
#' _Biometrika,_ 29, 133-134.
#'
#' Anscombe, F.J. (1956). "On estimating binomial response relations".
#' _Biometrika,_ 43, 461-464.
#' @keywords manip
#' @examples
#' data("Income")
#'
#' # mine some rules with the consequent "language in home=english"
#' rules <- apriori(Income, parameter = list(support = 0.5),
#'   appearance = list(rhs = "language in home=english"))
#'
#' # calculate the confidence interval for the rules' odds ratios.
#' # note that we use Haldane-Anscombe correction (with smoothCounts = .5)
#' # to avoid issues with 0 counts in the contingency table.
#' ci <- confint(rules, "oddsRatio",  smoothCounts = .5)
#' ci
#'
#' # We add the odds ratio (with Haldane-Anscombe correction)
#' # and the confidence intervals to the quality slot of the rules.
#' quality(rules) <- cbind(
#'   quality(rules),
#'   oddsRatio = interestMeasure(rules, "oddsRatio", smoothCounts = .5),
#'   oddsRatio = ci)
#'
#' rules <- sort(rules, by = "oddsRatio")
#' inspect(rules)
#'
#' # use confidence intervals for lift to find rules with a lift significantly larger then 1.
#' # We set the confidence level to 95%, create a one-sided interval and check
#' # if the interval does not cover 1 (i.e., the lower limit is larger than 1).
#' ci <- confint(rules, "lift", level = 0.95, side = "lower")
#' ci
#'
#' inspect(rules[ci[, "LL"] > 1])
confint.rules <- function(object,
  parm = "oddsRatio",
  level = 0.95,
  measure = NULL,
  side = c("two.sided", "lower", "upper"),
  method = NULL,
  replications = 1000,
  smoothCounts = 0,
  transactions = NULL,
  ...) {
  # measure can be used instead of parm
  if (is.null(measure))
    measure <- parm
  
  measure <- match.arg(measure, choices = measuresRules)
  
  # one-sided CI (adjust level)
  level_orig <- level
  side <- match.arg(side)
  if (side != "two.sided")
    level <- 1 - (1 - level) * 2
  
  if (level < 0 ||
      level > 1)
    stop("the confidence level needs to be in [0, 1].")
  
  counts <-
    as.data.frame(
      .getCounts(
        object,
        transactions = transactions,
        reuse = ifelse(is.null(transactions), TRUE, FALSE),
        smoothCounts = smoothCounts
      )
    )
  
  # try fast approximate intervals first
  ci <- .confint_approx(counts, measure, method, level)
  
  # fall back to bootstrap if .confint_approx does not return anything?
  if (is.null(ci))
    ci <-
    .confint_bootstrap(counts,
      measure,
      level,
      smoothCounts = smoothCounts,
      replications = replications,
      ...)
  
  if (side == "lower")
    ci[, "UL"] <- +Inf
  if (side == "upper")
    ci[, "LL"] <- -Inf
  
  structure(
    ci,
    measure = measure,
    level = level_orig,
    side = side,
    smoothCounts = smoothCounts
  )
}

.confint_bootstrap <-
  function(counts,
    measure,
    level = 0.95,
    smoothCounts = 0,
    replications = 1000,
    ...) {
    method <- "bootstrap"
    desc <-
      "Confidence interval using bootstrapping (random draws from a multinomial distribution)."
    
    qs <- c((1 - level) / 2, (1 + level) / 2)
    
    # smooth for better probability estimates
    counts <- counts[, c("n11", "n10", "n01", "n00")] + smoothCounts
    n <- sum(counts[1,])
    p <- counts / n
    
    ci <-
      matrix(
        NA_real_,
        nrow = nrow(p),
        ncol = 2,
        dimnames = list(NULL, c("LL", "UL"))
      )
    
    se <- vector(length = nrow(p))
    
    ### simulated counts also need to be smoothed
    for (i in seq(nrow(p))) {
      ns <- t(stats::rmultinom(replications, n, p[i,]))
      colnames(ns) <- c("n11", "n10", "n01", "n00")
      vals <-
        .basicRuleMeasure(ns, measure, smoothCounts = smoothCounts, ...)
      if (!any(is.na(vals)))
        ci[i,] <- stats::quantile(vals, probs = qs)
        se[i] <- stats::sd(vals) / sqrt(replications)
    }
    
    structure(
      ci,
      se = se,
      measure = measure,
      level = level,
      method = method,
      desc = desc
    )
  }


# CIs

# helper to construct a CI from sample mean and variance.
# for log = TRUE, var needs the log of the sample variance
ci.norm <- function(mean,
  var,
  n,
  level = 0.95,
  log = FALSE,
  desc = NULL) {
  z <- stats::qnorm((1 + level) / 2)
  se <- sqrt(var) / sqrt(n)
  if (log)
    ci <- cbind(LL = mean / exp(z * se),
      UL = mean * exp(z * se))
  else
    ci <- cbind(LL = mean - z * se,
      UL = mean + z * se)
  
  structure(ci,
    se = se,
    desc = desc)
}

# Normal approximation population proportion confidence interval (see Wilson, 1927)
ci.prop <- function(f, n, level) {
  p <- f / n
  ci.norm(p, p * (1 - p), n, level, desc = "Normal approximation population proportion confidence interval (see Wilson, 1927)")
}

# Wilson score confidence interval (Wilson, 1927)
# https://www.itl.nist.gov/div898/handbook/prc/section2/prc241.htm
# NOTE: prop.test can also be used for the Wilson Score interval
ci.prop.wilson <- function(f, n, level) {
  z_UL <- stats::qnorm((1 + level) / 2)
  z_LL <- stats::qnorm((1 - level) / 2)
  
  p <- f / n
  
  ci <- cbind(
    LL = (p + z_LL ^ 2 / (2 * n) +
        z_LL * sqrt(p * (1 - p) / n + z_LL ^ 2 / (4 * n ^ 2))) / (1 + z_LL ^
            2 / n),
    UL = (p + z_UL ^ 2 / (2 * n) +
        z_UL * sqrt(p * (1 - p) / n + z_UL ^ 2 / (4 * n ^ 2))) / (1 + z_UL ^
            2 / n)
  )
  
  structure(ci, desc = "Wilson score confidence interval (Wilson, 1927)")
}

# Exact binomial proportion confidence interval (Clopper and Pearson, 1934)
.vector.prop.binom <- Vectorize(function(f, n, level)
  stats::binom.test(f, n, conf.level = level)$conf.int)

ci.prop.binom <-
  function(f, n, level)
    structure(t(.vector.prop.binom(f, n, level)),
      dimnames = list(NULL, c("LL", "UL")),
      desc = "Exact binomial proportion confidence interval (Clopper and Pearson, 1934)")

# Exact CI for the odds ratio
.vector_OR <- Vectorize(function(n11, n01, n10, n00, level)
  stats::fisher.test(ceil(rbind(
    c(n11, n01), c(n10, n00)
  )), conf.level = level)$conf.int,
)
# ceil rounds up in case of a non-integer smoothCounts value

ci.or.exact <-
  function(n11, n01, n10, n00, level)
    structure(t(.vector_OR(n11, n01, n10, n00, level)),
      dimnames = list(NULL, c("LL", "UL")),
      desc = "Exact confidence interval for the odds ratio (Fisher, 1962).")



# Approximate CIs
.confint_approx <-
  function(counts,
    measure,
    method = NULL,
    level = 0.95,
    smoothCounts = 0) {
    ci <- NULL
    
    # we only use these 4 counts and recompute all other counts/probabilities
    n11 <- counts[, "n11"] + smoothCounts
    n10 <- counts[, "n10"] + smoothCounts
    n01 <- counts[, "n01"] + smoothCounts
    n00 <- counts[, "n00"] + smoothCounts
    
    n <- n11 + n10 + n01 + n00
    n1x <- n11 + n10
    nx1 <- n11 + n01
    n0x <- n - n1x
    nx0 <- n - nx1
    
    p1 <-  n11 / n
    p2 <-  n10 / n
    p3 <-  n01 / n
    p4 <-  n00 / n
    
    p00 <- n00 / n
    p01 <- n01 / n
    p10 <- n10 / n
    p11 <- n11 / n
    
    p0x <- n0x / n
    p1x <- n1x / n
    px0 <- nx0 / n
    px1 <- nx1 / n
    
    ###########################################################################
    if (measure == "count") {
      method <-
        match.arg(method, choices = c("wilson", "normal", "exact", "bootstrap"))
      
      ci <- switch(
        method,
        exact = ci.prop.binom(n11, n, level) * n,
        normal = ci.prop(n11, n, level) * n,
        wilson = ci.prop.wilson(n11, n, level) * n,
        bootstrap = NULL
      )
      
      ci[ci < 0] <- 0
    }
    
    #####################################################
    else if (measure == "support") {
      method <-
        match.arg(method, choices = c("wilson", "normal", "exact", "bootstrap"))
      
      ci <- switch(
        method,
        exact = ci.prop.binom(n11, n, level),
        normal = ci.prop(n11, n, level),
        wilson = ci.prop.wilson(n11, n, level),
        bootstrap = NULL
      )
      
      ci[ci < 0] <- 0
      ci[ci > 1] <- 1
    }
    
    #####################################################3
    else if (measure == "confidence") {
      method <-
        match.arg(method,
          choices = c(
            "delta",
            "log_delta",
            "wilson",
            "normal",
            "exact",
            "bootstrap"
          ))
      
      ci <- switch(
        method,
        delta = ci.norm(p1 / (p1 + p2), p1 * p2 / (p1 + p2) ^ 3, n, level,
          desc = "Delta method confidence interval for confidence (Doob, 1935)."),
        log_delta = ci.norm(
          p1 / (p1 + p2),
          p2 / (p1 * (p1 + p2)),
          n,
          level,
          log = TRUE,
          desc = "Delta method confidence interval for log of confidence (Doob, 1936)."
        ),
        exact = ci.prop.binom(n11, n1x, level),
        normal = ci.prop(n11, n1x, level),
        wilson = ci.prop.wilson(n11, n1x, level),
        bootstrap = NULL
      )
      
      ci[ci < 0] <- 0
      ci[ci > 1] <- 1
    }
    
    #####################################################3
    else if (measure == "lift") {
      method <-
        match.arg(method, choices = c("delta", "log_delta", "bootstrap"))
      
      ci <- switch(
        method,
        delta = ci.norm(p1 / ((p1 + p2) * (p1 + p3)),
          p1 * (p1 ^ 2 * (1 - (
            p1 + p2 + p3
          )) + p2 * p3) / ((p1 + p2) ^ 3 * (p1 + p3) ^ 3),
          n, level, desc = "Delta method confidence interval for lift (Doob, 1935)."),
        
        log_delta = ci.norm(
          p1 / ((p1 + p2) * (p1 + p3)),
          (p1 ^ 2 * (1 - p1 - p2 - p3) + p2 * p3) / (p1 * (p1 + p2) * (p1 + p3)),
          n,
          level,
          log = TRUE,
          desc = "Delta method confidence interval for log of lift (Doob, 1935)."
        ),
        bootstrap = NULL
      )
      
      ci[ci < 0] <- 0
    }
    
    #####################################################3
    else if (measure == "oddsRatio") {
      method <-
        match.arg(method, choices = c("woolf", "gart", "exact", "bootstrap"))
      
      ci <- switch(
        method,
        woolf = {
          # smoothCounts = 0 for Woolf interval
          n00 <- counts[, "n00"]
          n01 <- counts[, "n01"]
          n10 <- counts[, "n10"]
          n11 <- counts[, "n11"]
          
          ci.norm(
            n11 * n00 / (n01 * n10),
            n * (1 / n00 + 1 / n01 + 1 / n10 + 1 / n11),
            n,
            level,
            log = TRUE,
            desc = "Woolf method confidence interval for log of the odds ratio (Woolf, 1955). Delta method without count smoothing."
          )
        },
        
        gart = {
          # smoothCounts = 0.5 for Haldane-Anscombe-Gart interval
          n00 <- counts[, "n00"] + .5
          n01 <- counts[, "n01"] + .5
          n10 <- counts[, "n10"] + .5
          n11 <- counts[, "n11"] + .5
          
          ci.norm(
            n11 * n00 / (n01 * n10),
            n * (1 / n00 + 1 / n01 + 1 / n10 + 1 / n11),
            n,
            level,
            log = TRUE,
            desc = "Gart method confidence interval for log of the odds ratio (Gart and Thomas, 1972). Delta method with count smoothing of .5."
          )
        },
        
        exact = ci.or.exact(n11, n01, n10, n00, level),
        bootstrap = NULL
      )
      
      ci[ci < 0] <- 0
    }
    
    #####################################################3
    else if (measure == "phi") {
      desc <- "Delta method confidence interval for the phi coefficient."
      method <- match.arg(method, choices = c("delta", "bootstrap"))
      
      phi <- (p11 * p00 - p10 * p01) / sqrt(p1x * p0x * px1 * px0)
      v1 <- 1 - phi ^ 2
      v2 <- phi + .5 * phi ^ 3
      v3 <- (p0x - p1x) * (px0 - px1) / sqrt(p0x * p1x * px0 * px1)
      v4 <-
        (.75 * phi ^ 2) * ((p0x - p1x) ^ 2 / (p0x * p1x) + (px0 - px1) ^ 2 / (px0 * px1))
      
      ci <- switch(
        method,
        delta = ci.norm(phi, (v1 + v2 * v3 + v4), n, level,
          desc = "Delta method confidence interval for the phi coefficient."),
        bootstrap = NULL
      )
      
      ci[ci < -1] <- -1
      ci[ci > 1] <- 1
    }
    
    ci
  }