R/ruleInduction.R
In arules: Mining Association Rules and Frequent Itemsets

#######################################################################
# arules - Mining Association Rules and Frequent Itemsets
# Copyright (C) 2011-2015 Michael Hahsler, Christian Buchta,
# 			Bettina Gruen and Kurt Hornik
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.


#' Association Rule Induction from Itemsets
#'
#' Provides the generic function `ruleInduction()` and the method to induce all association
#' [rules] which can be generated by the given set of [itemsets] from a [transactions]
#' dataset.
#'
#' All rules that can be created using the supplied itemsets and that surpass the
#' specified minimum confidence threshold are returned.
#' `ruleInduction()` can be used to produce
#' closed association rules defined by Pei
#' et al. (2000) as rules `X => Y` where both `X` and `Y` are
#' closed frequent itemsets. See the code example in the Example section.
#'
#' Rule induction implements several induction methods. The default method is `"ptree"`
#'
#' - `"ptree"` **method without transactions:**
#'   No transactions are need to be specified if
#'   `x` contains a complete set of frequent or
#'   itemsets. The itemsets' support counts are stored in a ptree and then retrieved to
#'   create rules and calculate rules confidence. This is very fast,
#'   but fails because of missing
#'   support values if `x` is not a complete set of frequent itemsets.
#'
#' - `"ptree"` **method with transactions:**
#'   If transactions are specified then all transactions are counted into a prefix
#'   tree and
#'   later retrieved to create rules from the itemsets and calculate confidence values.
#'   This is slower, but necessary if `x` is not a complete set of frequent itemsets.
#'   To improve speed, unused items are removed from the transaction
#'   data before creating the prefix tree (this behavior can be changed using the
#'   argument `reduce`). This might be slower for large transaction
#'   data sets. However, this is highly recommended as the items are also
#'   reordered to reduce the counting time.
#'
#' - `"apriori"` **method (always needs transactions):**
#'   All association rules are mined from the transactions data set using [apriori()]
#'   with the
#'   smallest support found in the itemsets. In a second step, all rules which cannot
#'   be generated from one of the itemsets are removed. This procedure is very slow,
#'   especially for itemsets with many elements or very low support.
#'
#' @family mining algorithms
#' @family postporocessing
#'
#' @param x the set of [itemsets] from which rules will be induced.
#' @param transactions the [transactions] used to mine the itemsets. Can
#' be omitted for method `"ptree"`,
#' if `x` contains a (complete set) of frequent itemsets
#' together with their support counts.
#' @param confidence a numeric value between 0 and 1 giving the minimum confidence
#' threshold for the rules.
#' @param method `"ptree"` or `"apriori"`
#' @param reduce remove unused items to speed up the counting process?
#' @param verbose report progress?
#' @param ... further arguments.
#' @return An object of class [rules].
#' @author Christian Buchta and Michael Hahsler
#' @references Michael Hahsler, Christian Buchta, and Kurt Hornik. Selective
#' association rule generation. _Computational Statistics,_ 23(2):303-315,
#' April 2008.
#'
#' Jian Pei, Jiawei Han, Runying Mao. CLOSET: An Efficient Algorithm for Mining
#' Frequent Closed Itemsets. _ACM SIGMOD Workshop on Research Issues in Data
#' Mining and Knowledge Discovery (DMKD 2000)._
#' @keywords models
#' @examples
#' data("Adult")
#'
#' ## find all closed frequent itemsets
#' closed_is <- apriori(Adult, target = "closed frequent itemsets", support = 0.4)
#' closed_is
#'
#' ## use rule induction to produce all closed association rules
#' closed_rules <- ruleInduction(closed_is, transactions = Adult, verbose = TRUE)
#'
#' ## inspect the resulting closed rules
#' summary(closed_rules)
#' inspect(head(closed_rules, by = "lift"))
#'
#' ## get rules from frequent itemsets. Here, transactions does not need to be
#' ## specified for rule induction.
#' frequent_is <- eclat(Adult, support = 0.4)
#' assoc_rules <- ruleInduction(frequent_is)
#' assoc_rules
#' inspect(head(assoc_rules))
#'
#' ## for itemsets that are not a complete set of frequent itemsets,
#' ## transactions need to be specified.
#' some_is <- sample(frequent_is, 10)
#' some_rules <- ruleInduction(some_is, transactions = Adult)
#' some_rules
setGeneric(
  "ruleInduction",
  function(x, ...) {
    standardGeneric("ruleInduction")
  }
)

#' @rdname ruleInduction
setMethod(
  "ruleInduction", signature(x = "itemsets"),
  function(
      x,
      transactions = NULL,
      confidence = 0.8,
      method = c("ptree", "apriori"),
      reduce = FALSE,
      verbose = FALSE,
      ...) {
    method <- match.arg(method)

    ## check transaction data
    if (!is.null(transactions)) {
      nItems <- nitems(transactions)
      if (nItems != nitems(items(x))) {
        stop("Dimensions of x and transactions do not match!")
      }
      if (any(itemLabels(x) != itemLabels(transactions))) {
        stop("Item labels for x and transactions do not match!")
      }
    }

    if (verbose) {
      cat("ruleInduction: using method", method, "\n")
    }

    ## find rules
    pt1 <- proc.time()
    rules <-
      if (method == "ptree" && !is.null(transactions)) {
        ruleInduction.ptree(
          x, transactions,
          confidence, reduce, verbose
        )
      } else if (method == "ptree" && is.null(transactions)) {
        ruleInduction.index(x, confidence, verbose)
      } # else if(method == "tidlists") ruleInduction.tidlists(x,
      #    transactions, confidence, verbose)
      else {
        ruleInduction.apriori(
          x, transactions,
          confidence, reduce, verbose
        )
      }

    pt2 <- proc.time()
    if (verbose) {
      cat("searching done [", pt2[1] - pt1[1], "s].\n", sep = "")
    }

    info <- x@info
    if (is.null(info$data)) {
      info <- c(x = match.call()$x, info)
    }
    ## apriori
    if (is.null(info$confidence)) {
      info <- c(info, confidence = confidence)
    } else {
      info$confidence <- confidence
    }
    rules@info <- info

    pt3 <- proc.time()
    if (verbose) {
      cat("postprocessing done [", pt3[1] - pt2[1], "s].\n",
        sep = ""
      )
    }

    ## return found rules
    rules
  }
)


ruleInduction.apriori <-
  function(
      x,
      transactions,
      confidence = 0.8,
      reduce = FALSE,
      verbose = FALSE) {
    if (is.null(transactions)) {
      stop("rule induction method apriori needs transactions!")
    }

    nItems <- nitems(transactions)
    itemInfo <- itemInfo(transactions)

    if (reduce) {
      ifreq <- itemFrequency(items(x), type = "abs")
      items.involved <- which(ifreq > 0)

      if (verbose) {
        cat(
          "reducing data from",
          nitems(items(x)),
          "items to",
          length(items.involved),
          "items\n"
        )
      }

      x@items <- x@items[, items.involved]
      transactions <- transactions[, items.involved]
    }

    empty_rules <- function(trans) {
      em <- as(trans, "itemMatrix")[0]
      new(
        "rules",
        lhs = em,
        rhs = em,
        quality = data.frame(
          support = numeric(0),
          confidence = numeric(0),
          lift = numeric(0)
        )
      )
    }

    if (length(transactions) < 1) {
      return(empty_rules(transactions))
    }

    ## itemset sizes
    isetsSize <- size(x)

    ## find minimal support and mine all rules
    ## Note: minSupport is reduced by epsilon so we get the rules
    ## 	with support == min support in x
    minSupport <-
      min(quality(x)$support) - 1 / length(transactions)

    ### suppress maxlen warnings
    suppressWarnings(rules <-
      apriori(
        transactions,
        parameter = list(
          support = minSupport,
          confidence = confidence,
          target = "rules",
          minlen = min(isetsSize),
          maxlen = max(isetsSize)
        ),
        control = list(verbose = verbose)
      ))

    ## find rules which were generated by the itemsets
    if (verbose) {
      cat(paste("starting to filter", length(rules), "rules.\n"))
    }
    take <- !is.na(match(items(rules), items(x)))
    if (verbose) {
      cat("filtering done.\n")
    }

    rules <- rules[take]
    if (verbose) {
      cat("left with", length(rules), "rules.\n")
    }

    if (reduce) {
      ## expand items back to full space
      ## -1 since indices in ngCMatix start with 0
      items.index <- items.involved - 1L

      ## fix dim
      rules@lhs@data@Dim[1] <- nItems
      rules@rhs@data@Dim[1] <- nItems

      ## fix column indices
      ## +1 since indices in ngCMatix start with 0
      rules@lhs@data@i <- items.index[(rules@lhs@data@i + 1L)]
      rules@rhs@data@i <- items.index[(rules@rhs@data@i + 1L)]

      ## fix item labels
      rules@lhs@itemInfo <- itemInfo
      rules@rhs@itemInfo <- itemInfo
    }

    rules
  }

# FIXME: Currently disabled
ruleInduction.tidlists <-
  function(
      x,
      transactions,
      confidence = 0.8,
      verbose = FALSE) {
    tid <- as(transactions, "tidLists")
    data <- .Call(R_tid_rules, tid@data, x@items@data)
    names(data) <- c(
      "support",
      "confidence",
      "lhs_i",
      "lhs_p",
      "rhs_i",
      "rhs_p",
      "Dim"
    )

    quality <-
      data.frame(
        support = data$support,
        confidence = data$confidence
      )

    lhs <-
      new(
        "ngCMatrix",
        i = data$lhs_i,
        p = data$lhs_p,
        Dim = data$Dim
      )
    rhs <-
      new(
        "ngCMatrix",
        i = data$rhs_i,
        p = data$rhs_p,
        Dim = data$Dim
      )

    lhs <-
      new("itemMatrix",
        data = lhs,
        itemInfo = x@items@itemInfo
      )
    rhs <-
      new("itemMatrix",
        data = rhs,
        itemInfo = x@items@itemInfo
      )

    rules <- new("rules",
      lhs = lhs,
      rhs = rhs,
      quality = quality
    )

    rules <- rules[quality(rules)$confidence > confidence]
    rules
  }

## ptree support counting

ruleInduction.ptree <-
  function(
      x,
      transactions,
      confidence = 0.8,
      reduce = FALSE,
      verbose = FALSE) {
    r <-
      .Call(
        R_pncount,
        x@items@data,
        transactions@data,
        FALSE,
        reduce,
        verbose
      )

    names(r) <-
      c(
        "data.lhs",
        "data.rhs",
        "support",
        "confidence",
        "lift",
        "itemset"
      )

    ## quality: set NAs to 0 since they are the result of items missing
    ## in transactions
    q <- as.data.frame(r[3:6])
    q[is.na(q)] <- 0

    take <- q$confidence >= confidence

    rules(
      lhs = new(
        "itemMatrix",
        data     = r$data.lhs,
        itemInfo = transactions@itemInfo
      )[take, ],
      rhs = new(
        "itemMatrix",
        data     = r$data.rhs,
        itemInfo = transactions@itemInfo
      )[take, ],
      quality = q[take, ]
    )
  }

## ptree indexing

ruleInduction.index <-
  function(
      x,
      confidence = 0.8,
      verbose = FALSE) {
    if (is.null(quality(x)$support)) {
      stop("cannot induce rules because support is missing ! Specify transactions.")
    }

    r <- data.frame(.Call(R_pnrindex, x@items@data, verbose))
    names(r) <- c("i", "li", "ri")

    if (!all(r$li) || !all(r$ri)) {
      stop("cannot induce rules because itemsets are incomplete ! Specify transactions.")
    }

    r$support <- x@quality$support[r$i]
    r$confidence <- r$support /
      x@quality$support[r$li]
    # filter
    r <- r[r$confidence >= confidence, ]
    if (dim(r)[1] == 0) {
      return(new("rules"))
    }
    r$lift <- r$confidence / x@quality$support[r$ri]

    rules(
      lhs = x@items[r$li],
      rhs = x@items[r$ri],
      quality = r[4:6]
    )
  }