R/itemFrequency.R

Defines functions .barplot_horiz .barplot_vert

#######################################################################
# arules - Mining Association Rules and Frequent Itemsets
# Copyright (C) 2011-2015 Michael Hahsler, Christian Buchta,
# 			Bettina Gruen and Kurt Hornik
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

#' Getting Frequency/Support for Single Items
#'
#' Provides the generic function `itemFrequency()` and methods to get the
#' frequency/support for all single items in an objects based on
#' [itemMatrix].  For example, it is used to get the single
#' item support from an object of class [transactions]
#' without mining.
#'
#' @name itemFrequency
#' @family itemMatrix and transactions functions
#'
#' @param x an object of class [itemMatrix] or [tidLists].
#' @param ... further arguments are passed on.
#' @param type a character string specifying if `"relative"`
#' frequency/support or `"absolute"` frequency/support (item counts) is
#' returned. (default: `"relative"`).
#' @param weighted should support be weighted by transactions weights stored as
#' column `"weight"` in transactionInfo?
#' @return `itemFrequency` returns a named numeric vector.  Each element
#' is the frequency/support of the corresponding item in object `x`.  The
#' items appear in the vector in the same order as in the binary matrix in
#' `x`.
#' @author Michael Hahsler
#' @seealso [itemFrequencyPlot()]
#' @keywords models
#' @examples
#' data("Adult")
#' itemFrequency(Adult, type = "relative")
#'
setGeneric(
  "itemFrequency",
  function(x, ...) standardGeneric("itemFrequency")
)

#' @rdname itemFrequency
setMethod(
  "itemFrequency", signature(x = "itemMatrix"),
  function(x, type = c("relative", "absolute"), weighted = FALSE) {
    type <- match.arg(type)

    if (weighted) {
      if (!is(x, "transactions")) {
        stop("weighted itemFrequency only available for transactions")
      }
      if (!("weight" %in% colnames(transactionInfo(x)))) {
        stop("transactions do not contain weights. Add a weight column to transactionInfo.")
      }

      weight <- as.numeric(transactionInfo(x)[["weight"]])
      support <- .Call(R_rowWSums_ngCMatrix, x@data, weight)
      total <- sum(weight)
    } else {
      ## we could also use rowSums
      ## support <- tabulate(x@data@i + 1L, nbins = x@data@Dim[1])

      support <- rowSums(x@data)
      total <- length(x)
    }

    names(support) <- itemLabels(x)

    switch(type,
      relative = support / total,
      absolute = support
    )
  }
)


#' @rdname itemFrequency
setMethod(
  "itemFrequency", signature(x = "tidLists"),
  function(x, type = c("relative", "absolute")) {
    type <- match.arg(type)

    supports <- size(x)
    names(supports) <- seq_len(length(supports))

    switch(type,
      relative = supports / dim(x)[2],
      absolute = supports
    )
  }
)


#' Creating a Item Frequencies/Support Bar Plot
#'
#' Provides the generic function `itemFrequencyPlot()` and the method to
#' create an item frequency bar plot for inspecting the item frequency
#' distribution for objects based on [itemMatrix] (e.g.,
#' [transactions], or items in [itemsets]
#' and [rules]).
#'
#'
#' @aliases itemFrequencyPlot
#' @family itemMatrix and transactions functions
#'
#' @param x the object to be plotted.
#' @param \dots further arguments are passed on (see
#' [graphics::barplot()] from possible arguments).
#' @param type a character string indicating whether item frequencies should be
#' displayed relative of absolute.
#' @param weighted should support be weighted by transactions weights stored as
#' column `"weight"` in transactionInfo?
#' @param support a numeric value. Only display items which have a support of
#' at least `support`. If no population is given, support is calculated
#' from `x` otherwise from the population. Support is interpreted relative
#' or absolute according to the setting of `type`.
#' @param topN a integer value. Only plot the `topN` items with the
#' highest item frequency or lift (if `lift = TRUE`).  The items are
#' plotted ordered by descending support.
#' @param population object of same class as `x`; if `x` is a segment
#' of a population, the population mean frequency for each item can be shown as
#' a line in the plot.
#' @param popCol plotting color for population.
#' @param popLwd line width for population.
#' @param lift a logical indicating whether to plot the lift ratio between
#' instead of frequencies. The lift ratio is gives how many times an item is
#' more frequent in `x` than in `population`.
#' @param horiz a logical. If `horiz = FALSE` (default), the bars are
#' drawn vertically. If `TRUE`, the bars are drawn horizontally.
#' @param names a logical indicating if the names (bar labels) should be
#' displayed?
#' @param cex.names a numeric value for the expansion factor for axis names
#' (bar labels).
#' @param xlab a character string with the label for the x axis (use an empty
#' string to force no label).
#' @param ylab a character string with the label for the y axis (see xlab).
#' @param mai a numerical vector giving the plots margin sizes in inches (see
#' `? par').
#' @return A numeric vector with the midpoints of the drawn bars; useful for
#' adding to the graph.
#' @author Michael Hahsler
#' @seealso [itemFrequency()]
#' @keywords hplot
#' @examples
#' data(Adult)
#'
#' ## the following example compares the item frequencies
#' ## of people with a large income (boxes) with the average in the data set
#' Adult.largeIncome <- Adult[Adult %in% "income=large"]
#'
#' ## simple plot
#' itemFrequencyPlot(Adult.largeIncome)
#'
#' ## plot with the averages of the population plotted as a line
#' ## (for first 72 variables/items)
#' itemFrequencyPlot(Adult.largeIncome[, 1:72],
#'   population = Adult[, 1:72]
#' )
#'
#' ## plot lift ratio (frequency in x / frequency in population)
#' ## for items with a support of 20% in the population
#' itemFrequencyPlot(Adult.largeIncome,
#'   population = Adult, support = 0.2,
#'   lift = TRUE, horiz = TRUE
#' )
#'
setGeneric(
  "itemFrequencyPlot",
  function(x, ...) standardGeneric("itemFrequencyPlot")
)

#' @rdname itemFrequencyPlot
setMethod(
  "itemFrequencyPlot", signature(x = "itemMatrix"),
  function(
      x, type = c("relative", "absolute"), weighted = FALSE,
      support = NULL, topN = NULL, population = NULL,
      popCol = "black", popLwd = 1, lift = FALSE, horiz = FALSE,
      names = TRUE, cex.names = graphics::par("cex.axis"),
      xlab = NULL, ylab = NULL, mai = NULL, ...) {
    type <- match.arg(type)

    ## force relative for lift
    if (lift == TRUE) type <- "relative"

    ## plot only items with support
    if (!is.null(support)) {
      if (!is.null(population)) {
        frequentItems <- itemFrequency(population, type, weighted = weighted) >= support
        population <- population[, frequentItems]
      } else {
        frequentItems <- itemFrequency(x, type, weighted = weighted) >= support
      }
      x <- x[, frequentItems]
    }

    ## get frequencies
    itemFrequency <- itemFrequency(x, type, weighted = weighted)
    if (!is.null(population)) {
      population.itemFrequency <- itemFrequency(population, type, weighted = weighted)
    }

    ## regular plot
    if (lift == FALSE) {
      label <- paste("item frequency (", type, ")", sep = "")
      offset <- 0
    } else {
      ## show lift instead of frequencies
      if (is.null(population)) {
        stop("population needed for plotting lift!")
      }

      ## -1 and offset are used to draw bars smaller than one
      ## upside down
      itemFrequency <- (itemFrequency / population.itemFrequency) - 1
      offset <- 1


      ## take care of div by zero
      itemFrequency[is.infinite(itemFrequency)] <- NaN

      label <- "lift ratio"
    }

    ## plot only top n items (either itemFrequency or lift)
    if (!is.null(topN)) {
      take <- order(itemFrequency, decreasing = TRUE)[1:topN]
      itemFrequency <- itemFrequency[take]
      if (!is.null(population)) {
        population.itemFrequency <- population.itemFrequency[take]
      }
    }


    ## supress names
    if (names == FALSE) names(itemFrequency) <- NA

    if (horiz == FALSE) {
      midpoints <- .barplot_vert(itemFrequency, ...,
        offset = offset,
        cex.names = cex.names, xlab = xlab,
        ylab = if (is.null(ylab)) label else ylab,
        mai = mai
      )
    } else {
      midpoints <- .barplot_horiz(itemFrequency, ...,
        offset = offset,
        cex.names = cex.names, xlab = if (is.null(xlab)) label else xlab,
        ylab = ylab, mai = mai
      )
    }



    ## add population means (we switch off clipping first!)
    if (!is.null(population) && lift == FALSE) {
      if (horiz == FALSE) {
        lines(midpoints, population.itemFrequency,
          lwd = popLwd, col = popCol, xpd = TRUE
        )
      } else {
        lines(population.itemFrequency, midpoints,
          lwd = popLwd, col = popCol, xpd = TRUE
        )
      }
    }

    ## return mitpoints
    invisible(midpoints)
  }
)


## helper functions for barplot
.barplot_vert <- function(
    height, ...,
    cex.names = graphics::par("cex.axis"),
    xlab = NULL, ylab = NULL, mai = NULL) {
  labels <- names(height)

  ## for neg. heights we use straight labels
  if (min(height, na.rm = TRUE) < 0) {
    straight <- TRUE
  } else {
    straight <- FALSE
  }

  op.mai <- graphics::par("mai")
  if (is.null(mai)) {
    mai <- op.mai
    if (straight == TRUE) {
      mai[1] <- max(graphics::strwidth(labels,
        units = "inches",
        cex = cex.names
      )) + min(graphics::par("fin")[2] * 0.1, 0.5)
    } else {
      mai[1] <- max(graphics::strwidth(labels,
        units = "inches",
        cex = cex.names
      )) / 2^.5 + min(graphics::par("fin")[2] * 0.1, 0.5)
    }
  }

  graphics::par(mai = mai)
  on.exit(graphics::par(mai = op.mai))

  ## Create plot with no x axis and no x axis label

  if (straight == TRUE) {
    bp <- graphics::barplot(height, ...,
      las = 2, cex.names = cex.names,
      xlab = xlab, ylab = ylab
    )
  } else {
    bp <- graphics::barplot(height, ..., xaxt = "n", xlab = "", ylab = ylab)

    ## move down from the lower end of the plot by 1/20 of the plotting
    ## area for labels
    graphics::text(bp, graphics::par("usr")[3] - (graphics::par("usr")[4] - graphics::par("usr")[3]) / 20,
      srt = 45, adj = 1,
      labels = labels, xpd = TRUE, cex = cex.names
    )

    ## Plot x axis label
    graphics::mtext(1, text = xlab, line = graphics::par("mar")[1] - 1)
  }
  invisible(bp)
}

.barplot_horiz <- function(
    height, ...,
    cex.names = graphics::par("cex.axis"), xlab = NULL, ylab = NULL, mai = NULL) {
  ## make enough space for item labels
  op.mai <- graphics::par("mai")
  if (is.null(mai)) {
    mai <- op.mai
    mai[2] <- max(graphics::strwidth(names(height),
      units = "inches",
      cex = cex.names
    )) + min(graphics::par("fin")[1] * 0.1, 0.5)
  }
  graphics::par(mai = mai)
  on.exit(graphics::par(mai = op.mai))

  midpoints <- graphics::barplot(height,
    las = 2, cex.name = cex.names, horiz = TRUE,
    xlab = xlab, ylab = ylab, ...
  )

  invisible(midpoints)
}

Try the arules package in your browser

Any scripts or data that you put into this service are public.

arules documentation built on June 8, 2025, 12:10 p.m.