R/DSC_BIRCH.R

Defines functions DSC_BIRCH

Documented in DSC_BIRCH

#######################################################################
# stream -  Infrastructure for Data Stream Mining
# Copyright (C) 2018 Dennis Assenmacher, Matthias Carnein and Michael Hahsler
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.


#' Balanced Iterative Reducing Clustering using Hierarchies
#'
#' Micro Clusterer.
#' BIRCH builds a balanced tree of Clustering Features (CFs) to summarize the
#' stream.
#'
#' A CF in the calanced tree is a tuple (n, LS, SS) which represents a cluster
#' by storing the number of elements (n), their linear sum (LS) and their
#' squared sum (SS). Each new observation descends the tree by following its
#' closest CF until a leaf node is reached. It is either merged into its
#' closest leaf-CF or inserted as a new one. All leaf-CFs form the
#' micro-clusters. Rebuilding the tree is realized by inserting all leaf-CF
#' nodes into a new tree structure with an increased threshold.
#'
#' @aliases DSC_BIRCH BIRCH birch
#' @family DSC_Micro
#'
#' @param formula `NULL` to use all features in the stream or a model [formula] of the form `~ X1 + X2`
#'   to specify the features used for clustering. Only `.`, `+` and `-` are currently
#'   supported in the formula.
#' @param threshold threshold used to check whether a new data point can be
#'   absorbed or not.
#' @param branching branching factor (maximum amount of child nodes for a
#'   non-leaf node) of the CF-Tree.
#' @param maxLeaf maximum number of entries within a leaf node
#' @param outlierThreshold threshold for identifying outliers when rebuilding
#'   the CF-Tree.
#' @param maxMem memory limitation for the whole CFTree in bytes. Default is 0,
#'   indicating no memory restriction.
#' @author Dennis Assenmacher (\email{Dennis.Assenmacher@@uni-muenster.de}),
#' Matthias Carnein (\email{Matthias.Carnein@@uni-muenster.de})
#' @references
#' Zhang T, Ramakrishnan R and Livny M (1996), "BIRCH: An Efficient
#' Data Clustering Method for Very Large Databases", _In Proceedings of
#' the 1996 ACM SIGMOD International Conference on Management of Data._
#' Montreal, Quebec, Canada , pp. 103-114. ACM.
#'
#' Zhang T, Ramakrishnan R and Livny M (1997), "BIRCH: A new data clustering
#' algorithm and its applications", _Data Mining and Knowledge Discovery._
#' Vol. 1(2), pp. 141-182.
#' @examples
#' stream <- DSD_Gaussians(k = 3, d = 2)
#'
#' BIRCH <- DSC_BIRCH(threshold = .1, branching = 8, maxLeaf = 20)
#' update(BIRCH, stream, n = 500)
#' BIRCH
#'
#' plot(BIRCH, stream)
#' @export
DSC_BIRCH <-
  function(formula = NULL,
    threshold,
    branching,
    maxLeaf,
    maxMem = 0,
    outlierThreshold = 0.25) {
    structure(
      list(
        description = "BIRCH - Balanced Iterative Reducing Clustering using Hierarchies",
        formula = formula,
        RObj = birch$new(
          threshold = threshold,
          branching = branching,
          maxLeaf = maxLeaf,
          maxMem = maxMem,
          outlierThreshold = outlierThreshold
        )
      ),
      class = c("DSC_BIRCH", "DSC_Micro", "DSC_R", "DSC")
    )
  }

birch <- setRefClass("BIRCH", fields = list(C = "ANY",
  colnames = "ANY"))

#  CF-Tree creation: Initializes an empty CF-Tree.
# param threshold Specifies the threshold used to check whether a new datapoint can be absorbed or not.
# param branching Specifies the branching factor (maximum amount of child nodes for a nonleaf node) of the CF-Tree.
# param maxLeaf Specifies the maximum number of entries within a leaf node.
# param maxMemory Specifies the memory limitation for the whole CFTree in bytes. Default is 0, indicating no memory restriction.
# param outlierThreshold Specifies the threshold for identifying outliers when rebuilding the CF-Tree.
birch$methods(
  initialize = function(threshold,
    branching,
    maxLeaf,
    maxMem,
    outlierThreshold) {
    C <<-
      new(BIRCH,
        threshold,
        branching,
        maxLeaf,
        maxMem,
        outlierThreshold) ## Exposed C class

    colnames <<- NULL
    .self
  }
)


birch$methods(
  cache = function() {
    stop("SaveDSC not implemented for DSC_BIRCH!")
  }
)

# CF-Tree insertion: All data objects of a matrix are row wise inserted into the CF-Tree
birch$methods(
  cluster = function(newdata) {
    .self$C$udpateTree(data.matrix(newdata))
  }
)

# Deletes the whole tree structure.
birch$methods(
  deleteTree = function() {
    .self$C$deleteTree()
    .self$tree <- NULL
  }
)

# This function returns all micro clusters of a given CF-Tree.
birch$methods(
  get_microclusters = function(x, ...) {
    centers <- .self$C$getCentroids()
    colnames(centers) <- colnames
    centers
  }
)

# This function returns all micro cluster weights of a given CF-Tree.
birch$methods(
  get_microweights = function(x, ...) {
    .self$C$getWeights()
  }
)

Try the stream package in your browser

Any scripts or data that you put into this service are public.

stream documentation built on March 7, 2023, 6:09 p.m.