#-------------------------------------------------------------------------------
#
# Package binr
#
# Implementation of greedy algorithm.
#
# Sergei Izrailev, 2011-2014
#-------------------------------------------------------------------------------
# Copyright 2011-2014 Collective, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#' \code{bins.greedy} - Wrapper around \code{bins.greedy.impl}. Goes over the
#' sorted values of \code{x} left to right and fills the bins with the values until
#' they are about the right size.
#' @param x Vector of numbers.
#' @param nbins Target number of bins.
#' @param minpts Minimum number of points in a bin. Only used if \code{naive = FALSE}.
#' @param naive When \code{TRUE}, simply calls \code{bins.greedy.impl} with data
#' derived from \code{x}. Otherwise, makes an extra step of marking the values that
#' by themselves take a whole bin to force the algorithm to place these values
#' in a bin separately.
#' @name bins.greedy
#' @title Greedy binning algorithm.
#' @seealso \code{\link{binr}}, \code{\link{bins}}, \code{\link{bins.quantiles}} \code{\link{bins.optimize}}
#' @export bins.greedy
#' @rdname bins.greedy
#' @usage bins.greedy(x, nbins, minpts = floor(0.5 * length(x)/nbins), thresh = 0.8, naive = FALSE)
bins.greedy <- function(x, nbins, minpts = floor(0.5 * length(x)/nbins), thresh = 0.8, naive = FALSE)
{
xtbl <- table(x)
xval <- sort(unique(x))
nvals <- length(xval)
xstp <- vector(nvals, mode="logical")
binsz <- floor(length(x) / nbins)
if (naive) return( bins.greedy.impl(xval, xtbl, xstp, binsz, nbins, thresh) )
ixdone <- which(xtbl >= binsz)
ixkeep <- which(xtbl < binsz)
nbins2 <- nbins - length(ixdone)
nx2 <- sum(xtbl) - sum(xtbl[ixdone]) # how many points are left
nbins.new <- min(nbins2, floor(nx2 / minpts) + 1) # make fewer bins if there's not enough points
binsz.new <- floor(nx2 / nbins.new)
istp <- ixdone + 1
istp <- istp[istp < nvals]
xstp.new <- xstp
xstp.new[istp] <- TRUE
#cuts.new <- bins.greedy.impl(xval[ixkeep], xtbl[ixkeep], xstp.new[ixkeep], binsz.new, nbins.new, thresh)
cuts.new <- bins.greedy.impl(xval, xtbl, xstp, binsz.new, nbins.new, thresh)
return(cuts.new)
}
#-------------------------------------------------------------------------------
#' \code{bins.greedy.impl} - Implementation of a single-pass binning algorithm that examines sorted data left to right
#' and builds bins of the target size. The \code{bins.greedy} wrapper around this function provides a less involved interface.
#' This is not symmetric wrt direction: symmetric distributions may not have symmetric bins if there are multiple points
#' with the same values. If a single value accounts for more than thresh * binsz points, it will be placed in
#' a new bin.
#' @param xval Sorted unique values of the data set x. This should be the numeric version of \code{names(xtbl)}.
#' @param xtbl Result of a call to \code{table(x)}.
#' @param xstp Stopping points; if \code{xstp[i] == TRUE}, the \code{i}-th value can't be merged to the \code{(i-1)}-th one.
#' \code{xstp[1]} value is ignored.
#' @param binsz Target bin size, i.e., the number of points falling into each bin; for example, \code{floor(length(x) / nbins)}
#' @param thresh Threshold fraction of bin size for the greedy algorithm.
#' Suppose there's \code{n < binsz} points in the current bin already.
#' Also suppose that the next value V is represented by \code{m} points, and \code{m + n > binsz}.
#' Then the algorithm will check if \code{m > thresh * binsz}, and if so, will place the value V into a new bin.
#' If \code{m} is below the threshold, the points having value V are added to the current bin.
#' @param verbose When \code{TRUE}, prints the number of points falling into the bins.
#' @return A list with the following items:
#' \itemize{
#' \item{binlo}{ - The "low" value falling into the bin.}
#' \item{binhi}{ - The "high" value falling into the bin.}
#' \item{binct}{ - The number of points falling into the bin.}
#' \item{xtbl}{ - The result of a call to \code{table(x)}.}
#' \item{xval}{ - The sorted unique values of the data points x. Essentially, a numeric version of \code{names(xtbl)}.}
#' }
#' @export bins.greedy.impl
#' @rdname bins.greedy
#' @usage bins.greedy.impl(xval, xtbl, xstp, binsz, nbins, thresh, verbose = F)
bins.greedy.impl <- function(xval, xtbl, xstp, binsz, nbins, thresh, verbose = F)
{
nvals <- length(xval)
if (nvals != length(xtbl) || nvals != length(xstp)) stop("bins.greedy: xval, xtbl and xstp lengths don't match")
xstp[1] = FALSE
binlo <- vector(nbins, mode="integer")
binhi <- vector(nbins, mode="integer")
binct <- vector(nbins, mode="integer")
startbin <- TRUE
k <- 1 # current bin index
s <- 0 # running number of points in the current bin
for (i in 1:nvals)
{
if (xstp[i]) # can't happen on i == 1
{
# wrap up the previous bin
binhi[k] <- xval[i - 1]
binct[k] <- s
s <- 0
k <- k + 1
startbin <- TRUE
}
s <- s + xtbl[i]
if (startbin)
{
binlo[k] <- xval[i]
startbin <- FALSE
}
if (s >= binsz)
{
# we exceeded the bin size. If there were points in this bin before, check if
# the added number of points exceeds the tolerance threshold.
if (s > xtbl[i] && (xtbl[i] >= thresh * binsz))# || (s - xtbl[i] > thresh * binsz)))
{
# not the first item; complete the previous bin and make this a starting bin
binhi[k] <- xval[i - 1]
binct[k] <- s - xtbl[i]
k <- k + 1
s <- xtbl[i]
binlo[k] <- xval[i]
startbin <- FALSE
}
else
{
# done for now
binhi[k] <- xval[i]
binct[k] <- s
k <- k + 1
s <- 0
startbin <- TRUE
}
}
}
if (!startbin)
{
# unfinished bin
binhi[k] <- xval[nvals]
binct[k] <- s
}
#if (k > nbins) k <- nbins # we are NOT guaranteed that everything fits into at most nbins bins
# Here's an example when things go bad:
# x <- rep(1:10, 10); bins.greedy(x, nbins=9, thresh=0.8)$binct
# In this case, every value exceeds the threshold and is placed in a separate bin, hence we get
# more bins than what we asked for.
# Reduce k until a valid bin.
while (is.na(binct[k])) k <- k - 1
binlo <- binlo[1:k]
binhi <- binhi[1:k]
binct <- binct[1:k]
names(binct) <- paste("[", binlo, ", ", binhi, "]", sep="")
if (verbose) print(binct)
return(list(binlo=binlo, binhi=binhi, binct=binct, xtbl=xtbl, xval=xval))
}
#-------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.