Nothing
bixplot = function(...,
names = NA,
add = FALSE,
at = NULL,
horizontal = FALSE,
col = "gray60", bodyCol = NULL,
bodyOpaque = 0.5,
bodyW = 0.80,
bodysize = "area_from_count",
modeCol = c("cadetblue3", "hotpink2",
"lawngreen", "orange","cyan3",
"coral2","gray60", "darkgoldenrod2",
"slateblue2", "brown2", "khaki3"),
curveCol = "black",
curveLwd = 1,
border = "black", boxCol = NULL,
boxOpaque = 0.5,
boxW = 0.32,
boxLwd = 2,
makewhiskers = TRUE,
innerwhiskers = TRUE,
rugCol = "black",
rugoutCol = par("fg"), # "black",
rugNumeric = NULL,
rugNumericColors = c("blue", "green"),
colorbarW = 0.12,
rugFactor = NULL,
rugFactorColors = c("red", "blue", "forestgreen"),
rugLwd = par("lwd"),
rugoutLwd = 2*rugLwd,
rugOpaque = 0.4,
jittering = TRUE,
jitteramount = NULL,
rugW = 0.12,
stickCol = "black",
stickLwd = 2,
boxwex = 1,
width = NULL, # leftover from boxplot
side = "no",
tick = TRUE,
diplevel = 0.01,
minN = 15,
kmax = NULL,
bigN = 500,
clusMinN = 3,
bw = "SJ-dpi",
kernel = "gaussian",
cut = 3,
cutmin = -Inf,
cutmax = Inf,
xlim = NULL,
ylim = NULL,
cex.axis = 1,
main = "bixplot",
cex.main = 1, line.main = 1,
cex.colorbar = 1,
xlab = "",
ylab = "",
cex.lab = 1, line.lab = 2.2,
xaxs = "r", yaxs = "r", las = 1,
ann = !add,
plot = TRUE,
log = NULL)
{ # bixplot for data input and formula input
#
# Arguments
#
# ... : a vector of numeric values to plot,
# or a dataframe with variables to plot, or a
# matrix with columns to plot, or a single list of
# vectors that may be of different lengths. The
# data may contain NAs, which will be left out.
# One can also provide a formula, such as y ~ grp,
# where y is a numeric vector of data values to be
# split into groups according to the grouping
# variable grp (usually a factor). Note that
# ~ g1 + g2 is equivalent to ~ g1:g2. If a formula
# is given, it may be necessary to also specify
# an argument data = with the data.frame containing
# the variables in the formula.
# names : group labels that will be printed next to
# each variable (= group). [Argument from
# graphics::boxplot.] If NA, the default, these are
# extracted from the provided input data, and if
# the data does not contain names the labels
# 1, 2, 3,... are used.
# add : logical, if true then the bixplot is added to
# the current plot. Defaults to FALSE. [Argument
# from boxplot.]
# at : numeric vector giving the locations where the
# bixplots should be drawn, particularly when
# add = TRUE. [Argument from boxplot.]
# Defaults to 1:p where p is the number of variables.
# horizontal : logical indicating if the bixplots should
# be horizontal. [Argument from boxplot.]
# The default FALSE means vertical boxes.
# bodyW : the width of the body, which is the region
# inside the estimated density.
# col : contains the color(s) of the bodies for the p
# variables. col can be a single color or a
# vector, and will be recycled as needed. [Argument
# from boxplot]. It is only applied to those
# variables the method deems unimodal. When col is
# NA, the body is not drawn.
# bodyCol : if not NULL, it overrides col.
# bodyOpaque: opacity of the body.
# bodyW : the width of the body. When zero, the body
# is not drawn. When two or more modes are shown
# for the same variable, it is the width of the
# widest body.
# bodysize: either "area_is_constant", "area_from_count",
# or "width_is_constant".
# Determines the relative sizes of bodies of modes
# in the same variable.
# When "area_is_constant", the area of each density
# is the same.
# When "area_from_count", the default, the area of
# each density is proportional to the number of
# members in its cluster (mode).
# When "width_is_constant", all bodies have the same
# width.
# In all three settings, the width of the widest
# body is bodyW[j], possibly modified by argument
# width if it is not null.
# The width of the box changes proportionally
# with that of the body.
# modeCol : contains the color(s) of the bodies of the
# modes (clusters) of the variables deemed not
# unimodal. Can be a single color or a vector,
# and will be recycled as needed. If NULL, all
# modes of a variable will receive the same color
# as provided in col for that variable.
# curveCol : contains the color(s) of the density
# curves of each variable, that is, the boundary
# of the body. Can be a single color or a vector,
# and will be recycled as needed. If NA, no
# curve boundary is drawn. Defaults to "black".
# curveLwd : a single number gving the line width ("lwd")
# of the density curves.
# border : contains the color(s) of the boxes and
# whiskers [argument from boxplot].
# It can be a single color or a vector, and will
# be recycled as needed. A variable deemed
# unimodal will have a single box, otherwise
# each mode has its own box. When border is NA,
# the box is not drawn.
# boxCol : if not NULL, it overrides border.
# boxOpaque : opacity of the box.
# boxW : the width of the box. When zero, the box is
# not drawn.
# boxLwd : a single number giving the line width ("lwd")
# of the box and whiskers.
# makewhiskers : logical specifying whether whiskers
# should be drawn.
# innerwhiskers : logical. If TRUE and there is more than
# one mode, also the whiskers _between_ the modes
# are drawn. If FALSE, the default, no whiskers
# between modes are not drawn, because they do not
# have their usual meaning due to possible
# overlapping of densities.
# rugCol : contains the color(s) of the rug plot of
# the variables. The rug consists of small lines
# perpendicular to the variable axis, one for
# each data value. It can be a single color or a
# vector, and will be recycled as needed. If NA,
# the rug is not plotted.
# rugoutCol : if not NULL or NA, and the body of a
# variable or its modes are drawn, this specifies
# the color(s) of the part of the ruglines outside
# of the body. If NULL or NA, the same color rugCol
# is used as inside the body.
# rugNumeric : external numeric variable for coloring the rug.
# rugNumericColors : two or three colors to which
# colorRampPalette can be applied, to obtain a smooth
# palette of colors for rugNumeric.
# colorbarW : the width of the color bar with values of
# rugNumeric, relative to the width of the main plot.
# Defaults to 0.1 . Only has an effect if rugNumeric
# is specified.
# rugFactor : external factor variable for coloring the rug.
# rugFactorColors : colors of the factor levels of
# rugFactor.
# rugLwd : a single number specifying the width of the
# lines in the rug plot. Defaults to par("lwd").
# rugoutLwd : a single number specifying the width of
# the part of the ruglines outside of the body.
# Defaults to 2*rugLwd to make isolated points
# stand out visually.
# rugOpaque: opacity of the rug.
# jittering : logical indicating whether the values at
# which the rug lines are drawn are to be jittered
# by base::jitter, to make tied values more visible.
# Defaults to TRUE.
# jitteramount : the amount of jittering (see ?jitter).
# rugW : the width of the rug. When zero, the rug is
# not drawn.
# stickCol : when side = "both" this contains the color(s)
# of the `stick', which is a typically thin line
# separating the bodies of two variables plotted
# side by side. It goes from the smallest to the
# largest value of these variables. When the
# density is computed, it contains the range of
# the density as well. stickCol can be
# a single color or a vector, and will be recycled
# as needed. If NA, the default, the stick is not
# plotted.
# stickLwd : a single number giving the line width ("lwd")
# of the stick. When zero, the stick is not drawn.
# boxwex : a scale factor to be applied to the width
# of the body, box, and rug of all bixplots
# [Argument from boxplot].
# width : a vector giving the relative widths of the p
# bixplots, one for each variable. [Argument from
# boxplot]. The entries should be strictly
# positive numbers, that will be divided by the
# largest of them. The resulting ratios multiply
# the widths of the body, box, and rug.
# side : the side of the variable axis where the body,
# box and rug will be plot. [Argument from
# beanplot::beanplot]. The default is "no",
# meaning that each bixplot is symmetric about
# its axis. Options "first" and "second" plot
# all "half" bixplots on that given side. Option
# "both" plots variables on alternate sides of
# each axis, so fewer variable axes are needed.
# tick : boolean indicating whether to draw tick marks
# on the plot axis with the variable (group)
# names. Only has an effect when add = FALSE.
# diplevel : the level of Hartigan's dip test for
# unimodality of a variable. Only when this test
# rejects unimodality, that is, its p-value is
# at most diplevel, do we search for clusters.
# Defaults to 0.01. Setting it higher may lead to
# more clusters, and setting it lower to fewer
# clusters.
# minN : the minimum number of observations required per
# potential cluster. Defaults to 15. The maximum
# number of clusters searched is limited to n/minN,
# where n is the number of non-missing values.
# If n < 2*minN, clustering is not attempted to
# avoid spurious clusters from small samples.
# kmax : the maximal number of clusters. It will be
# truncated to 5 internally. If NULL, it is set
# to min(floor(n/minN), 5). When setting kmax=1
# all variables are considered as single clusters,
# making the display resemble a violin plot.
# bigN : when a variable has over bigN non-NA values,
# we sample bigN values from it without
# replacement, to save computation time.
# Defaults to 500. If a number below 300 is
# given, it is set to 300 internally.
# clusMinN : the minimum number of unique values in a
# cluster. The clustering is constrained so that
# any cluster must contain at least clusMinN
# unique values. Defaults to 3.
# bw : the bandwidth for use by stats::density which
# constructs the body of a variable (group). It
# can be a number or a formula, and defaults to
# "SJ-dpi" (see ?density).
# kernel : the kernel for use by stats::density.
# Defaults to "gaussian".
# cut : denoting a variable by y, the density is only
# constructed from min(y) - cut*bw to
# max(y) + cut*bw where bw is the numeric
# bandwidth. Defaults to 3. [Argument from
# beanplot.]
# cutmin : if specified, the densities of all variables
# and modes cannot go below the value cutmin.
# Defaults to -Inf which has no effect. [Argument
# from beanplot.]
# cutmax : if specified, the densities of all variables
# and modes cannot go above the value cutmax.
# Defaults to Inf which has no effect. [Argument
# from beanplot.]
#
# Graphical arguments, only when add = FALSE:
# xlim : vector with limits of the axis with the groups
# (variables), whether horizontal is TRUE or FALSE.
# This is a convention from graphics::boxplot.
# ylim : vector with limits of the axis with the numeric
# values taken, whether horizontal is TRUE or FALSE.
# This is a convention from graphics::boxplot.
# cex.axis, main, cex.main, line.main, xlab, ylab,
# cex.lab, line.lab, xaxs, yaxs, las : work as usual.
# ann : logical indicating if the plot should be
# annotated by xlab, ylab, and main title.
# [Argument from boxplot]. Defaults to !add.
# plot : if TRUE (the default) then the numerical
# summary on which the plot is based is
# returned invisibly. If FALSE it is also
# output visibly on the terminal. [Argument
# from boxplot.]
# log : this argument from boxplot has to be NULL,
# because a log transform can change the
# clusters (modes). If not NULL, a warning is
# given but the argument has no further effect.
# If you wish to use a logarithm it needs to be
# put explicitly in the formula, or you can
# transform the data first.
#
# There is no need to declare unused arguments from
# boxplot, vioplot, beanplot for backward compatibility
# (and then to ignore them): they are absorbed by the ...
#
# Value
#
# A list with the following components:
# call : the call to bixplot
# p : the number of variables
# <variable1>, <variable2>,... : for each variable, this
# is a list.
# When no clusters were found in that variable,
# it contains the non-NA values of that variable
# with their names if any, and their five-number
# summary from stats::fivenum that was used to
# draw the box.
# When clusters were found in that variable, it
# contains the clustering (a vector of integers),
# and for each cluster its values and five-number
# summary.
wnq = function (string, qwrite = TRUE) {
if (qwrite)
write(noquote(string), file = "", ncolumns = 100)
}
checkFactor = function(y){
# Auxiliary function for use of rugFactor.
# Argument:
# y : the factor variable
#
# Results:
# ints : the integer version of the factor variable
# levels : the levels of the factor variable
# indsv : the indices of the non-NA entries of the factor variable
#
if (!(is.factor(y))) {
stop(paste0("\n The rugFactor variable is not a factor.",
"\n Please use as.factor() or factor() first."))
}
if (is.null(attr(y, "levels"))) {
stop(paste0("\n The rugFactor variable has no levels.",
"\n Please use as.factor() or factor() first."))
}
levelsattr <- attr(y, "levels")
if (sum(is.na(levelsattr)) > 0) {
stop(paste0("\n The levels attribute of the rugFactor variable",
"\n has at least one NA."))
}
dupls <- duplicated(levelsattr)
if (sum(dupls) > 0)
stop(paste0("\n The label ", levelsattr[dupls == TRUE],
" occurs more than once in",
"\n the levels attribute of the rugFactor variable."))
yv <- as.character(y)
indsv <- which(!is.na(yv))
if (length(indsv) == 0) {
stop(paste0("The rugFactor variable only contains NA's."))
}
yv <- yv[indsv]
uniqy <- sort(unique(yv))
for (g in seq_len(length(uniqy))) {
if (!(uniqy[g] %in% levelsattr)) {
stop(paste0("\n The label ", uniqy[g], " does not appear among",
"\n the levels attribute of the rugFactor variable."))
}
}
if (length(uniqy) < 2) {
stop(paste0("\n The rugFactor variable has only a single level ( ",
uniqy[1], " )", "\n so we cannot color by level."))
}
for (g in seq_len(length(levelsattr))) {
if (!(levelsattr[g] %in% uniqy)) {
wnq(paste0("\nThe level ", levelsattr[g], " does not occur",
" in the rugFactor variable.\n"))
}
}
levels <- levelsattr[which(levelsattr %in% uniqy)]
xlevelz <- levels
lab2int <- function(labs, xlevelz) {
ints <- rep(NA, length(labs))
indv <- which(!is.na(labs))
labv <- labs[indv]
for (g in seq_len(length(xlevelz))) {
clinds <- indv[which(labv == xlevelz[g])]
ints[clinds] <- g
}
ints
}
ints = lab2int(y, xlevelz)
return(list(ints = ints, levels = levels, indsv = indsv))
}
drawpoints = function(y, coord=1, horizontal = FALSE,
bokw = 0.36, side = "no",
cex = 1, pointcol = par()$fg){
if(!(side %in% c("no","first","second"))) stop("invalid side")
if(side == "first") coord = coord - 0.04*bokw
if(side == "second") coord = coord + 0.04*bokw
coords = rep(coord,length(y))
if(horizontal==FALSE){
points(coords, y, col=pointcol, pch=16, cex=cex)
} else {
points(y, coords, col=pointcol, pch=16, cex=cex)
}
}
drawbox = function(y, coord = 1, horizontal = FALSE,
bokw = 0.36, lwd = 1,
bcol = "blue", side = "no",
whiskers = "none"){
# Auxiliary function to draw box at (Q1, median, Q3)
# y : a data vector
# coord : coordinate of the box
# bokw : box width if side = "no"
# side : side on which to draw the box
# whisker : whether to draw whiskers
#
if(!(side %in% c("no","first","second"))) stop("invalid side")
if(!(whiskers %in% c("both","up","down","none")))
stop("invalid whiskers")
bstat = grDevices::boxplot.stats(y, coef = 1.53)$stats
nuniq = length(unique(y))
miny = min(y, na.rm = TRUE); maxy = max(y, na.rm = TRUE)
if(nuniq == 3) bstat = c(miny, miny, bstat[3], maxy, maxy)
spine = coord
Left = coord - 0.50*bokw
Right = coord + 0.50*bokw
if(side == "second") {
Left = coord + 0.06*bokw
spine = coord + 0.28*bokw
}
if(side == "first") {
Right = coord - 0.06*bokw
spine = coord - 0.28*bokw
}
Low = bstat[2]; Mid = bstat[3]; Hi = bstat[4]
if(horizontal == FALSE){
if(nuniq > 1) lines(c(Left,Left,Right,Right,Left),
c(Low,Hi,Hi,Low,Low),
lwd = lwd, col = bcol)
if(nuniq > 2) lines(c(Left,Right), c(Mid,Mid),
lwd = lwd, col = bcol)
if(length(y) > 3){
if(whiskers %in% c("both","up")){
lines(c(spine,spine),c(Hi,bstat[5]),
lwd = lwd, col = bcol) }
if(whiskers %in% c("both","down")){
lines(c(spine,spine),c(Low,bstat[1]),
lwd = lwd, col = bcol) }
}
} else {
if(nuniq > 1) lines(c(Low,Hi,Hi,Low,Low),
c(Left,Left,Right,Right,Left),
lwd = lwd, col = bcol)
if(nuniq > 2) lines(c(Mid,Mid), c(Left,Right),
lwd = lwd, col = bcol)
if(length(y) > 3){
if(whiskers %in% c("both","up")){
lines(c(Hi,bstat[5]), c(spine,spine),
lwd = lwd, col = bcol) }
if(whiskers %in% c("both","down")){
lines(c(Low,bstat[1]), c(spine,spine),
lwd = lwd, col = bcol) }
}
}
}
compdens <- function(x, bwnum, kernel, cut, cutmin, cutmax) {
# Auxiliary function for drawdensity
if (length(x) > 0) {
from <- max(cutmin, (min(x, na.rm = TRUE) - cut * bwnum))
to <- min(cutmax, max(x, na.rm = TRUE) + cut * bwnum)
density(x, bw = bwnum, kernel = kernel, from = from,
to = to)[c("x", "y")]
}
else list(x = numeric(), y = numeric())
# if no non-NA data, this empty density comes out.
# But we only run this function for n > 0.
}
drawdensity = function(side, dens, at, bodcol,
curcol, curlwd = 1, horizontal){
# if curcol is NA we don't draw the curve.
if (side == "first") {
x1 <- NULL
y1 <- NULL
} else {
x1 <- dens$y + at
y1 <- dens$x
}
if (side == "second") {
x2 <- NULL
y2 <- NULL
} else {
x2 <- rev(dens$y) * -1 + at
y2 <- rev(dens$x)
}
if (length(x1) > 0) {
x1 <- c(at, x1, at)
y1 <- c(y1[1], y1, y1[length(y1)])
}
if (length(x2) > 0) {
x2 <- c(at, x2, at)
y2 <- c(y2[1], y2, y2[length(y2)])
}
if (horizontal) {
polygon(c(y1, y2), c(x1, x2), col = bodcol, border = NA)
if (!is.na(curcol))
lines(c(y1, y2), c(x1, x2), col = curcol, lwd = curlwd)
} else {
polygon(c(x1, x2), c(y1, y2), col = bodcol, border = NA)
if (!is.na(curcol))
lines(c(x1, x2), c(y1, y2), col = curcol, lwd = curlwd)
}
}
drawrug2sided = function(y, side, jittering, jitteramount, dens,
at, rugw, colin, colout, rugLwd,
rugoutLwd, horizontal)
{ # only for side == "no"
if(side != "no") stop('side must be \"no\" here')
if (jittering == TRUE) {
rugy <- list(jitter(y, amount = jitteramount),
rep(1, length(y)))
} else {
rugy <- list(y, rep(1, length(y)))
# this is poor when there are many ties
}
rugy[[2]] <- rugy[[2]] * rugw/2 # 1-sided width of ruglines
if (!isTRUE(all.equal(colin,colout))) {
crossings <- approx(dens$x, dens$y, rugy[[1]])$y
crossings <- pmin(crossings, rugy[[2]], na.rm = TRUE)
extra <- unlist(rugy[2]) - crossings
}
else {
crossings <- rugy[[2]]
extra <- 0
}
## First we plot the part of the rug that sticks out of the body:
if (max(extra) > 0) { # if at least one segment exceeds density
w <- (extra > 0) # which segments exceed density
if (horizontal) {
if (side %in% c("no", "first"))
segments(rugy[[1]][w], -extra[w] - crossings[w] + at,
rugy[[1]][w], -crossings[w] + at,
col = colout, lwd = rugoutLwd)
if (side %in% c("no", "second"))
segments(rugy[[1]][w], crossings[w] + at,
rugy[[1]][w], extra[w] + crossings[w] + at,
col = colout, lwd = rugoutLwd)
}
else {
if (side %in% c("no", "first"))
segments(-extra[w] - crossings[w] + at, rugy[[1]][w],
-crossings[w] + at, rugy[[1]][w],
col = colout, lwd = rugoutLwd)
if (side %in% c("no", "second"))
segments(crossings[w] + at, rugy[[1]][w],
extra[w] + crossings[w] + at, rugy[[1]][w],
col = colout, lwd = rugoutLwd)
}
}
## Now plot the central part, inside the body:
x1 = -crossings + at
x2 = crossings + at
if (horizontal)
segments(rugy[[1]], x1, rugy[[1]], x2, col = colin)
else segments(x1, rugy[[1]], x2, rugy[[1]], col = colin)
}
drawrug1sided = function(y, side, jittering, jitteramount, at,
rugw, rugcenter, colin, rugLwd, horizontal)
{ # only use this function when side != "no", so here it
# has to be either "first" or "second".
if(!(side %in% c("first", "second"))) stop(
'side must be \"first\" or \"second\" here')
if (jittering == TRUE) {
rugy <- jitter(y, amount = jitteramount)
} else {
rugy <- y # this is poor when there are many ties
}
spine = rugcenter
if(side == "first") spine = -rugcenter
x1 <- at + spine - rugw/2
x2 <- at + spine + rugw/2
if (horizontal)
segments(rugy, x1, rugy, x2, col = colin)
else segments(x1, rugy, x2, rugy, col = colin)
}
# Here main function starts
if(!(side %in% c("no", "first", "second", "both")))
stop("invalid side")
if(!(bodysize %in% c("area_is_constant",
"area_from_count",
"width_is_constant"))) stop("invalid bodysize")
if(!is.null(rugNumeric) & !is.null(rugFactor)) stop(
"You cannot specify rugNumeric and rugFactor simultaneously.")
if(!is.null(log)) warning(paste0(
"The argument 'log' has no effect here. If you wish\n",
"to use a logarithm it needs to be put explicitly in\n",
"the formula, or you can transform the data first.\n",
"Note that a log transform can change the modes."))
#
# Getting the y variables to plot
args <- match.call()
cat("\n")
print(args)
mcall <- as.list(args)
noname = rep(FALSE, length(args))
for(ii in seq_len(length(args))){
noname[ii] = (is.null(base::names(args[ii])))
}
whichnoname = which(noname == TRUE)
isadatafr = isamatrix = FALSE
if(length(whichnoname) == 2){
tempmat = eval(mcall[[whichnoname[2]]], envir = parent.frame())
isamatrix = is.matrix(tempmat)
isadatafr = is.data.frame(tempmat)
}
if(isamatrix == TRUE || isadatafr == TRUE){
p = NCOL(tempmat)
nams = colnames(tempmat)
if(is.null(nams) || missing(nams)) nams = paste0("X",seq_len(p))
n = NROW(tempmat); n
rownams = rownames(tempmat)
if(is.null(rownams) || missing(rownams)) rownams = seq_len(n)
ylist = list()
for(j in seq_len(p)){
tempvec = tempmat[,j]
names(tempvec) = rownams
ylist[[nams[j]]] = tempvec
}
} else {
envir = parent.frame(2)
if (is.null(base::names(args)))
vars <- 1:length(args)
else vars <- c(1, which(base::names(args)[2:length(args)] %in%
c("formula", "x", "data", "")) + 1)
if (length(vars) < 2)
return(list())
options <- which(base::names(args) %in%
c("subset", "na.action",
"drop.unused.levels", "xlev"))
args <- args[c(vars, options)]
args[[1]] <- quote(model.frame)
hashad <- rep(FALSE, length(vars))
ylist <- list()
notnamed <- 0
subsetno <- numeric()
naactno <- numeric()
dulno <- numeric()
xlevno <- numeric()
datano <- numeric()
argsvals <- lapply(as.list(args[2:length(vars)]), eval, envir)
islists <- lapply(argsvals, function(x) {
is.list(x) || is.null(x)
})
for (i in 2:length(vars)) {
if (hashad[i])
next
x <- argsvals[[i - 1]]
if (inherits(x, "formula")) {
datanonext <- match(TRUE, islists[
max(datano, i):length(islists)]) +
max(datano, i)
if (!is.na(datanonext)) {
hashad[datanonext] <- TRUE
datano <- datanonext
}
nextargpos <- function(name, pos) {
if (any(pos == length(args))) return(pos)
posnext <- match(name, base::names(args[
max(pos + 1, 3):length(args)])) +
max(pos + 1, 3) - 1
if (!is.na(posnext)) pos <- posnext
pos
}
subsetno <- nextargpos("subset", subsetno)
naactno <- nextargpos("na.action", naactno)
dulno <- nextargpos("drop.unused.levels", dulno)
xlevno <- nextargpos("xlev", xlevno)
attr(args, "names")[i] <- "formula"
m <- args[c(1, i, datano, subsetno, naactno, dulno,
xlevno)]
mf <- eval(m, envir)
response <- attr(attr(mf, "terms"), "response")
ylist <- c(ylist, split(mf[[response]], mf[-response]))
}
else if (is.list(x)) {
ylist <- c(ylist, x)
}
else {
x <- list(x)
notnamed <- notnamed + 1
attr(x, "names") <- notnamed
ylist <- c(ylist, x)
}
}
}
p <- length(ylist)
if(p == 0) stop("no data found to plot")
constantN = TRUE
nn = length(ylist[[1]])
for(j in seq_len(p)){
if(length(ylist[[j]]) != nn) constantN = FALSE
}
if(is.null(col)) col = NA
if(is.null(bodyCol)) bodyCol = col
colormodes = TRUE
if(is.null(modeCol)) colormodes = FALSE
if(is.null(curveCol)) curveCol = NA # no curves
if(is.null(boxCol)) boxCol = border
if(is.null(rugCol)) rugCol = par("fg")
if(is.null(rugoutCol)) rugoutCol = rugCol
if(is.na(rugoutCol)) rugoutCol = rugCol
#
# adjusting transparency of colors:
indx = which(!is.na(bodyCol))
if(length(indx) >= 1) bodyCol[indx] =
adjustcolor(bodyCol[indx], alpha.f = bodyOpaque)
if(colormodes == TRUE) {
indx = which(!is.na(modeCol))
if(length(indx) >= 1) modeCol[indx] =
adjustcolor(modeCol[indx], alpha.f = bodyOpaque)
}
indx = which(!is.na(boxCol))
if(length(indx) >= 1) boxCol[indx] =
adjustcolor(boxCol[indx], alpha.f = boxOpaque)
indx = which(!is.na(rugCol))
if(length(indx) >= 1) rugCol[indx] =
adjustcolor(rugCol[indx], alpha.f = rugOpaque)
indx = which(!is.na(rugoutCol))
if(length(indx) >= 1) rugoutCol[indx] =
adjustcolor(rugoutCol[indx], alpha.f = 1)
#
# Making vectors longer if needed:
cycle2size = function(x, size){
if(length(x) < size) x = rep(x, ceiling(size/length(x)))
return(x)
}
bodyCol = cycle2size(bodyCol, p)
bodyW = cycle2size(bodyW, p)
modeCol = cycle2size(modeCol, 5*p) # assumes mykmax <= 5
boxCol = cycle2size(boxCol, p)
boxW = cycle2size(boxW, p)
curveCol = cycle2size(curveCol, p)
rugCol = cycle2size(rugCol, p)
rugoutCol = cycle2size(rugoutCol, p)
rugW = cycle2size(rugW, p)
stickCol = cycle2size(stickCol, p)
#
if(!is.null(width)){
if(length(width) != p) stop(paste0(
"Argument width should be a vector of length" ,p))
if(min(width) <= 0) stop(
"Argument width should only contain positive entries")
bodyW = bodyW * width/(max(width))
boxW = boxW * width/(max(width))
rugW = rugW * width/(max(width))
}
bodyW = boxwex * abs(bodyW)
boxW = boxwex * abs(boxW)
rugW = boxwex * abs(rugW)
#
if (missing(names)) {
if (is.null(base::names(ylist)))
attr(ylist, "names") = 1:displayp
names <- base::names(ylist)
} else { attr(ylist, "names") <- names }
if(!(side == "both")){
displayp = p
xpos = seq_len(p)
mysides = rep(side, p)
if(!is.null(at)){
lat = length(at)
if(lat != p)
stop(paste0("argument at has length ",lat,
" which should be ", p))
xpos = at
}
xcoords = xpos
}
if(side == "both"){
# if plot on both sides: "half" as many variable axes
displayp = ceiling(p/2)
xpos = seq_len(displayp)
if(!is.null(at)){
lat = length(at)
if(lat != displayp)
stop(paste0("argument at has length ", lat,
" which should be ", displayp))
xpos = at
}
if ((side == "both") && (length(names) > displayp)) {
combinenames <- function(string1, string2)
{ if (is.na(string2)) return(string1)
if (string1 == string2) return(string1)
s1 <- unlist(strsplit(string1, NULL))
s2 <- unlist(strsplit(string2, NULL))
fromleft <- 0
gfromleft <- 0
while ((fromleft < length(s1)) && (fromleft < length(s2)) &&
(s1[fromleft + 1] == s2[fromleft + 1])) {
if (any(grep("[. \t]", s1[fromleft + 1])))
gfromleft <- fromleft
fromleft <- fromleft + 1
}
fromright <- 0
gfromright <- 0
while ((fromright < length(s1)) && (fromright < length(s2)) &&
(s1[length(s1) - fromright] == s2[length(s2) - fromright])) {
if (any(grep("[. \t]", s1[length(s1) - fromright])))
gfromright <- fromright
fromright <- fromright + 1
}
if (gfromleft > gfromright)
result <- substr(string1, 1, gfromleft)
else if (gfromright == 0)
result <- paste(string1, string2, sep = "+")
else result <- substr(string1, nchar(string1) - gfromright +
1, nchar(string1))
result
}
for (i in 1:displayp) {
names[i] <- combinenames(names[i*2 - 1], names[i*2])
}
length(names) <- displayp
}
xcoords = c(matrix(rep(xpos, 2), nrow=2, byrow = TRUE))
mysides = rep(c("first","second"), displayp)
}
## From here on, each mysides[j] is either "no", "first", or "second".
## It cannot be "both" anymore.
#
rugfact = FALSE
if(!is.null(rugFactor)){
rugfact = TRUE
if(constantN == FALSE) {
warning(
"We cannot color the rug by rugFactor because not all",
"\n y variables have the same length.")
rugfact = FALSE
} else {
if(length(rugFactor) != nn) {
warning(paste0(
"We cannot color the rug by rugFactor,",
"\n because rugFactor has length ", length(rugFactor),
"\n and the y variables have length ", nn, ".\n"))
rugfact = FALSE
}
}
if(rugfact == TRUE){
out = checkFactor(rugFactor)
# If this errors, also bixplot stops.
# So from here on we can assume it didn't.
ints = out$ints
mylevels = out$levels
indsv = out$indsv
nlevels = length(mylevels)
ncolors = length(rugFactorColors)
if(nlevels > ncolors) {
rugFactorColors = cycle2size(rugFactorColors, nlevels) }
}
cat("The variable rugFactor has the levels:\n")
print(mylevels)
}
#
rugnumer = FALSE
if(add == FALSE){
if(!is.null(rugNumeric)) {
rugnumer = TRUE
if(constantN == FALSE) {
warning(
"We cannot color the rug by rugNumeric because not all",
"\n y variables have the same length.")
rugnumer = FALSE
} else {
if(length(rugNumeric) != nn) {
warning(paste0(
"We cannot color the rug by rugNumeric,",
"\n because rugNumeric has length ", length(rugNumeric),
"\n and the y variables have length ", nn, ".\n"))
rugnumer = FALSE
}
}
}
}
#
# Determine limits of x and y
if(is.null(xlim)){
xlim = c(min(xcoords) - 0.5, max(xcoords) + 0.5)
}
ymin = Inf; ymax = -Inf
rugNum = rugFac = list()
if(bigN < 300) bigN = 300
for(j in seq_len(p)){
y = ylist[[j]]
if(length(y) >= 1){
if(!(is.numeric(y))) stop(paste0("variable ",j,
" is not numeric"))
if(rugnumer == TRUE) rugNum[[j]] = rugNumeric
if(rugfact == TRUE) rugFac[[j]] = ints
if(length(y) > bigN){
set.seed(1)
inds = sample(seq_len(length(y)), bigN, replace = FALSE)
# In order to display the worst outliers:
inds[1] = which.min(y)
inds[2] = which.max(y)
y = y[inds]
if(rugnumer == TRUE) rugNum[[j]] = rugNum[[j]][inds]
if(rugfact == TRUE) rugFac[[j]] = rugFac[[j]][inds]
}
noNA = which(!is.na(y))
y = y[noNA]
if(rugnumer == TRUE) rugNum[[j]] = rugNum[[j]][noNA]
if(rugfact == TRUE) rugFac[[j]] = rugFac[[j]][noNA]
ord = order(y)
y = y[ord] # This sort is necessary. The names are permuted too.
if(rugnumer == TRUE) rugNum[[j]] = rugNum[[j]][ord]
if(rugfact == TRUE) rugFac[[j]] = rugFac[[j]][ord]
ylist[[j]] = y
indy = which(!is.na(y))
tempy = y[indy]
indy = which(is.finite(tempy))
tempy = tempy[indy]
bwnum = 0
if(length(tempy) > 1 && !(all(is.na(modeCol)) && all(is.na(bodyCol)))){
bwnum = if(!is.numeric(bw)) density(unlist(tempy), bw = bw)$bw else bw
}
ymin = min(ymin, min(tempy) - cut*bwnum)
ymax = max(ymax, max(tempy) + cut*bwnum)
}
}
if(is.null(ylim)) { ylim = c(ymin, ymax) }
if(add == FALSE){
if(rugnumer == TRUE){ # make room for color bar
layout(matrix(c(1, 2), ncol = 2), widths = c(1, colorbarW))
maruser = par("mar")
mymar = maruser
mymar[4] = 1
par(mar = mymar)
colorpalette <- colorRampPalette(rugNumericColors)(100)
ran = range(rugNumeric, na.rm = TRUE)
}
if(horizontal == FALSE){
plot(NA, NA, type = "n",
xaxt = "n", xlim = xlim, ylim = ylim,
xaxs = xaxs, yaxs = yaxs,
xlab = "", ylab = "", cex.axis = cex.axis)
axis(side = 1, at = xpos, labels = names, las = las,
cex.axis = cex.axis, tick = tick)
} else {
plot(NA, type = "n",
yaxt = "n", xlim = ylim, ylim = xlim,
xaxs = xaxs, yaxs = yaxs,
xlab = "", ylab = "", cex.axis = cex.axis)
axis(side = 2, at = xpos, labels = names, las = las,
cex.axis = cex.axis, tick = tick, ylim = xlim)
}
if(ann == TRUE){
title(main = main, line = line.main, cex.main = cex.main)
title(xlab = xlab, line = line.lab, cex.lab = cex.lab)
title(ylab = ylab, line = line.lab, cex.lab = cex.lab)
}
}
outp = list()
outp[["call"]] = match.call()
outp[["p"]] = p
#
jj = 0 # counter for modeCol
for(j in seq_len(p)){
listj = list() # to add to outp list
y = ylist[[j]]
n = length(y)
cat(paste0("\nVariable ", j, " has length = ", n, ":\n"))
if(n < 1) { cat("we are skipping it.\n")
} else { # draw this plot
nuniq = length(unique(y))
if((side == "both") && (j%%2 == 0)){
ymintemp = min(c(y, ymintemp))
ymaxtemp = max(c(y, ymaxtemp))
} else{
ymintemp = min(y)
ymaxtemp = max(y)
}
# If bandwidth bw is a formula, turn it into a number:
bwnum = bw
if(n == 1) { bwnum = 0.5
} else {
if (!is.numeric(bw)) {
bwnum = density(y, kernel = kernel, bw = bw)$bw
}
if(is.nan(bwnum)) bwnum <- 0.5
}
mykmax = floor(n/minN)
if(!is.null(kmax)) mykmax = min(kmax, mykmax)
mykmax = min(mykmax, 5)
reject0 = FALSE
if(mykmax > 1){ # else we don't need the dip test
# Is there more than 1 mode:
pval = round(diptest::dip.test(y)$p.value,4)
cat(paste0("pvalue(dip.test) = ",pval,"\n"))
if(pval <= diplevel) reject0 = TRUE
}
kopt = 1
if(reject0 == TRUE){
mykmax = max(1, min(mykmax, floor(nuniq/clusMinN)))
cat(paste0("mykmax = ",mykmax,"\n"))
if(mykmax >= 2){
silwidths = rep(-2, mykmax)
cluslist = list()
cluslist[[1]] = NA
for(kk in 2:mykmax){
out = pamc1d(y, kk, clusMinN, maxit = 100,
verbose = FALSE)
cluslist[[kk]] = out$clustering
silh = cluster::silhouette(cluslist[[kk]], dist(y))
silwidths[kk] = mean(silh[,3], na.rm = TRUE)
}
cat("Silhouette widths: ")
print(round(silwidths[-1],3))
if(max(silwidths) > 0){ # else we keep kopt = 1
kopt = which.max(silwidths) # always >= 2
clusvec = cluslist[[kopt]]
}
} # ends mykmax >= 2
}
cat(paste0("Selected k = ", kopt,".\n"))
if(kopt > 1){
mymodes = list()
if(rugnumer == TRUE | rugfact == TRUE) myrugs = list()
tempval = rep(NA, kopt)
for(m in seq_len(kopt)){ # m=1
mymodes[[m]] = y[which(clusvec == m)]
if(rugnumer == TRUE) myrugs[[m]] = rugNum[[j]][which(clusvec == m)]
if(rugfact == TRUE) myrugs[[m]] = rugFac[[j]][which(clusvec == m)]
names(mymodes[[m]]) = names(y)[which(clusvec == m)]
tempval[m] = (mymodes[[m]])[1]
}
ordr = order(tempval) # to sort the modes
cat("Cluster sizes:\n")
print(t(as.matrix(table(clusvec))))
sortedmodes = list()
if(rugnumer == TRUE | rugfact == TRUE) sortedrugs = list()
clustnew = rep(NA, n)
for(kk in seq_len(kopt)){
kkk = ordr[kk]
sortedmodes[[kk]] = as.vector(mymodes[[kkk]])
names(sortedmodes[[kk]]) = names(mymodes[[kkk]])
if(rugnumer == TRUE) sortedrugs[[kk]] = as.vector(myrugs[[kkk]])
if(rugfact == TRUE) sortedrugs[[kk]] = as.vector(myrugs[[kkk]])
clustnew[clusvec == kkk] = kk
}
clusvec = clustnew; rm(clustnew)
names(clusvec) = names(y)
listj[["clustering"]] = clusvec
modelist = list()
#
# We have to compute the densities first, since their
# relative widths will depend on the argument bodysize.
densities = list()
areas = heights = counts = rep(NA, kopt)
for(m in seq_len(kopt)){ # loop over the modes
ym = sortedmodes[[m]]
counts[m] = length(ym)
if(length(unique(ym)) < clusMinN){
# This should never happen due to pamc
densities[[m]] = NA
} else { # for this mode:
dens = NA
# compute density of mode m:
dens = compdens(ym, bwnum = bwnum, kernel = kernel,
cut = cut, cutmin = cutmin,
cutmax = cutmax)
# we reuse the overall bandwidth for all modes!
ymintemp = min(ymintemp, min(dens$x))
ymaxtemp = max(ymaxtemp, max(dens$x))
xrange = max(dens$x) - min(dens$x)
area = xrange * mean(dens$y, na.rm = T)
dens$y = dens$y/area # so area is exactly 1
areas[m] = xrange * mean(dens$y, na.rm = T)
heights[m] = max(dens$y)
densities[[m]] = dens
} # ends density computation for this mode
} # ends loop over modes
factors = rep(1, kopt)
if(bodysize == "width_is_constant"){
factors = 1/heights
}
if(bodysize == "area_is_constant"){
factors = rep(1, kopt)/max(heights)
}
if(bodysize == "area_from_count"){
factors = (counts/max(counts))
factors = factors/max(factors*heights)
}
factors[which(is.na(factors))] = 0
ratios = heights*factors # for width of boxes and rug
ratios = ratios/max(ratios)
} # ends if(kopt > 1)
#
# Start plotting
if(kopt == 1){ # Plot as a single cluster
listj[["values"]] = y
my5 = fivenum(y, na.rm=TRUE)
names(my5) = NULL
listj[["fivenumbersummary"]] = my5
whiskers = "none"
if(makewhiskers == TRUE) whiskers = "both"
if(length(unique(y)) < clusMinN){ # should never happen
drawpoints(y, coord = xcoords[j], bokw = boxW[j],
side = mysides[j], cex=0.6,
horizontal = horizontal, pointcol = par()$fg)
if(boxW[j] > 0) {
drawbox(y, coord = xcoords[j], horizontal = horizontal,
side = mysides[j], bokw = boxW[j],
bcol = boxCol[j], whiskers = "none") }
} else { # For this variable:
colin = rugCol[j]
colout = rugoutCol[j]
if(rugnumer == TRUE){
mycols = round(99*(rugNum[[j]]-ran[1])/(ran[2]-ran[1])) + 1
mycols[is.na(mycols)] = 0
mycols = colorpalette[mycols]
colin = colout = mycols
}
if(rugfact == TRUE){
mycols = rugFac[[j]]
mycols[is.na(mycols)] = 0
mycols = rugFactorColors[mycols]
colin = colout = mycols
}
if((!is.na(bodyCol[j]) | !is.na(curveCol[j])) && (bodyW[j] > 0)) {
# Compute the density
dens = compdens(y, bwnum = bwnum, kernel = kernel,
cut = cut, cutmin = cutmin,
cutmax = cutmax)
ymintemp = min(ymintemp, min(dens$x))
ymaxtemp = max(ymaxtemp, max(dens$x))
dens$y = 0.5*bodyW[j]*dens$y/max(dens$y)
drawdensity(side = mysides[j], dens, at = xcoords[j],
bodcol = bodyCol[j], curcol = curveCol[j],
curlwd = curveLwd, horizontal = horizontal)
} else {
colout = colin # whether fixed or variable
}
if (sum(is.na(colin)) == 0 && (rugW[j] > 0)) {
set.seed(1) # so jittering always gives same result
if(mysides[j] == "no"){
drawrug2sided(y, side = mysides[j], jittering = jittering,
jitteramount = jitteramount, dens = dens,
at = xcoords[j], rugw = rugW[j],
colin = colin, colout = colout,
rugLwd = rugLwd, rugoutLwd = rugoutLwd,
horizontal = horizontal)
} else { # mysides[j] is either "first" or "second"
if(boxW[j] > 0){
drawrug1sided(y, side = mysides[j], jittering = jittering,
jitteramount = jitteramount, at = xcoords[j],
rugw = rugW[j]/2, rugcenter = 0.27*boxW[j],
colin = colin, rugLwd = rugLwd,
horizontal = horizontal)
}
}
}
if(boxW[j] > 0) {
drawbox(y, coord = xcoords[j], horizontal = horizontal,
side = mysides[j], bokw = boxW[j], lwd = boxLwd,
bcol = boxCol[j], whiskers = whiskers) }
}
} else { # kopt > 1
for(m in seq_len(kopt)){ # plot all the modes
ym = sortedmodes[[m]] # modes in increasing order
my5 = fivenum(ym, na.rm=TRUE)
names(my5) = NULL
modelist[[paste0("cluster_",m)]] =
list(members = ym, fivenumbersummary = my5)
whiskers = "none"
if(makewhiskers == TRUE){
whiskers = "both"
if(innerwhiskers == FALSE){
whiskers = "none"
if(m == 1) whiskers = "down"
if(m == kopt) whiskers = "up"
}
}
if(length(unique(ym)) < clusMinN){ # should never happen
drawpoints(ym, coord = xcoords[j], bokw = boxW[j],
side = mysides[j], cex=0.6,
horizontal = horizontal, pointcol = par()$fg)
if(boxW[j] > 0) drawbox(ym, coord = xcoords[j],
horizontal = horizontal,
side = mysides[j], bokw = boxW[j],
bcol = boxCol[j], whiskers = "none")
} else { # for this mode:
thismodecol = bodyCol[j]
if(colormodes == TRUE){
jj = jj+1
thismodecol = modeCol[jj] # can be NA
}
colin = rugCol[j]
colout = rugoutCol[j]
if(rugnumer == TRUE){
mycols = round(99*(sortedrugs[[m]]-ran[1])/(ran[2]-ran[1])) + 1
mycols[is.na(mycols)] = 0
mycols = colorpalette[mycols]
colin = colout = mycols
}
if(rugfact == TRUE){
mycols = sortedrugs[[m]]
mycols[is.na(mycols)] = 0
mycols = rugFactorColors[mycols]
colin = colout = mycols
}
if((!is.na(thismodecol) | !is.na(curveCol[j])) && (bodyW[j] > 0)) {
dens = densities[[m]]
dens$y = 0.5*bodyW[j]*factors[m]*dens$y
drawdensity(side = mysides[j], dens, at = xcoords[j],
bodcol = thismodecol, curcol = curveCol[j],
curlwd = curveLwd, horizontal = horizontal)
} else { # if we don't draw a density, the outside
# rug color should coincide with the inside
colout = colin # whether fixed or variable
}
if (sum(is.na(colin)) == 0 && (rugW[j] > 0)) {
set.seed(1) # so jittering always gives same result
if(mysides[j] == "no"){
drawrug2sided(ym, side = mysides[j], jittering = jittering,
jitteramount = jitteramount, dens = dens,
at = xcoords[j], rugw = rugW[j],
colin = colin, colout = colout,
rugLwd = rugLwd, rugoutLwd = rugoutLwd,
horizontal = horizontal)
} else { # mysides[j] is either "first" or "second"
if(boxW[j] > 0){
drawrug1sided(ym, side = mysides[j], jittering = jittering,
jitteramount = jitteramount, at = xcoords[j],
rugw = rugW[j]/2,
rugcenter = 0.27*ratios[m]*boxW[j],
colin = colin, rugLwd = rugLwd,
horizontal = horizontal)
}
}
}
bokw = ratios[m]*boxW[j]
if(bokw > 0) drawbox(ym, coord = xcoords[j],
horizontal = horizontal,
side = mysides[j], bokw = bokw,
lwd = boxLwd, bcol = boxCol[j],
whiskers = whiskers)
}
} # ends loop over modes m
listj = c(listj, modelist)
} # ends kopt > 1
} # ends if(n >= 1)
if(!is.na(stickCol[j]) && (stickLwd > 0) && (side == "both")){
if((ymintemp < Inf) && (ymaxtemp > -Inf)){
if(horizontal == FALSE){
lines(c(xcoords[j], xcoords[j]), c(ymintemp, ymaxtemp),
col = stickCol[j], lwd = stickLwd)
} else {
lines(c(ymintemp, ymaxtemp), c(xcoords[j], xcoords[j]),
col = stickCol[j], lwd = stickLwd)
}
}
} # ends stick
name = names[j]
outp[[name]] = listj # noquote(paste0("Variable ",j))
} # ends loop over j
if(add == FALSE){
if(rugnumer == TRUE){ # plot color bar:
mymar = maruser
mymar[2] = 0
par(mar = mymar)
color_bar_matrix <- matrix(seq(0, 1, length.out = 100), nrow = 1)
image(color_bar_matrix, col = colorpalette, axes=FALSE)
# Add a y-axis to the color bar to show the values:
# Use pretty() to get "nice" tick mark values:
tickvalues <- pretty(ran, n = 5)
# Calculate the corresponding positions on the 0-1
# scale of the color bar
tickpositions <- (tickvalues - ran[1])/(ran[2] - ran[1])
# Use these positions and values to create the axis
axis(4, at = tickpositions, labels = tickvalues,
cex.axis = cex.colorbar, las = 1)
box() # Add a box around the color bar
# Reset the layout to default:
layout(1)
# Reset the margins to before:
par(mar = maruser)
}
}
if(plot == TRUE) { invisible(outp) } else { return(outp) }
}
pamc1d <- function(y, k, minsize = 4, countwhat = "unique",
stand = TRUE, maxit = 100, verbose = TRUE){
# Constrained version of k-medoids clustering (partitioning
# around medoids) for univariate data y.
# This is a modification of an algorithm for constrained
# k-means by Bradley-Bennett-Demiriz (2000, Microsoft report
# MSR-TR-2000-65). The modifications include working with
# medians instead of means, and providing a minsize constraint
# on unique cases instead of any cases, to prevent placing
# tied cases in separate clusters.
#
# Arguments
# y : univariate data
# k : specified number of clusters
# minsize : lower bound on the size of all clusters
# maxit : maximum number of iterations in reassignment loop
# verbose : TRUE will print intermediate steps.
objL1 = function(yy, centers, clusvec){
mycmat = abs(outer(as.vector(yy),centers,"-"))
sum(mycmat*outer(clusvec, 1:length(centers), "=="))/length(yy)
}
if(!(countwhat %in% c("any", "unique"))) stop("invalid countwhat")
y = sort(na.omit(y))
if(length(unique(y)) < 2) stop("y has < 2 non-NA unique values, so no clustering")
n <- length(y)
if(is.null(names(y))) names(y) = seq_len(length(y))
yraw = y
if(stand == TRUE) y = scale(y) # possible since sd(y) > 0
if(countwhat == "unique"){
ydup = duplicated(y)
nuniq = sum(ydup == FALSE)
if(nuniq == n) countwhat = "any"
}
kminsize = k*minsize
if(countwhat == "any"){
if(kminsize > n) stop(paste0(
"A partition with k = ",k," clusters that each\n",
"have at least minsize = ", minsize, " members\n",
"would require ", kminsize," data points, but\n",
"there are only ", n, " data points."))
} else { # if(countwhat == "unique")
if(kminsize > nuniq) stop(paste0(
"A partition with k = ",k," clusters that each\n",
"have at least minsize = ", minsize, " unique members\n",
"would require ", kminsize," unique data points, but\n",
"there are only ", nuniq, " unique data points."))
}
objectives = list()
set.seed(1)
clusvec = cluster::pam(y, k, cluster.only = TRUE,
keep.diss=F, keep.data=F)
if(verbose == TRUE){
cat("Initial clustering vector:\n")
print(clusvec) # for all n cases (whether unique or not)
cat("Initial clustering table:\n")
print(as.data.frame(t(as.matrix(table(clusvec)))),
row.names = FALSE)
}
centers = rep(NA, k)
bigenough = TRUE
for(h in seq_len(k)){
clush = which(clusvec == h)
yh = y[clush]
if(countwhat == "any"){
if(length(yh) < minsize) bigenough = FALSE
}
if(countwhat == "unique"){
if(length(unique(yh)) < minsize) bigenough = FALSE
}
centers[h] = median(yh)
}
obj = objL1(y, centers, clusvec)
if(verbose == TRUE){
cat(paste0("Unconstrained objective = ", obj, "\n"))
}
objectives = c(objectives, obj)
num = 0
iter = 1
if(bigenough == FALSE){
if(countwhat == "any"){
num = 1
while (iter <= maxit & num > 0){
# The second condition says: stop when no assignment changes
if(verbose) cat(paste0("\nIteration Number = ", iter,"\n"))
cost.mat = abs(outer(as.vector(y),centers,"-"))
# = matrix with distances of all cases to each center
trans <- lpSolve::lp.transport(cost.mat = cost.mat,
direction = "min",
row.signs = rep("=", n),
row.rhs = rep(1, n),
col.signs = rep(">=", k),
col.rhs = rep(minsize, k))
clusnew <- apply(trans$solution, 1, which.max)
num <- sum(clusnew != clusvec)
if(verbose == TRUE){
cat(paste0("Number of assignment changes = ", num,"\n"))
}
if(num > 0){ # if not, we are done
clusvec <- clusnew # update clustering
if(verbose == TRUE){
print(clusvec) # new clustering vector
print(as.data.frame(t(as.matrix(table(clusvec)))),
row.names = FALSE)
}
# recompute centers:
for(h in seq_len(k)){
clush = which(clusvec == h)
centers[h] = median(y[clush])
}
obj = objL1(y, centers, clusvec)
if(verbose == TRUE){
cat(paste0("objective = ", obj, "\n"))
}
objectives = c(objectives, obj)
iter <- iter + 1
}
} # ends while
if(min(table(clusvec)) < minsize){ # should never happen
warning(paste0("Failed to satisfy the minsize condition for\n",
"all clusters. Try with lower k or minsize."))
}
} # ends countwhat == any
if(countwhat == "unique"){
mult = rep(1, nuniq)
if(verbose == TRUE){
cat(paste0("\nThere are ", nuniq, " unique cases out of ",
n," cases.\n"))
}
whichuni = which(ydup == FALSE)
yuniq = y[whichuni]
yrawuniq = yraw[whichuni]
if(verbose == TRUE){
cat("Unique cases:\n")
print(clusvec[whichuni]*0 + yrawuniq)
}
clusuni = clusvec[whichuni]
whichdup = which(ydup == TRUE)
if(verbose == TRUE){
cat("Duplicate cases:\n")
print(clusvec[whichdup]*0 + yraw[whichdup])
}
dupofwhat = rep(NA, length(whichdup))
for(i in seq_len(length(whichdup))){ # i = 10
yi = y[whichdup[i]]
dupofwhat[i] = min(which.min(abs(y[whichuni] - yi)))
mult[dupofwhat[i]] = mult[dupofwhat[i]] + 1
}
if(verbose == TRUE){
cat("They are duplicates of the cases:\n")
whichof = whichuni[dupofwhat]
print(clusvec[whichof]*0 + yraw[whichof])
}
num = 1
while (iter <= maxit & num > 0){
if(verbose) cat(paste0("\nIteration Number = ", iter,"\n"))
cost.mat = abs(outer(as.vector(yuniq),centers,"-"))
trans <- lp.transport(cost.mat = cost.mat,
direction = "min",
row.signs = rep("=", nuniq),
row.rhs = rep(1, nuniq),
col.signs = rep(">=", k),
col.rhs = rep(minsize, k))
clusuninew <- apply(trans$solution, 1, which.max)
num <- sum(clusuninew != clusuni) # number of assignment changes
if(verbose == TRUE){
cat(paste0("Number of assignment changes = ", num,"\n"))
}
if(num > 0){
clusuni <- clusuninew # update clustering
# turn clusuni back into clusvec:
if(nuniq < n){
clusvec[whichuni] = clusuni
clusvec[whichdup] = clusuni[dupofwhat]
} else { clusvec = clusuni }
if(verbose == TRUE){
print(clusvec) # true clustering vector
print(as.data.frame(t(as.matrix(table(clusvec)))),
row.names = FALSE)
}
for(h in 1:k){
clush = which(clusvec == h)
centers[h] = median(y[clush])
}
obj = objL1(y, centers, clusvec)
if(verbose == TRUE){
cat(paste0("objective = ", obj, "\n"))
}
objectives = c(objectives, obj)
iter <- iter + 1
}
} # ends while
if(min(table(clusuni)) < minsize){ # should never happen
warning(paste0("Failed to satisfy the minsize condition for\n",
"all clusters. Try with lower k or minsize."))
}
} # ends countwhat == unique
} # ends bigenough == FALSE
clustable = clusvec
clustable = table(clustable)
if(verbose == TRUE){
cat("\nObjective function in iteration steps: \n")
print(matrix(objectives, ncol=1))
cat("\nFinal clustering vector:\n")
print(clusvec)
cat("\nFinal clustering table:\n")
print(as.data.frame(t(as.matrix(clustable))), row.names = FALSE)
}
result <- list(iter = iter, converged = (num==0),
clustering = clusvec, obj = obj,
centers = centers, clustable = clustable)
return(result)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.