bixplot_examples"

knitr::opts_chunk$set(
  fig.width  = 7,
  fig.height = 4,
  fig.align  = 'center'
)
oldpar <- list(mar = par()$mar, mfrow = par()$mfrow)

Introduction

This vignette illustrates the bixplot function, which visualizes a univariate dataset by combining an estimated density body, a box-and-whisker summary, and a rug of individual data values. For variables with more than one mode, the function automatically detects clusters and displays each mode separately. The examples below start with simple univariate comparisons and continue with more elaborate multivariable layouts and rug coloring options.

library(vioplot)
library(robustHD)
library(classmap)

Unimodal, bimodal and multimodal data

We start with three simulated variables: one unimodal, one bimodal and one trimodal. This illustrates the core difference between a violin plot and a bixplot.

set.seed(1)
dat1 <- rnorm(120, 0, 2.5)
dat2 <- c(rnorm(80, -3, 1), rnorm(40, 3, 1))
dat3 <- c(rnorm(25, -4, 0.8), rnorm(50, 0, 0.8), rnorm(40, 4, 0.8))
xlist <- list(Unimodal = dat1, Bimodal = dat2, Multimodal = dat3)

The default call already gives an informative result:

bixout <- bixplot(xlist, main = "bixplot")

The returned list bixout contains the cluster structure and five-number summaries for each variable:

bixout

For comparison, below we display the violin plot and the bixplot side by side. The bixplot separates the modes of the bimodal and trimodal variables and draws a dedicated box for each cluster.

ylim <- c(-8.5, 9)
par(las = 1, mfrow = c(1, 2))
par(mar = c(2.1, 2.2, 1.7, 2))
viocol <- adjustcolor("chocolate3", alpha.f = 0.5)
vioplot::vioplot(xlist, ylim = ylim, main = "", col = viocol)
title(main = "violin plot", line = 0.5, cex.main = 1)
par(mar = c(2.1, 2.2, 1.7, 0.2))
bixplot(xlist, ylim = ylim, main = "")
title(main = "bixplot", line = 0.5, cex.main = 1)
par(oldpar)

Latency data and penguin bill length

Next we apply bixplot to two real datasets: a latency dataset with three variables and the penguin bill length variable grouped by island.

data("data_latenc")
dim(data_latenc) # 40 rows, 3 columns

par(las = 1, mfrow = c(2, 2))
mar1 <- c(2.1, 2.4, 2.3, 2)
mar2 <- c(2.1, 2.2, 2.3, 0.2)
viocol <- adjustcolor("chocolate3", alpha.f = 0.5)

par(mar = mar1)
vioplot::vioplot(data_latenc, main = "", col = viocol)
title(main = "violin plot", cex.main = 1.2, line = 0.6)

par(mar = mar2)
mymodeCol <- c("cadetblue3", "hotpink2")
bixplot(data_latenc, main = "", cutmin = 0, cutmax = 300,
        ylim = c(0, 300), modeCol = mymodeCol)
title(main = "bixplot", cex.main = 1.2, line = 0.6)

# Score the islands in a meaningful order
islandscore <- rep(NA, length(penguins$island))
islandscore[penguins$island == "Torgersen"] <- 1
islandscore[penguins$island == "Biscoe"]    <- 2
islandscore[penguins$island == "Dream"]     <- 3
ylim <- c(29, 62)

par(mar = mar1)
vioplot::vioplot(bill_len ~ reorder(island, islandscore, mean),
                 data = penguins, main = "",
                 col = viocol, xlab = "", ylab = "",
                 ylim = ylim, cex.axis = 1)
title(main = "violin plot", cex.main = 1.2, line = 0.6)

par(mar = mar2)
bixplot(bill_len ~ reorder(island, islandscore, mean),
        data = penguins, main = "", ylim = ylim,
        bodyCol = "gray40", bodyOpaque = 0.3)
title(main = "bixplot", cex.main = 1.2, line = 0.6)
par(oldpar)

For the latency variables, the bixplot correctly identifies a bimodal structure that is obscured by the smooth envelope of the violin plot. For the penguin bill lengths, the Biscoe island group is clearly bimodal, reflecting the presence of two penguin species.


Iris data

The standardized iris dataset has four variables. The bixplot reveals that petal length and petal width have more than one mode (cluster).

par(mfrow = c(1, 2))
par(mar = c(4, 2, 2, 0.1))

diris <- data.frame(scale(iris[, 1:4]))
colnames(diris) <- c("Sepal.L", "Sepal.W", "Petal.L", "Petal.W")

mymodeCol <- c("cadetblue3", "hotpink2",
               "cadetblue3", "hotpink2",
               "lawngreen")
bixplot(diris, main = "", cut = 3, col = "gray75",
        bodyOpaque = 0.6, rugW = 0.16,
        rugoutCol = "red", curveLwd = 0.5,
        modeCol = mymodeCol,
        ylim = c(-3.6, 4.2), yaxs = "i",
        xlab = "standardized variables")
title(main = "bixplot display of iris data", cex.main = 1, line = 0.7)

We can also overlay marginal bixplots on top of a scatter plot, to show the marginal distributions of the axes variables. Here we add horizontal and vertical half-bixplots to a scatter plot of petal length versus petal width.

par(mar = c(4, 4, 2, 0.2))
x <- diris$Petal.L
y <- diris$Petal.W
xlim <- c(-2.3, 2.4)
ylim <- c(-2.2, 2.4)
xyratio <- (xlim[2] - xlim[1]) / (ylim[2] - ylim[1])

plot(x, y, xlim = xlim, ylim = ylim, pch = 16,
     xlab = "", ylab = "", xaxs = "i", yaxs = "i")
title(xlab = "Petal.L", line = 2)
title(ylab = "Petal.W", line = 2)
title(main = "petal length versus petal width", cex.main = 1, line = 0.7)

bixplot(x, add = TRUE, horizontal = TRUE,
        at = ylim[1] + 0.015, cutmin = xlim[1],
        boxwex = 0.9, curveLwd = 0.5,
        border = "black", side = "second",
        bodyOpaque = 0.6)
bixplot(y, add = TRUE, horizontal = FALSE,
        at = xlim[1] + 0.015, cutmin = ylim[1],
        boxwex = xyratio * 0.9,
        modeCol = c("cadetblue3", "hotpink2", "lawngreen"),
        curveLwd = 0.5, side = "second",
        bodyOpaque = 0.6)
par(oldpar)

The argument xyratio scales the vertical bixplot so that its density body has the same visual height as the horizontal one.


Body sizing options for multimodal variables

The bodysize argument controls how the density bodies of individual modes within a variable are sized relative to each other. The three options are illustrated below on the (bimodal) petal length variable.

par(mfrow = c(1, 3))
par(mar = c(2.5, 2, 2, 1))

for (bs in c("width_is_constant", "area_is_constant", "area_from_count")) {
  bixplot(diris[, 3], main = "", cut = 3, col = "gray75",
          bodyOpaque = 0.5, bodysize = bs, curveLwd = 0.5,
          ylim = c(-2.2, 2.4), yaxs = "i", names = "Petal.L")
  title(main = switch(bs,
                      width_is_constant = "equal width",
                      area_is_constant  = "equal area",
                      area_from_count   = "area from count"),
        cex.main = 1, line = 0.7)
}
par(oldpar)

With "area_from_count" (the default), the larger cluster receives a proportionally higher area, making cluster sizes immediately apparent.


Bill length by island and sex

Here we use the formula interface to plot bill length split by the interaction of sex and island. We compare a standard multi-group bixplot with the side = "both" layout, which pairs male and female distributions on opposing sides of a shared axis.

par(las = 1, mfrow = c(1, 2))

islscore <- rep(NA, length(penguins$island))
islscore[penguins$island == "Torgersen"] <- 1
islscore[penguins$island == "Biscoe"]    <- 2
islscore[penguins$island == "Dream"]     <- 3
ylim    <- c(27, 64)
mynames <- c("Torger.F", "Torger.M", "Biscoe.F",
             "Biscoe.M", "Dream.F",  "Dream.M")
mycol      <- c("slateblue2", "orange")
mymodeCol  <- c("darkorchid1", "slateblue3",
                "goldenrod1",  "darkorange2")

par(mar = c(2.9, 2, 0.8, 1))
bixplot(bill_len ~ sex + reorder(island, islscore, mean),
        data = penguins, names = mynames, modeCol = mymodeCol,
        bodyOpaque = 0.6, ylim = ylim, bodyW = 0.9,
        col = mycol, main = "", rugCol = "black", las = 1)
legend("topleft", legend = c("female", "male"),
       fill = c("slateblue3", "orange"), cex = 1.5)

par(mar = c(2.9, 3, 0.8, 0.1))
bixplot(bill_len ~ sex + reorder(island, islscore, mean),
        data = penguins, main = "", rugCol = "black",
        bodyOpaque = 0.6, ylim = ylim, bodyW = 0.9,
        col = mycol, stickCol = "gray10", stickLwd = 1,
        side = "both", modeCol = mymodeCol, las = 1,
        names = c("Torgersen", "Biscoe", "Dream"))
legend("topleft", legend = c("female", "male"),
       fill = mycol, cex = 1.5)
par(oldpar)

With side = "both" the number of axis labels is halved and the male/female pair for each island shares one axis, making comparisons within an island more direct. Here, male penguins tend to have larger bill lengths than females.


Rug colored by an external numeric variable

The rugNumeric argument colors each rug tick mark according to a continuous covariate via a smooth color palette. Here the rug is colored by body mass, and a color bar legend is added automatically.

par(las = 1, mfrow = c(1, 1))
par(mar = c(3.2, 3.2, 2, 2.6))

scpenguins <- scale(penguins[, c(4, 3, 5)])
varnames   <- c("bill_depth", "bill_length", "flipper_length")

bixplot(scpenguins, rugNumeric = penguins$body_mass,
        main = "", boxW = 0.30, rugW = 0.24,
        names = varnames, boxLwd = 1.2, boxOpaque = 1,
        curveLwd = 2, colorbarW = 0.16, cex.colorbar = 0.8,
        bodysize = "width_is_constant",
        bodyCol = "grey90", modeCol = "grey90",
        xlab = "feature", ylab = "standardized value")
title(main = "penguins with rug color by body_mass       ",
      line = 0.6)
par(oldpar)

We see that heavier penguins (green) tend to have a larger flipper length, revealing a positive association between these two variables.


Rug colored by a factor variable

The rugFactor argument colors each rug tick mark by a factor level. Below we display the same three penguin measurements with rug color indicating species, this time using a horizontal layout.

par(mfrow = c(1, 1))
par(mar = c(3.2, 3.8, 2, 0.2))

rugFactorColors <- c("red", "blue", "forestgreen")

bixplot(scpenguins, rugFactor = penguins$species,
        main = "", boxW = 0.30, rugW = 0.24,
        ylim = c(-3, 4), names = varnames,
        boxLwd = 1.2, boxOpaque = 1, curveLwd = 2,
        horizontal = TRUE, bodysize = "width_is_constant",
        bodyCol = "grey90", modeCol = "grey90",
        xlab = "standardized value", ylab = "feature",
        rugFactorColors = rugFactorColors, las = 0)
title(main = "penguins with rug color by species", line = 0.6)
legend(x = 2.36, y = 2.8,
       legend = c("Adelie", "Chinstrap", "Gentoo"),
       fill = rugFactorColors, cex = 0.88)
par(oldpar)

The species labels in the rug indicate that Gentoo penguins tend to have smaller bill_depth and longer flipper_length.


Top Gear car data

We now illustrate bixplot on the Top Gear dataset from the robustHD package, which contains performance and price data for about three hundred cars.

data(TopGear, package = "robustHD")
scars        <- TopGear[, c(14, 12, 5, 7, 8)]
scars[, 3]   <- log(scars[, 3])
colnames(scars)[3] <- "log(Price)"
scars[, 1:4] <- scale(scars[, 1:4])

An initial bixplot reveals a severe outlier in the Weight variable:

bixplot(scars[, 1:4])
which.min(TopGear$Weight) # row 199
TopGear[199, c(1, 2, 14, 12, 5, 7)]

The Peugeot 107 has a recorded weight of 210, which is clearly a data error. We set it to NA and replot:

scars[199, 1] <- NA
bixplot(scars[, 1:4])

There is also an outlier in TopSpeed: the value 50 of the Renault Twizy, that is a tiny city car. This value is correct, so it is retained.

which.min(TopGear$TopSpeed) # row 220
TopGear[220, c(1, 2, 14, 12, 5, 7)]

We now produce a final two-panel figure: a bixplot of the four standardized variables alongside a scatter plot of TopSpeed versus Displacement with marginal bixplots added on each axis.

par(las = 1, mfrow = c(1, 2))
par(mar = c(4, 2, 2, 0.1))

bixplot(scars[, 1:4], ylim = c(-3.8, 5.2), main = "",
        col = "darkgoldenrod2", yaxs = "i",
        modeCol = c("cadetblue3", "hotpink2"),
        bodyOpaque = 0.6, las = 1)
title(main = "standardized Top Gear variables",
      cex.main = 1, line = 0.7)

par(mar = c(4, 4, 2, 0.1))
x <- jitter(scars[, 4])
y <- jitter(scars[, 2])
mycol <- rep(NA, nrow(scars))
mycol[scars[, 5] == "Front"] <- "red"
mycol[scars[, 5] == "Rear"]  <- "forestgreen"
mycol[scars[, 5] == "4WD"]   <- "orange"

plot(x, y, xlim = c(-1.999, 3.999), ylim = c(-2.999, 4.5),
     xaxs = "i", yaxs = "i", xlab = "", ylab = "",
     pch = 16, cex = 1, col = mycol, las = 1)
title(xlab = "Displacement", line = 2)
title(ylab = "TopSpeed",     line = 2)
title(main = "TopSpeed versus Displacement",
      cex.main = 1, line = 0.7)

bixplot(scars[, 4], add = TRUE, at = -2.98, horizontal = TRUE,
        side = "second", boxwex = 2, bodyOpaque = 0.6)
bixplot(scars[, 2], add = TRUE, at = -1.985, horizontal = FALSE,
        side = "second", boxwex = 1.4, bodyOpaque = 0.6)

legend(x = -1.5, y = 4, title = "DriveWheel",
       legend = c("Front", "Rear", "4WD"),
       fill = c("red", "forestgreen", "orange"), cex = 1)
par(oldpar)

Rear-wheel-drive cars (green) span the full range of displacement and are concentrated at the higher end of TopSpeed, whereas front-wheel-drive cars (red) have rather low values of both TopSpeed and Displacement.


Tooth growth and iris data with side = "both"

The side = "both" option is particularly useful for two-group comparisons, as it places paired distributions on opposite sides of a shared axis.

par(las = 0, mfrow = c(1, 2))
par(mar = c(3.5, 2, 2, 1))

bixplot(len ~ supp * dose,
        data = ToothGrowth, side = "both",
        col = c("orange", "slateblue3"),
        main = "", xlab = "Vitamin C dose (mg)",
        ylab = "tooth length", bodyOpaque = 0.7,
        ylim = c(-1, 44), yaxs = "i", las = 0,
        stickCol = "black", stickLwd = 1)
title(main = "guinea pigs' tooth growth by supplement type",
      cex.main = 1, line = 0.7)
legend("topleft", title = "supplement",
       legend = c("OJ", "AA"),
       fill = c("orange", "slateblue3"), cex = 1)

par(mar = c(3.5, 2.8, 2, 0.1))
diris <- data.frame(scale(iris[, 1:4]))
colnames(diris) <- c("Sepal.L", "Sepal.W", "Petal.L", "Petal.W")

bixplot(diris, side = "both", main = "",
        bodysize = "width_is_constant",
        col = c("orange", "slateblue3"),
        modeCol = c("cadetblue3", "hotpink2",
                    "lawngreen", "gray60", "cyan3"),
        stickCol = "red", stickLwd = 1,
        bodyOpaque = 0.7, las = 0,
        xlab = "standardized measurements")
title(main = "iris data by length and width",
      cex.main = 1, line = 0.7)
legend("top", legend = c("length", "width"),
       fill = c("orange", "slateblue3"), cex = 1)
par(oldpar)

In the tooth growth data we see that the orange juice (OJ) supplement yields higher tooth growth than ascorbic acid (AA), except at the highest dose, where the medians of OJ and AA align and AA has a larger spread than OJ. In the right panel, the iris data are now shown on two vertical axes instead of four.


Iris data with rug coloring

Finally, we revisit the iris data with rug colors tied to an external variable, first a continuous one (sepal length) and then a factor (species).

par(las = 1, mfrow = c(1, 1))
par(mar = c(2.2, 2.2, 2, 2.6))

bixplot(diris, main = "",
        rugNumeric = iris$Sepal.Length, colorbarW = 0.15,
        bodyCol = "grey80", modeCol = "grey80", las = 1)
title(main = "iris data with rug color by sepal length      ",
      line = 0.6)
par(oldpar)

Flowers with high sepal length (green) tend to have high petal length and petal width. There is no such relation between sepal length and sepal width.

par(las = 1, mfrow = c(1, 1))
par(mar = c(2.2, 2.2, 2, 0.2))

rugFactorColors <- c("red", "blue", "forestgreen")
bixplot(diris, main = "", rugFactor = iris$Species,
        bodyCol = "grey80", modeCol = "grey80",
        rugFactorColors = rugFactorColors, las = 1)
title(main = "iris data with rug color by species", line = 0.6)
legend("topright",
       legend = c("setosa", "versicolor", "virginica"),
       fill = rugFactorColors, cex = 1)
par(oldpar)

The three species nicely match the clusters in Petal.W and Petal.L. The colors inside Sepal.L are roughly similar, whereas Sepal.W again behaves differently.


Titanic data

As a final example we use the Titanic dataset from the classmap package. We compare the standardized age and log-fare distributions for casualties and survivors using the side = "both" layout, and then color the rug by cabin class.

data(data_titanic, package = "classmap")
titanic         <- data_titanic[1:100, ]
titanic$logFare <- log(titanic$Fare + 1)
titanic$Pclass  <- as.factor(titanic$Pclass)
titanic[, c(5, 14)] <- scale(titanic[, c(5, 14)])

xt <- list(titanic[titanic$y == "casualty",  5],
           titanic[titanic$y == "survived",  5],
           titanic[titanic$y == "casualty",  14],
           titanic[titanic$y == "survived",  14])
names(xt) <- c("standardized Age.C",
               "standardized Age.S",
               "standardized log(Fare).C",
               "standardized log(Fare).S")
par(las = 1, mfrow = c(1, 2))
par(mar = c(2.4, 2, 2, 1))

mycol <- c("coral2", "cadetblue3")

bixplot(xt, side = "both", main = "", col = mycol,
        stickLwd = 1, stickCol = "purple",
        boxW = 0.18, rugW = 0.10, las = 1)
title(main = "Titanic data by survival", line = 0.6)
legend("top", legend = c("casualty", "survived"),
       fill = mycol, cex = 0.9)

par(mar = c(2.4, 3, 2, 0.1))
bixplot(titanic[, c(5, 14)], main = "",
        rugFactor = titanic$Pclass, boxW = c(0.22, 0.18),
        names = c("standardized Age",
                  "standardized log(Fare)"), las = 1)
title(main = "Titanic data by cabin class", line = 0.6)
legend("top", title = "cabin class",
       legend = c("1", "2", "3"),
       fill = c("red", "blue", "forestgreen"), cex = 0.9)
par(oldpar)

In the left panel we see that the median of Age and especially its mode are higher for the survivors, with a smaller effect of Fare. On the right we see that both Age and Fare have an effect on cabin class.




Try the classmap package in your browser

Any scripts or data that you put into this service are public.

classmap documentation built on April 29, 2026, 5:10 p.m.