| X | R Documentation |
lessR introduces the concept of a data view visualization function, in which the choice of visualization function directly reflects the structure of the data and the analyst's goal for understanding the data. The function X() visualizes the distribution of a continuous variable along with
associated summary statistics. The specific representation is selected with the
type argument:
Histogram with type = "histogram" (default)
Density plot with type = "density"
Box plot with type = "box"
Violin plot with type = "violin"
Strip (one-dimensional scatter) plot with type = "strip"
Integrated violin-box-strip (VBS) plot with type = "vbs"
Two-dimensional scatter plot for a by variable with type = "scatter"
Stratification divides the distribution into groups.
Use by to overlay the groups within a single panel, or facet to
display each group, or combination of groups, in separate panels.
When running in RStudio, static displays appear in the
Plots window, while histograms and density plots also appear as
interactive Plotly visualizations in the Viewer window.
X(
# ----------------------------------------------
# Data from which to construct the visualization
x=NULL, by=NULL, facet=NULL, data=d, filter=NULL,
# ------------------------------------------
# Type of plot for the continuous variable x
type = c("histogram", "freq_poly", "density", "scatter",
"violin", "box", "strip", "bs", "vbs"), # violin, box, strip
stat=c("count", "proportion", "density"),
n_row=NULL, n_col=NULL, aspect="fill",
# -----------------------------------------------------------------
# Analogy of physical Marks on paper that create the bars and labels
theme=getOption("theme"),
fill=getOption("bar_fill_cont"),
color=getOption("bar_color_cont"),
transparency=getOption("trans_bar_fill"),
# -----------------------------------------------------
# Form of the histogram - bin the continuous variable x
counts=FALSE,
bin_start=NULL, bin_width=NULL, bin_end=NULL, breaks="Sturges",
cumulate=c("off", "on", "both"), reg="snow2", # Cumulative histogram
# -----------------------------------------------------
# Density (smooth curve) plot
show_histogram=TRUE,
bandwidth=NULL, kind=c("general", "normal", "both"),
fill_normal=NULL, fill_hist=getOption("se_fill"),
color_normal="gray20", line_width=NULL,
x_pt=NULL, y_axis=FALSE,
rug=FALSE, color_rug="black", size_rug=0.5,
# -------------------------------------------------
# Integrated violin/box/scatter plot
vbs_plot="vbs", vbs_ratio=0.9, bw=NULL, bw_iter=10,
violin_fill=getOption("violin_fill"),
box_fill=getOption("box_fill"),
pt_size=NULL,
vbs_pt_fill="black",
vbs_mean=FALSE, fences=FALSE, n_min_pivot=1,
k=1.5, box_adj=FALSE, a=-4, b=3,
ID="row.name", ID_size=0.60,
MD_cut=0, out_cut=0, out_shape="circle", out_size=1,
# -----------------------------------------------------------------
# Labels for axes, values, and legend if x and by variables, margins
xlab=NULL, ylab=NULL, main=NULL, sub=NULL,
lab_adjust=c(0,0), margin_adjust=c(0,0,0,0),
rotate_x=getOption("rotate_x"), rotate_y=getOption("rotate_y"),
offset=getOption("offset"),
scale_x=NULL,
axis_fmt=c("K", ",", ".", ""), axis_x_pre="", axis_y_pre="",
# ----------------------------------------------------------------------
# Draw one or more objects, text, or geometric figures, on the histogram
add=NULL, x1=NULL, y1=NULL, x2=NULL, y2=NULL,
# ---------------------------------------
# Output: turn off, chart to PDF file, decimal digits, markdown file
quiet=getOption("quiet"), do_plot=TRUE,
use_plotly=getOption("lessR.use_plotly"),
pdf_file=NULL, width=6.5, height=6, digits_d=NULL, Rmd=NULL,
# --------------------------------------
# Deprecated, removed in future versions
n_cat=getOption("n_cat"),
rows=NULL, facet1=NULL, facet2=NULL,
# ---------------------------------
# Other
eval_df=NULL, fun_call=NULL, ...)
x |
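Variable(s) to analyze. May be a single numeric variable, a vector
in the workspace, multiple variables combined with c() or given as a range with :, or an entire data frame. If omitted, all numeric variables in data are analyzed. |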
by |
A categorical grouping variable. When supplied, overlapping histograms or other distributional displays are drawn on the same coordinate system. |
facet |
A categorical conditioning variable, or a vector of two such variables, that activates Trellis (lattice) graphics to produce separate panels for each level or combination of levels. |
data |
Optional data frame that contains the variables of interest.
Default is d. |
filter |
Logical expression or integer index vector identifying the rows to retain for analysis. |
type |
Type of univariate display. Default is "histogram". |
stat |
Specifies the vertical axis content: "count", "proportion", or "density".
Default is "count". |
n_row |
Optional number of rows for multi-panel displays created
by |
n_col |
Optional number of columns for multi-panel displays.
If |
aspect |
Aspect ratio of lattice panels (height/width). Default
is "fill". |
theme |
Color theme. Use |
fill |
Fill color for bars or density regions. May be a single color,
named palette ( |
color |
Border color for bars or density curves. May be a vector.
Default follows |
transparency |
Transparency of filled areas. Default from
getOption("trans_bar_fill"). |
counts |
If |
bin_start |
Optional starting value of the first bin. |
bin_width |
Optional bin width, with or without a specified bin start. |
bin_end |
Optional value that must fall inside the final bin. |
breaks |
Method used to compute bins or an explicit set of breakpoints.
Default is "Sturges". |
cumulate |
Produces a cumulative histogram when set to "on"; "both" superimposes the regular histogram on the cumulative histogram. Default is "off". |
reg |
Color of the regular histogram when cumulate = "both". |
show_histogram |
When plotting a density curve, optionally draw a histogram behind it. |
bandwidth |
Bandwidth for kernel density estimation. Default begins with
|
kind |
Determines whether to display the general density curve, the normal density curve, or both. |
fill_normal |
Fill color used for the normal density curve. |
fill_hist |
Fill color for histograms under density curves. |
color_normal |
Border color for the normal density curve. |
line_width |
Line width of density curves. |
x_pt |
Value on the x-axis from which to illustrate a unit interval
of density area. Applies when |
y_axis |
If |
rug |
If |
color_rug |
Color of rug ticks. |
size_rug |
Line width of rug ticks. |
vbs_plot |
Controls which components of the violin-box-strip display
appear. Characters |
vbs_ratio |
Relative height of the violin (and box) component. |
bw |
Bandwidth for violins; larger values yield smoother shapes. |
bw_iter |
Number of iterations for modifying the bandwidth to produce smoother violins. |
violin_fill |
Fill color for violins. |
box_fill |
Fill color for box plots. |
pt_size |
Size of points in strip plots. |
vbs_pt_fill |
Point color in the strip component of the VBS display. Default is black. |
vbs_mean |
If |
fences |
If |
n_min_pivot |
Minimum sample size required for a group to appear in a VBS pivot table (default 1). |
k |
IQR multiplier for determining whisker length. |
box_adj |
If |
a, b |
Scaling factors for adjusted boxplot whiskers. |
ID |
Variable supplying labels for selected points (outlier identification). Defaults to row names. |
ID_size |
Point label text size. |
MD_cut |
Mahalanobis distance cutoff for outliers in 2-variable plots. |
out_cut |
Number or proportion of extreme points to label. Interpreted differently for univariate vs. bivariate displays. |
out_shape |
Plotting symbol for outliers. |
out_size |
Point size for labeled outliers. |
xlab |
X-axis label. Defaults to variable name or label. |
ylab |
Y-axis label. Defaults to "Frequency" or "Proportion". |
main |
Main title for the plot. |
sub |
Subtitle (not yet implemented). |
lab_adjust |
Two-element vector adjusting axis-label positions. |
margin_adjust |
Four-element vector adjusting plot margins. |
rotate_x |
Rotation (in degrees) of x-axis tick labels. |
rotate_y |
Rotation (in degrees) of y-axis tick labels. |
offset |
Distance between tick labels and axis. |
scale_x |
Vector specifying start, end, and number of intervals for the x-axis. |
axis_fmt |
Formatting of axis numbers ( |
axis_x_pre |
Prefix for x-axis labels (e.g., |
axis_y_pre |
Prefix for y-axis labels. |
add |
Add text or geometric objects (rectangles, lines, arrows, horizontal/vertical lines, etc.) to the plot. |
x1 |
First x-coordinate for added objects. |
y1 |
First y-coordinate. |
x2 |
Second x-coordinate (for applicable object types). |
y2 |
Second y-coordinate. |
quiet |
If |
do_plot |
If |
use_plotly |
If |
pdf_file |
Optional file name to direct static graphics output to PDF. |
width |
Width of plot window (inches). |
height |
Height of plot window (inches). |
digits_d |
Digits of precision for printed statistics. |
Rmd |
Name of an R Markdown file to generate containing the analysis. |
n_cat |
Maximum number of unique integer values to treat as categorical. Deprecated. |
rows |
Deprecated; use filter. |
facet1 |
Deprecated; use facet. |
facet2 |
Deprecated; use a two-element vector in facet. |
eval_df |
If |
fun_call |
Function call used internally for knitr compatibility. |
... |
Additional graphics parameters passed to |
OVERVIEW
X() implements the univariate, distributional view for numeric variables.
Internally it still relies on the base R hist function
for histogram binning when type = "histogram", but extends that core
infrastructure to a larger family of displays including frequency polygons,
kernel density estimates, violin plots, strip plots, and the integrated
violin-box-strip (VBS) display. For histogram-based displays, the
freq option of hist is always set to FALSE, so
counts, proportions, or densities are controlled through the lessR
arguments (stat and type) rather than directly
through hist.
For plotting densities, the recommended approach is to use
type = "density", which computes and displays a kernel density estimate.
The older usage of stat = "density" is maintained for backward
compatibility but is not recommended for new code.
VARIABLES AND TRELLIS PLOTS
At a minimum there is one primary numeric variable, x, which results in a
single univariate display (histogram, density, VBS, etc.). Facet graphics
(also called Trellis graphics), from Deepayan Sarkar's lattice package,
may be requested by supplying one or two categorical conditioning variables
to the facet argument. A single facet variable produces one panel
per level of that variable; a two-element vector in facet produces a
panel for each combination of levels of the two facet variables.
In each panel, the same numeric primary variable x is displayed,
conditioning on the facet levels. The combination of x with
facet thus yields a grid of small multiples that share scales and
styling, facilitating comparisons across subgroups without changing the
underlying analysis.
BOXPLOTS AND THE VBS DISPLAY
For a single variable, the preferred summary display is often the integrated
violin-box-strip (VBS) plot, requested with type = "vbs". Only the
violin or only the box portion can be obtained either via the corresponding
aliases or by setting vbs_plot to "v" or "b". To view
a boxplot of a continuous variable across the levels of a categorical variable,
either as part of the full VBS plot or alone, there are two primary styles:
1. X(x, by = g, type = "box")
2. X(x, facet = f, type = "box")
Both styles convey the same information about numeric-by-category distributions.
The difference is visual: with by, multiple boxplots appear in a single
panel using the qualitative color palette from getColors, with
all hues chosen to be comparable in perceived saturation and brightness. With
facet, a separate panel is drawn for each group, typically using a
single hue. The same default qualitative colors are used throughout lessR,
including in Chart.
DATA
The data may be supplied as a vector from the global environment (user
workspace), as one or more variables within a data frame, or as an entire
data frame. The default input data frame is d. A different source
data frame may be specified with the data argument. When multiple
variables are listed, only those of numeric type are analyzed by X().
Variables in a data frame are referenced directly by their names; there is no
need to use d$name, with, or attach. If a
name exists both as a vector in the global environment and as a variable in
the specified data frame, the vector in the global environment takes
precedence.
To obtain a histogram (or other univariate display) for each numeric variable
in the default data frame d, call X() without specifying x.
For a different data frame, set the data argument accordingly. To
restrict analysis to a subset of variables, specify them with : or
c, such as m01:m03 or c(m01, m02, m03).
The filter parameter subsets rows (cases) of the input data frame
according to either (a) a logical expression or (b) a vector of integer row
indices. Logical expressions use standard R logical operators as
described in Logic (e.g., & for "and", | for "or",
! for "not") and relational operators as described in
Comparison (e.g., == for equality, != for
inequality, > for "greater than"). A vector of integers can be created
with standard R syntax; see the Examples section for illustrations.
COLORS
Colors for individual elements of the plot can be controlled directly through
arguments such as fill (for areas) and color (for borders), or
indirectly through the global style system. The style function
allows selection of a color theme for the entire analysis. The default theme
is "lightbronze", with additional themes such as "gray" for
grayscale and several color-emphasis themes (e.g., "red",
"green").
For black backgrounds and partially transparent colors, use
style(sub_theme = "black"). For all color options, including
fill, a setting of "off" is equivalent to "transparent".
For fill, you may specify a single color, a vector of colors, or a color
range. Besides standard color names, lessR provides multiple hue-balanced
palettes, including "hues", as well as pre-specified ranges
"blues", "reds", and "greens". Standard R palettes
"rainbow", "terrain", and "heat" are also available.
Custom ranges can be generated via getColors.
HISTOGRAMS AND RELATED OUTPUT
When type = "histogram", the output includes a frequency histogram with
lessR's default background and grid styling, along with optional relative
frequency and/or cumulative histograms. Summary statistics and a table of bin
information (bin boundaries, midpoints, counts, proportions, cumulative
counts, and cumulative proportions) are provided as part of the returned
object.
Bin construction can follow the default Sturges rule or be customized via
bin_width, bin_start, and bin_end, or via the
breaks argument (which can accept numeric vectors or named rules such
as "Scott" or "FD" as in hist). lessR adds more
informative error checking and guidance when user-specified bins do not fully
span the data or otherwise lead to problematic binning.
If multiple variables are supplied (including a complete data frame), X()
analyzes each numeric variable in turn. The related CountAll
function performs a similar role across mixed types, producing bar charts for
categorical variables and histograms for numeric variables. Faceting via
facet can be combined with X() to create trellised displays
across conditioning variables.
VARIABLE LABELS
If variable labels exist (e.g., created by Read), they are used
by default for the horizontal axis label and in the text output. Variable
names are still available, but labels provide more descriptive and
publication-ready annotations.
ONLY VARIABLES ARE REFERENCED
In lessR functions, the primary argument x (and any other variable
arguments) must be specified as variable names, not expressions. The variables
must exist either in the referenced data frame (such as the default d)
or in the global environment. Direct expressions are not evaluated. For example:
> X(rnorm(50)) # does NOT work
Instead, assign the expression to a named object and then reference that name:
> Y <- rnorm(50) # create vector Y in user workspace
> X(Y) # directly reference Y
ERROR DETECTION
A common error for beginning users of hist is to specify a
sequence of break points that does not fully span the data, often constructed
with seq. Base R then produces a cryptic error. In
contrast, X() (and the Histogram() alias) checks for this
problem before calling hist and, if detected, issues a more
informative message along with guidance on how to correct the specification.
In addition, full control of the binning is available via bin_width
and bin_start. If bin_start is specified without bin_width,
the bin width is determined by the default Sturges rule.
PDF OUTPUT
To obtain PDF output directly, use the pdf_file argument, optionally
with width and height to control the device size in inches. The
resulting files are written to the current working directory, which can be set
explicitly via setwd. This mechanism facilitates high-quality,
publication-ready graphics saved directly from X().
The output may be assigned to an R object; otherwise it is printed directly
to the console. The returned object contains two types of components:
Readable output: character strings such as tables and summaries, formatted for display.
Statistics: numerical values suitable for further computation or inclusion in dynamic documents.
These components are designed to support R Markdown workflows. Any element of the
saved object can be inserted into an R Markdown document by referencing its name,
preceded by the object name and the $ operator (see examples).
Only elements relevant to the requested analysis are returned.
For example, bin proportions appear only when a histogram is requested.
To save the results, assign the output to an object, such as h <- Histogram(Salary).
Use names(h) to view the available components, and display any one by typing, for
example, h$out_freq. Output can be viewed interactively at the console or embedded in
R Markdown documents along with narrative interpretation.
READABLE OUTPUT
out_suggest: Suggestions for other similar analyses
out_summary: Summary statistics
out_freq: Frequency distribution
out_outliers: Outlier analysis
STATISTICS
bin_width: Bin width
n_bins: Number of bins
breaks: Breaks of the bins
mids: Bin midpoints
counts: Bin counts
prop: Bin proportions
cumulate: Bin cumulative counts
cprop: Bin cumulative proportions
David W. Gerbing (Portland State University; gerbing@pdx.edu)
Gerbing, D. W. (2023). R Data Analysis without Programming: Explanation and Interpretation, 2nd edition, Chapter 5, NY: Routledge.
Gerbing, D. W. (2020). R Visualizations: Derive Meaning from Data, Chapter 4, NY: CRC Press.
Gerbing, D. W. (2021). Enhancement of the Command-Line Environment for use in the Introductory Statistics Course and Beyond, Journal of Statistics and Data Science Education, 29(3), 251-266.
Hubert, M. and Vandervieren, E. (2008). An adjusted boxplot for skewed distributions, Computational Statistics and Data Analysis 52, 5186-5201.
Sarkar, D. (2008). Lattice: Multivariate Data Visualization with R. Springer.
Sievert, C. (2020). Interactive Web-Based Data Visualization with R, plotly, and shiny. Chapman and Hall/CRC. URL: https://plotly.com/r/
XY, Chart, getColors, style.
# get the data
d <- rd("Employee")
# make sure default style is active
style()
# --------------------
# different histograms
# --------------------
# histogram with all defaults
X(Salary, type="histogram")
# with deprecated alias
Histogram(Salary)
# output saved for later analysis into object h
h <- hs(Salary)
# view full text output
h
# view just the outlier analysis
h$out_outliers
# list the names of all the components
names(h)
# histogram with no borders for the bars
X(Salary, color="off")
# just males employed more than 5 years
X(Salary, filter=(Gender=="M" & Years > 5))
# histogram with red bars, black background, and black border
style(panel_fill="black", fill="red", panel_color="black")
X(Salary)
# or use a lessR pre-defined sequential color palette
# with some transparency
X(Salary, fill="rusts", color="brown", transparency=.1)
# histogram with purple color theme, translucent gold bars
style("purple", sub_theme="black")
X(Salary)
# back to default color theme
style()
# histogram with specified bin width
# can also use bin_start
X(Salary, bin_width=12000)
# histogram with rotated axis values, offset more from axis
# suppress text output
style(rotate_x=45, offset=1)
X(Salary, quiet=TRUE)
style()
# histogram with specified bin width and a custom x-axis label
X(Salary, bin_width=20000, xlab="My Variable")
# histogram with bins calculated with the Scott method and values displayed
X(Salary, breaks="Scott", counts=TRUE, quiet=TRUE)
# histogram with the number of suggested bins, with proportions
X(Salary, breaks=15, stat="proportion")
# histogram with non-default values for x- and y-axes
d[2,4] <- 45000
X(Salary, scale_x=c(20000,160000,8))
# ----------------
# Trellis graphics
# ----------------
Histogram(Salary, facet=Dept)
# ---------------------
# cumulative histograms
# ---------------------
# cumulative histogram with superimposed regular histogram, all defaults
X(Salary, cumulate="both")
# cumulative histogram plus regular histogram
X(Salary, cumulate="both", reg="mistyrose")
# -------------
# density plots
# -------------
# default density plot
X(Salary, type="density")
# normal curve and general density curves superimposed over histogram
# all defaults
X(Salary, type="density", kind="both")
# display only the general estimated density
# so do not display the estimated normal curve
# specify the bandwidth for the general density curve
X(Salary, type="density", bandwidth=8000)
# display only the general estimated density and a corresponding
# interval of unit width around x_pt
X(Salary, type="density", x_pt=40000)
# densities for all specified numeric variables in a list of variables
# e.g., use the combine or c function to specify a list of variables
X(c(Years, Salary), type="density")
# -------------------------------------------------
# histograms for data frames and multiple variables
# -------------------------------------------------
# create data frame, d, to mimic reading data with Read function
# d contains both numeric and non-numeric data
d <- data.frame(rnorm(50), rnorm(50), rnorm(50), rep(c("A","B"),25))
names(d) <- c("X","Y","Z","C")
# although data not attached, access the variable directly by its name
X(X)
# histograms for all numeric variables in data frame called d
# d is the default name, so does not need to be specified with data
X()
# histogram with specified options, including red axis labels
style(fill="palegreen1", panel_fill="ivory", axis_color="red")
X(counts=TRUE)
style() # reset
# histograms for all specified numeric variables
# use the combine or c function to specify a list of variables
X(c(X,Y))
# integrated violin/box/scatter plot (VBS)
d <- Read("Employee")
X(Salary, type="vbs")
X(Years, by=Gender, pt_size=1.25,
fill=c("olivedrab3", "gold1"),
color=c("darkgreen", "gold4"), type="vbs")
# by variable, different colors for different values of the variable
# two panels
X(Salary, facet=Dept)
# large sample size
x <- rnorm(10000)
X(x)
# custom colors for outliers, which might not appear in this subset data
style(out_fill="hotpink", out2_fill="purple")
X(Salary, type="vbs")
style()
# no violin plot or scatterplot, just a boxplot
X(Salary, type="box")
# -----------
# annotations
# -----------
d <- rd("Employee")
# Place a message in the top-right of the graph
# Use \n to indicate a new line
X(Salary, add="Salaries in our Company", x1=100000, y1=7)
# Use style to change some parameter values
style(add_trans=.8, add_fill="gold", add_color="gold4",
add_lwd=0.5, add_cex=1.1)
# Add a rectangle around the message centered at <100000,7>
X(Salary, add=c("rect", "Salaries in our Company"),
x1=c(82000, 100000), y1=c(7.7, 7), x2=118000, y2=6.2)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.