| XY | R Documentation |
lessR introduces the concept of a data view visualization function, in which the choice of visualization function directly reflects the structure of the data and the analyst's goal for understanding the data. The function XY() visualizes the joint distribution of two continuous variables,
or more generally the relationship among multiple variables, along with associated
summary statistics. The specific representation is selected with the
type argument:
Scatterplot of points with type = "scatter" (default)
Smoothed scatterplot with type = "smooth"
Contour plot (2-dimensional density display) with type = "contour"
For the standard scatterplot, stratification divides the data into groups
via the arguments by and facet. The by argument overlays
groups in a single coordinate system for direct comparison, while
facet produces coordinated small multiples on separate panels.
Depending on the combination of variable types supplied to x and
y, XY() can also create time series, bubble plots, and
Cleveland-style dot or lollipop plots.
When running in RStudio, static graphics are drawn in the Plots
window. When use_plotly=TRUE (the default), an additional interactive
plotly version of the scatterplot, including time series displays, is also drawn in the Viewer window.
A scatterplot displays the joint values of one or more variables as points
in an \(n\)-dimensional coordinate system, in which the coordinates of each
point are the values of \(n\) variables for a single observation (row of
data). With a common syntax, XY(x, y, ...) generates a family of
related 1- or 2-dimensional relationship plots, possibly enhanced with
fitted lines, ellipses, and outlier labeling. Categorical variables should
typically be defined as factors, and date variables should be stored as
Date objects. When x is of class Date, the display is
treated as a time-indexed series.
XY() therefore serves as the relationship-oriented counterpart to
Chart() (aggregated values) and X() (univariate distributions)
within the three-function visualization framework implemented in lessR.
XY(
# -------------------------------------------------------
# Data from which to construct the plot for x- and y-axis
x, y=NULL, data=d, filter=NULL,
# --------------------------------------------------
# Stratification: Same panel or Trellis (facet) plot [x, or x and y]
by=NULL, facet=NULL,
n_row=NULL, n_col=NULL, aspect="fill",
# ----------------------------------------------------------
# Types of plots
type=c("scatter", "smooth", "contour"),
# -------------------------------
# Enhancements and customizations
# -------------------------------
# --------------------------------------------------------------------
# Analogy of physical Marks on paper that create the points and labels
# See ?style for more options with the style() function
theme=getOption("theme"),
fill=NULL, color=NULL,
transparency=getOption("trans_pt_fill"),
enhance=FALSE, means=TRUE,
size=NULL, size_cut=NULL, shape="circle", line_width=1.5,
segments=FALSE, segments_y=FALSE, segments_x=FALSE,
# ----------------------
# Sort and jitter points
sort=c("0", "-", "+"),
jitter_x=NULL, jitter_y=NULL,
# ----------------
# Outlier analysis
ID="row.name", ID_size=0.60,
MD_cut=0, out_cut=0, out_shape="circle", out_size=1,
# -------------------------------------------------
# Fit line, confidence interval, confidence ellipse
fit=c("off", "loess", "lm", "ls", "null", "exp", "quad",
"power", "log"),
fit_power=1, fit_se=0.95,
fit_color=getOption("fit_color"), fit_lwd=getOption("fit_lwd"),
fit_new=NULL, plot_errors=FALSE, ellipse=0,
# -----------------------------------------------------------------------
# Time series and forecasting, plot x values sequentially [xDate, y or Y]
ts_unit=NULL, ts_ahead=0, ts_method=c("es", "lm"),
ts_source=c("fable", "classic"), ts_agg=c("sum", "mean"),
ts_NA=NULL, ts_format=NULL, ts_fitted=FALSE, ts_PI=.95,
ts_trend=NULL, ts_seasons=NULL, ts_error=NULL,
ts_alpha=NULL, ts_beta=NULL, ts_gamma=NULL, ts_new_x=NULL,
ts_stack=FALSE, ts_area_fill="transparent", ts_area_split=0,
ts_n_x_tics=NULL,
# Run chart (indicate with .Index for the name of the x-variable)
show_runs=FALSE, center_line=c("off", "mean", "median", "zero"),
# -----------------------------------
# Lollipop chart from aggregated data [xCategorical and y]
stat=c("mean", "sum", "sd", "deviation", "min", "median", "max"),
stat_x=c("count", "proportion", "%"),
# ----------------------------------
# Integrated violin/box/scatter plot [x only]
vbs_plot="vbs", vbs_ratio=0.9, bw=NULL, bw_iter=10,
violin_fill=getOption("violin_fill"),
box_fill=getOption("box_fill"),
vbs_pt_fill="black",
vbs_mean=FALSE, fences=FALSE, n_min_pivot=1,
k=1.5, box_adj=FALSE, a=-4, b=3,
# -----------
# Bubble plot [xCategorical, or xCategorical and yCategorical]
radius=NULL, power=0.5, low_fill=NULL, hi_fill=NULL,
# -----------------------------------------------------------
# Large data sets, smoothing, contours and binning [x and y]
smooth_points=100, smooth_size=1,
smooth_power=0.25, smooth_bins=128, n_bins=1,
contour_n=10, contour_nbins=50, contour_points=FALSE,
# ------------------------------------------------------
# Bins for frequency polygon or text output of VBS plots
bin=FALSE, bin_start=NULL, bin_width=NULL, bin_end=NULL,
breaks="Sturges", cumulate=FALSE,
# ----------------------
# Axes labels and values
# ----------------------
# -----------------------
# Axis labels and spacing
xlab=NULL, ylab=NULL, main=NULL, sub=NULL,
label_adjust=c(0,0), margin_adjust=c(0,0,0,0), # top, right, bottom, left
pad_x=c(0,0), pad_y=c(0,0),
# ---------------------
# Axis values specified
scale_x=NULL, scale_y=NULL, origin_x=NULL, origin_y=NULL,
# ---------------------
# Axis values formatted
rotate_x=getOption("rotate_x"), rotate_y=getOption("rotate_y"),
offset=getOption("offset"),
axis_fmt=c("K", ",", ".", ""), axis_x_prefix="", axis_y_prefix="",
xy_ticks=TRUE, n_axis_x_skip=0, n_axis_y_skip=0,
# ------
# legend
legend_title=NULL,
# -------------
# Miscellaneous
# -------------
# ----------------------------------------------------
# Add one or more objects, text, or geometric figures
add=NULL, x1=NULL, y1=NULL, x2=NULL, y2=NULL,
# ---------------------------------------------
# Output: turn off, to PDF file, decimal digits
quiet=getOption("quiet"), do_plot=TRUE,
use_plotly=getOption("lessR.use_plotly"),
pdf_file=NULL, width=6.5, height=6,
digits_d=NULL,
# -------------------------------------------------------------
# Deprecated, to be removed in future versions
n_cat=getOption("n_cat"), value_labels=NULL, # use R factors instead
rows=NULL, facet1=NULL, facet2=NULL, smooth=FALSE,
# ---------------------------------
# Other
eval_df=NULL, fun_call=NULL, ...)
x |
The variable plotted on the |
y |
The variable plotted on the vertical |
data |
Optional data frame that contains one or both of |
filter |
A logical expression or a vector of integers that specify the row numbers to retain, defining a subset of rows of the data frame to analyze. |
by |
A grouping variable, a categorical variable that provides
separate group profiles of the primary numeric variables |
facet |
One categorical variable, or a vector of two categorical
variables, that activates facet graphics (Trellis plots) via
the lattice package, providing a separate plot of the primary
variable(s) |
n_row |
Optional specification for the number of rows
in the layout of a multi-panel display with Trellis graphics.
Specify |
n_col |
Optional specification for the number of columns in the
layout of a multi-panel display with Trellis (facet) graphics.
Specify |
aspect |
Lattice parameter for the aspect ratio of the Trellis panels
(facets), defined as height divided by width.
The default value is |
type |
Display type. The default is |
theme |
Color theme for this analysis. Make persistent across analyses
with |
fill |
Fill color for the points or for the area under a line chart.
Can also be set via the lessR function |
color |
Border color of the points or line color for a line plot.
Can be a vector to customize the color for each point or a color
range such as |
transparency |
Transparency factor of the fill color of each point.
Default is |
enhance |
For a two-variable scatterplot, if |
means |
If one variable is categorical (factor) and the other is
continuous, then if |
size |
When set to a constant, the scaling factor for standard points
(not bubbles) or for a line, with default of 1.0 for points and 2.0 for a line.
Set to 0 to suppress plotting of points or lines. If |
size_cut |
For a bubble plot in which bubble sizes are defined by a
|
shape |
Plot character(s). The default value is |
line_width |
Width of the line segments that connect adjacent points, such as in time series plots. Set to 0 to remove line segments. |
segments |
Designed for interaction plots of means: connects each pair of
successive points with a line segment. Pass a data frame of means,
such as from |
segments_y |
For one |
segments_x |
Draw a line segment from the |
sort |
Sort the values of |
jitter_x |
Randomly perturb the plotted points of
a scatterplot horizontally within the limits of the specified value,
or set to |
jitter_y |
Vertical jitter. Defaults to |
ID |
Name of a variable that provides the labels for selected
plotted points for outlier identification, row names of the data frame
by default. To label all points, use the |
ID_size |
Size of the plotted labels.
Modify label text color with the |
MD_cut |
Mahalanobis distance cutoff to define an outlier in a two-variable scatterplot. |
out_cut |
Count or proportion of plotted points to label, in order of their distance from the center (means) of the univariate distribution or scatterplot, sorted from most to least extreme. For two-variable plots, distance from the center is based on Mahalanobis distance. |
out_shape |
Shape of outlier points in a two-variable scatterplot
or VBS plot.
Modify fill color from the current |
out_size |
Size of outlier points in a two-variable scatterplot or VBS plot. |
fit |
Type of best fit line. Default is |
fit_power |
Power that describes the response |
fit_se |
Confidence level for the error band displayed around the
line of best fit. Default is 0.95 when a fit line is specified,
but is turned off when |
fit_color |
Color of the fit line. |
fit_lwd |
Line width of the fit line. |
fit_new |
When |
plot_errors |
If |
ellipse |
Confidence level of a data ellipse for a scatterplot
of one |
ts_unit |
Time unit for plotting time series data
when |
ts_ahead |
Number of |
ts_method |
Forecasting method. Default is |
ts_source |
Source of forecasting functions. Default is |
ts_agg |
Function used to aggregate over time according to
|
ts_NA |
By default, missing |
ts_format |
Optional format string for |
ts_fitted |
If |
ts_PI |
Prediction interval level about the forecasted values,
with default of |
ts_trend |
Trend parameter. Default value is |
ts_seasons |
Seasonality parameter. See |
ts_error |
Error parameter. See |
ts_alpha |
Exponential smoothing level parameter. Sets the value for
|
ts_beta |
Exponential smoothing trend parameter. See |
ts_gamma |
Exponential smoothing seasonal parameter. See |
ts_new_x |
A data frame of predictor variable names and new values for
exogenous regressors used for model fitting and forecasting.
The number of rows of new data values determines the number of future
time periods for which forecasts are generated.
Currently applies only to the default |
ts_stack |
If |
ts_area_fill |
Fill under line segments, if present.
If |
ts_area_split |
Applies only to a Trellis plot activated with |
ts_n_x_tics |
Suggested number of ticks for dates on the |
show_runs |
If |
center_line |
Plots a dashed line through the middle of a run chart.
Default center line is the |
stat |
Apply a specified aggregation such as |
stat_x |
If no |
vbs_plot |
A character string that specifies the components of the
integrated Violin–Box–Scatterplot (VBS) of a continuous variable.
A |
vbs_ratio |
Height of the violin plot relative to the plot area. Make the violin (and the accompanying box plot) larger or smaller by adjusting either the plot area or this value. |
bw |
Bandwidth for the smoothness of the violin plot. Higher values give smoother plots. Default is to calculate a bandwidth that provides a relatively smooth density estimate. |
bw_iter |
Number of iterations used to adjust the default R bandwidth for further smoothing of the density estimate. When set, the iterations and corresponding results are displayed. |
violin_fill |
Fill color for a violin plot. |
box_fill |
Fill color for a box plot. |
vbs_pt_fill |
Points in a VBS scatterplot are black by default because
the background violin is based on the current theme color.
To use the |
vbs_mean |
Show the mean on the box plot with a strip in the color
of |
fences |
If |
n_min_pivot |
For the pivot table underlying a VBS plot over at least a
|
k |
IQR multiplier that determines the distance of the whiskers of the box plot from the box. Default is Tukey's setting of 1.5. |
box_adj |
Adjust the box and whiskers, and thus outlier detection, for skewness using the medcouple statistic as a robust measure of skewness according to Hubert and Vandervieren (2008). |
a, b |
Scaling factors for the adjusted box plot to set the lengths
of the whiskers. If explicitly set, |
radius |
Scaling factor of the bubbles in a bubble plot, which
sets the radius of the largest displayed bubble in inches. To
activate bubble scaling, set |
power |
Relative scaling of bubble sizes. The default value of 0.5 scales bubbles so that the area of each bubble is proportional to the corresponding sizing variable. A value of 1 scales bubble radii directly by the sizing variable, increasing the size differences between values. |
low_fill |
For a categorical variable and the resulting bubble plot, or a matrix of such plots, sets the low end of a color gradient for bubble fill. |
hi_fill |
For a categorical variable and the resulting bubble plot, or a matrix of such plots, sets the high end of a color gradient for bubble fill. |
smooth_points |
Number of points superimposed on the density plot in the areas of lowest density to help identify outliers, thereby controlling how dark the smoothed display becomes. |
smooth_size |
Size of points superimposed on the density plot. Default value is 1, resulting in very small points. |
smooth_power |
Exponent of the function that maps the density scale to the color scale. Smaller values than the default of 0.25 yield darker plots. |
smooth_bins |
Number of bins in each direction for the density estimation. |
n_bins |
Number of bins for a single numeric
|
contour_n |
Number of contour levels in a contour plot, with default of 10. |
contour_nbins |
Number of bins constructed for each of |
contour_points |
If |
bin |
If |
bin_start |
Optional starting value of the bins for a frequency polygon. |
bin_width |
Optional specified bin width. Also sets
|
bin_end |
Optional value that lies within the last bin, so the actual endpoint of the last bin may be larger than the specified value. |
breaks |
Method for calculating bins, or an explicit
specification of the bins, such as via |
cumulate |
If |
xlab, ylab |
Axis labels for the |
main |
Label for the plot title. If corresponding variable labels exist, the title is set by default from those labels. |
sub |
Subtitle for the graph, below |
label_adjust |
Two-element vector (x-axis label, y-axis label) that adjusts the positions of axis labels in approximate inches. Positive values move labels away from the plot edge. Not applicable to Trellis graphics. |
margin_adjust |
Four-element vector (top, right, bottom, left)
that adjusts plot margins in approximate inches.
Positive values move the corresponding margin away from the plot edge.
Can be used with |
pad_x |
Proportion of padding added to the left and right sides of the
|
pad_y |
Proportion of padding added to the bottom and top sides of the
|
scale_x |
If specified, a vector of three values that define the
numeric |
scale_y |
If specified, a vector of three values that define the
numeric |
origin_x |
Origin of the |
origin_y |
Origin of the |
rotate_x |
Rotation in degrees of the axis value labels on
the |
rotate_y |
Rotation in degrees of the axis value labels on
the |
offset |
Spacing between axis value labels and the axis. Default
is 0.5. Larger values (e.g., 1.0) create additional space for
labels, especially when rotated. Can be used with
|
axis_fmt |
Numeric format of axis labels for both axes. Default
rounds thousands to |
axis_x_prefix |
Prefix for axis labels on the |
axis_y_prefix |
Prefix for axis labels on the |
xy_ticks |
Flag indicating whether tick marks and associated
value labels on the axes are displayed. To rotate axis values,
use |
n_axis_x_skip |
Particularly useful for Trellis or facet plots with
many labels on the |
n_axis_y_skip |
Same as |
legend_title |
Title of the legend for a multiple-variable |
add |
Overlay one or more objects, text or geometric figures,
on the plot. Possible values include any text (first argument) or
|
x1 |
First x-coordinate for each overlaid object; may be
|
y1 |
First y-coordinate for each overlaid object; may be
|
x2 |
Second x-coordinate for each object; may be |
y2 |
Second y-coordinate for each object; may be |
quiet |
If |
do_plot |
If |
use_plotly |
If |
pdf_file |
If specified, direct PDF graphics to the given file name. |
width |
Width of the plot window in inches, default 5 except within RStudio, where the default maintains an approximately square plotting area. |
height |
Height of the plot window in inches, default 4.5 except for 1-D scatterplots and when running in RStudio. |
digits_d |
Number of significant digits for each displayed summary statistic. |
n_cat |
Maximum number of unique, equally spaced integer values of a variable for which it will be analyzed as categorical rather than continuous. Default is 0. Use to treat such variables as informal factors. Deprecated. Best to convert integer-coded categoricals to factors explicitly. |
value_labels |
For factors, default labels are the factor levels;
for character variables, default labels are the character values.
Optionally provide labels for the |
rows |
Deprecated. Old parameter name; use |
facet1 |
Deprecated. Old parameter name; use |
facet2 |
Deprecated. Old parameter name; use a two-element
vector in |
smooth |
Deprecated. Use parameter |
eval_df |
Controls whether to check for an existing data frame and
specified variables. Default is |
fun_call |
Function call. Used with knitr to pass the function
call when obtained from the abbreviated helper |
... |
Other parameters for non-Trellis graphics as recognized by
base R functions |
VARIABLES and TRELLIS PLOTS
There is at least one primary variable, x, which defines the horizontal
x-axis. A second primary variable, y, defines the vertical
y-axis. Either x or y (but not both simultaneously) may be
a vector of variables. The simplest usage—single x and y—produces
a single scatterplot on one panel. With type = "scatter" this is a
standard point plot; alternative values such as "smooth" and
"contour" invoke 2-D kernel density and contour displays, respectively.
For numeric primary variables, multiple plots can appear on a single panel in
two ways. First, by defining groups with the by argument:
by identifies a categorical grouping variable, and a separate
scatterplot layer is drawn for each of its levels. Group levels are
distinguished by color and/or shape. By default, colors vary across groups;
for two groups, a common pattern is a filled symbol for one group and a point
with transparent interior for the other.
Second, multiple numeric x-variables or multiple y-variables can
be supplied as a vector, which produces multiple series on the same panel. This
is commonly used for time series overlays and multi-series line plots.
Trellis graphics (facets), from Deepayan Sarkar's (2009) lattice
package, may also be used. A variable specified with facet is a
conditioning variable. A single facet variable produces one panel for
each of its levels; a length-two vector of facet variables produces one panel
for each cross-classified combination of their levels. Within each panel,
x and y are plotted as usual, and an additional grouping variable
specified with by may be applied to all panels. When x has at
most 1000 unique values, XY() can provide a brief diagnostic of the maximum
number of repeated values for each level of facet.
Control the panel dimensions and the overall size of the Trellis display with
width and height for the graphics device, n_row and
n_col for the number of panels in each direction, and aspect for
the panel height-to-width ratio. The plot window is the active graphics device
(e.g., the standard R plotting window, RStudio Plots pane, or a PDF file
when pdf_file is specified).
CATEGORICAL VARIABLES
Conceptually, lessR distinguishes continuous and categorical variables.
Categorical variables have relatively few unique values and are often coded as
factors. However, categorical variables may also be coded numerically, such as
Likert responses from 1 to 5. The structuring arguments by and
facet apply when at least one of x or y is numeric, but
that numeric variable may represent either a truly continuous scale or a
discrete Likert-type scale.
Scatterplots of Likert-type data can be challenging because the number of
possible joint values is small. For two five-point scales there are only 25
possible combinations, so many points overlap at the same coordinates. For such
situations, jittering or dot/mean plots (using stat) may
provide clearer displays.
DATA
The default input data frame is d. Another data frame can be specified
with the data argument. Regardless of the name of the data frame,
variables can be referenced directly by name—no need to attach the data frame
or use d$name. Referenced variables may live in the data frame, the
global environment, or both.
The plotted values can be the raw data themselves, or summaries derived from
them. With a single numeric variable, counts or proportions can be plotted on
the y-axis, optionally after binning x. For a categorical
x-variable paired with a continuous y-variable, summary
statistics such as means can be plotted at each level of x. When
x is continuous and binning is desired, XY() uses the same binning
parameters as Histogram, such as bin_width, to override
defaults. The stat parameter controls what is plotted (e.g.,
"data", "mean", "median"). By default, connecting line
segments are drawn, yielding a frequency polygon. Turn off line segments by
setting line_width = 0.
The filter parameter subsets rows (cases) of the input data frame
according to a logical expression or a vector of integers specifying row
numbers to retain. Use standard R logical operators as described in
Logic—for example, & (and), | (or), and !
(not)—and relational operators as described in Comparison such as
== (equality), != (not equal), and > (greater than). To
specify rows directly, create an integer vector using standard R syntax. See
Examples.
VALUE LABELS
Deprecated. Use factor() instead.
The value_labels option can override the axis tick labels with
user-supplied values. This is particularly useful for Likert-style data coded
as integers: for example, a data value 0 can be displayed as
"Strongly Disagree". These labels apply to integer categorical variables
and to factor variables. Any spaces in a label are translated into line breaks
to improve readability.
In current workflows, the recommended approach is to define factors with the
desired labels directly, typically via the lessR function factors(), which allows
convenient creation of labeled factors for one or more variables in a single
statement.
VARIABLE LABELS
Base R does not natively support variable labels, but lessR stores labels in
the data frame alongside the variables, typically created by
Read or VariableLabels. When variable labels
exist, XY() uses them by default for axis labels and in text output. Otherwise,
the variable names are used.
TWO VARIABLE PLOT
With two variables specified, XY() behaves as follows. If the values of
x are unsorted, have unequal intervals, or there is missing data in
either variable, a standard scatterplot is produced (for type =
"scatter"). When x is sorted with equal intervals and there are no
missing values, the default display connects adjacent points with line
segments, yielding a function or time-series style plot.
Supplying multiple continuous x-variables against a single y, or
vice versa, produces multiple series on the same graph. The points for the
second series reuse the first series' color but with transparent fill; for more
series, additional colors from the current theme are used.
SCATTERPLOT MATRIX
If x is a vector of continuous variables and y is omitted,
XY() generates a scatterplot matrix. The matrix adopts the current color theme.
Individual colors (e.g., fill, color) can be overridden. The
lower triangle shows the pairwise scatterplots, and the upper triangle shows
the corresponding correlation coefficients. By default a non-linear loess fit
line is added to each scatterplot; the fit parameter can be used to
request a linear least squares line instead, along with fit_color to set
its color.
SIZE VARIABLE
Specifying a numeric size variable activates a bubble plot in which the
area of each bubble is determined by the corresponding value of size
(modified by radius and power).
To vary shapes explicitly across groups, use shape and provide a vector
of values (e.g., created with c). One shape is used for each
level of the grouping variable in by. To vary colors, use fill;
if fill is specified without shape, colors vary but shapes do
not. To vary both, specify both shape and fill with values for
each by level.
Beyond the named shapes such as "circle", any single character (letters,
digits, "+", "*", "#") may be used as a plotting symbol.
Within a single specification, either use standard named shapes or single
characters, but not both.
SCATTERPLOT ELLIPSE
For a scatterplot of two numeric variables, the ellipse argument
requests one or more data ellipses, based on the contours of the corresponding
bivariate normal density. For a single x- and y-variable pair,
setting ellipse to a numeric value between 0 and 1 (e.g., 0.95) draws
the corresponding probability contour. A vector of values produces multiple
ellipses. XY() expands the axes as needed so that ellipses extending beyond the
range of the data remain fully visible. For Trellis graphics, only the largest
ellipse level is used (one ellipse per panel). Ellipse fill and border colors
are controlled via ellipse_fill and ellipse_color in
style.
TIME CHARTS
See https://web.pdx.edu/~gerbing/lessR/examples/Time.html for additional examples and explanation.
When x is the special variable .Index, XY() produces a run chart
of y. The y-values are plotted against their index positions
(1, 2, …), and run-chart diagnostics such as center lines and runs analysis can
be displayed via the corresponding arguments.
If x is of type Date or an R time-series object, XY() produces a
time series plot for each specified variable. Time-series data can be supplied
in “long” format (a single column of values plus a date column) or in “wide”
format (multiple time-series columns sharing a date index). XY() will attempt
to convert character string date values (e.g., "08/18/1952") to
Date via as.Date(), using default date formats when possible.
XY() makes a reasonable attempt to decode common date formats, but some formats
(e.g., those with month names rather than numbers) may require an explicit
format string via ts_format. If the default conversion fails or is
ambiguous, specify the correct format using examples such as those in the
table below.
| Example Date                | Format            |
| --------------------------- | ----------------- |
| "2022-09-01"                | "%Y-%m-%d"        |
| "2022/9/1"                  | "%Y/%m/%d"        |
| "2022.09.01"                | "%Y.%m.%d"        |
| "09/01/2022"                | "%m/%d/%Y"        |
| "9/1/22"                    | "%m/%d/%y"        |
| "September 1, 2022"         | "%B %d, %Y"       |
| "Sep 1, 2022"               | "%b %d, %Y"       |
| "20220901"                  | "%Y%m%d"          |
| --------------------------- | ----------------- |
XY() also converts character-string dates such as "2024 Aug" and
"2024 Q1", interpreting three-letter month abbreviations and quarter
codes Q1–Q4.
The ts_unit argument optionally aggregates the date variable to a higher
time unit using endpoints() and period.apply() from the
xts package. For example, daily data can be aggregated and plotted as
monthly or quarterly series.
The aggregation function is specified with ts_agg, default "sum",
with "mean" as a common alternative.
For missing data, if a date is present but the corresponding y-value is
NA, the line is broken at that point (no segment is drawn). If both the
date and value are absent (the entire row is missing), the line connects the
nearest observed dates, spanning the gap in calendar time but skipping the
missing tick label. For example, if "2021-01-07" and "2021-01-09" are present
but "2021-01-08" is absent, the plot includes points for the 7th and 9th
connected by a line; the 8th does not appear on the axis.
Forecasting is activated by setting ts_ahead to a positive integer. The
trend, seasonal, and error components for exponential smoothing and regression
models follow the fable conventions and are controlled by
ts_trend, ts_seasons, and ts_error. XY() supports four
forecasting engines: ETS() and TSLM() from fable via
ts_source = "fable", and classic HoltWinters() and regression
decomposition (stl() + lm()) via ts_source = "classic".
Multiplicative components require positive data. Additive trend is linear,
while multiplicative trend corresponds to exponential growth or decay; additive
and multiplicative seasonality scale the seasonal pattern differently as the
series level changes.
Exponential smoothing via ETS() (the default when
ts_source = "fable") generally provides flexible model specification
and often superior predictive performance compared to classic Holt–Winters.
For regression with seasonality, TSLM() and the stl()+lm()
approach both rely on least squares but handle seasonal structure differently.
Setting ts_fitted = TRUE provides fitted values and decomposed trend and
seasonal components for inspection.
2-D KERNEL DENSITY
Set type = "smooth" to invoke smoothScatter and display a
2-D kernel density estimate for large data sets. The display respects the
current theme. The smooth_points argument controls how many points from
low-density regions are superimposed, smooth_bins sets the number of
bins in each direction for density estimation, and smooth_power
controls the transformation from density scale to color scale. Larger
smooth_power values reduce saturation in low-density regions. These
arguments map directly to nrpoints, nbin, and
transformation in smoothScatter. Grid lines are disabled
by default for smooth density plots, but can be re-enabled via the appropriate
styling options (e.g., grid_color in style).
Alternatively, set type = "contour" to plot contour lines of the
estimated bivariate density, with the level resolution controlled by
contour_n and contour_nbins.
COLORS
A global color theme for XY() and other lessR graphics can be set with
style (e.g., style(theme = "lightbronze")). The default
theme is "lightbronze". A grayscale theme is available via
"gray", and other themes (e.g., "sienna", "darkred") are
described in style. The sub_theme = "black" option yields
a black background with partially transparent plotted colors.
Individual graphical elements (points, lines, panels, grid lines, etc.) can be
customized with additional style arguments, such as
panel_fill. For a subtle warm background, try
panel_fill = "snow"; very light grays such as
"gray99" or "gray97" provide a neutral tone.
For many color options, the value "off" is equivalent to
"transparent", removing the corresponding fill or border.
See showColors for a display of all named R colors and their RGB
values.
ANNOTATIONS
The add argument and its related coordinates (x1, y1,
x2, y2) annotate the plot with text and geometric objects. Each
object type requires a specific set of coordinates, summarized below.
x-coordinates may take the special value "mean_x" and
y-coordinates may take "mean_y".
| Value | Object | Required Coordinates |
| ----------- | ------------------- | ----------------------- |
| "text" | text | x1, y1 |
| "point" | point | x1, y1 |
| "rect" | rectangle | x1, y1, x2, y2 |
| "line" | line segment | x1, y1, x2, y2 |
| "arrow" | arrow | x1, y1, x2, y2 |
| "v_line" | vertical line | x1 |
| "h_line" | horizontal line | y1 |
| "means" | horizontal and vertical lines at the means | |
The value of add specifies the object type. For a single object, provide
one value and its required coordinates. For multiple placements of the same
object, supply coordinate vectors. For multiple different objects, supply a
vector of add values and, for each coordinate argument (x1,
y1, x2, y2), a vector whose elements correspond to the
sequence of objects in add.
Styling options such as add_color, add_fill, and transparency
may also be given as vectors, allowing different objects to have different
colors or other properties.
PDF OUTPUT
To obtain pdf output, set pdf_file to a file name, optionally with
width and height to control device size. The pdf is written to
the current working directory, which can be explicitly set with
setwd.
ADDITIONAL OPTIONS
Many commonly used graphical parameters from the base R function
plot are passed through by XY(), including:
cex.main, col.lab, font.sub, etc.: Settings for main/sub-titles and axis
annotation; see title and par.
main: Main title of the graph; see title.
xlim: Limits of the x-axis, expressed as c(x1, x2). Note
that x1 > x2 is allowed and produces a reversed axis.
ylim: Limits of the y-axis.
ONLY VARIABLES ARE REFERENCED
A referenced variable in a lessR plotting function such as XY must be a
variable name (or vector of variable names), not an arbitrary expression. The
variable must exist either in the referenced data frame (e.g., the default
d) or in the user's workspace (global environment). Expressions are not
evaluated in place. For example:
> XY(rnorm(50), rnorm(50)) # does NOT work
Instead, create named objects and reference them directly:
> X <- rnorm(50) # create vector X in user workspace
> Y <- rnorm(50) # create vector Y in user workspace
> XY(X, Y) # directly reference X and Y
The output may be assigned to an R object; otherwise it is printed directly
to the console. Each component appears only when the corresponding analytic option
is activated. For example, outlier identification must be enabled (e.g., via
MD_cut) for out_outliers to be included in the output.
To save the results, assign the output to an object, such as
p <- XY(Years, Salary).
Use names(p) to view the available components, and access any one by
prefixing with p$, for example p$out_stats.
Output can be viewed interactively at the console or inserted into R Markdown
documents for reproducible reporting.
READABLE OUTPUT
out_stats: Correlational analysis.
out_outliers: Mahalanobis Distance values for detected outliers.
out_frcst: Forecasted values.
out_fitted: Fitted values for the observed data.
out_coefs: Linear and seasonal coefficients from forecasting models.
out_smooth: Smoothing parameters from exponential smoothing models.
out_bubble: Bubble-plot settings, including radius and power.
out_reg: Regression statistics produced when fit is specified.
out_parm: Parameter settings for a VBS plot of a continuous variable.
out_pivot: Pivot table for VBS plots based on any combination of
by and facet variables.
out_by: Pivot table for VBS plots aggregated by the by variable.
out_facet1: Pivot table for VBS plots aggregated by the first facet
variable.
out_facet2: Pivot table for VBS plots aggregated by the second facet
variable.
STATISTICS
outliers: Row numbers corresponding to detected outliers.
David W. Gerbing (Portland State University; gerbing@pdx.edu)
Brys, G., Hubert, M., & Struyf, A. (2004). A robust measure of skewness. Journal of Computational and Graphical Statistics, 13(4), 996-1017.
Murdoch, D., and Chow, E. D. (2013). ellipse function from the ellipse package.
Gerbing, D. W. (2023). R Data Analysis without Programming, 2nd edition, Chapter 10, NY: Routledge.
Gerbing, D. W. (2020). R Visualizations: Derive Meaning from Data, Chapter 5, NY: CRC Press.
Gerbing, D. W. (2021). Enhancement of the Command-Line Environment for use in the Introductory Statistics Course and Beyond, Journal of Statistics and Data Science Education, 29(3), 251-266, https://www.tandfonline.com/doi/abs/10.1080/26939169.2021.1999871.
Hyndman, R. J., & Athanasopoulos, G. (2021). Forecasting: Principles and Practice (3rd ed.). Melbourne, Australia: OTexts. Retrieved from https://otexts.com/fpp3/
Sarkar, Deepayan (2008) Lattice: Multivariate Data Visualization with R, Springer. http://lmdvr.r-forge.r-project.org/
Sievert, C. (2020). Interactive Web-Based Data Visualization with R, plotly, and shiny. Chapman and Hall/CRC. URL: https://plotly.com/r/
X, Chart, style.
# read the Employee data into d, the default lessR data frame
d <- rd("Employee", quiet=TRUE)
# subset the rows with random(0.6) so the examples are
# less computationally intensive
d <- d[.(random(0.6)),]
# keep a copy of the subset so that a later example can restore d
dd <- d
#---------------------------------------------------
# traditional scatterplot with two numeric variables
#---------------------------------------------------
# scatterplot with all defaults
XY(Years, Salary)
# or use Plot in place of XY, the older method
# NOTE(review): no Plot() call follows the comment above -- confirm whether
#   one was intended here
# groups overlaid with by, least-squares fit (as with fit="lm" in the
# scatterplot matrix example), custom fill and edge color per Gender level
XY(Years, Salary, by=Gender, size=2, fit="lm",
   fill=c(M="olivedrab3", W="gold1"),
   color=c(M="darkgreen", W="gold4"))
# maximum information, minimum input: scatterplot +
# means, outliers, ellipse, least-squares lines with and w/o outliers
XY(Years, Salary, enhance=TRUE)
# extend x and y axes
XY(Years, Salary, scale_x=c(-10, 35, 10), scale_y=c(0,200000,10))
# add the text "Hi" at three specified locations
XY(Years, Salary, add="Hi", x1=c(12, 16, 18), y1=c(80000, 100000, 60000))
# set the order of the Gender levels with lessR factors, plot one panel
# per level with facet, then restore d from the saved copy dd
d <- factors(Gender, levels=c("M", "W"))
XY(Years, Salary, facet=Gender)
d <- dd
# just males employed more than 5 years
XY(Years, Salary, filter=(Gender=="M" & Years > 5))
# plot 0.95 data ellipse with the points identified that represent
# outliers defined by a Mahalanobis Distance larger than 6
# save outliers into R object out
# the first Salary value is overwritten with 200000 before plotting
d[1, "Salary"] <- 200000
out <- XY(Years, Salary, ellipse=0.95, MD_cut=6)
# new shape and point size, no grid or background color
# then put style back to default
style(panel_fill="powderblue", grid_color="powderblue")
XY(Years, Salary, size=2, shape="diamond")
style()
# translucent data ellipses without points or edges
# show the idealized joint distribution for bivariate normality
style(ellipse_color="off")
XY(Years, Salary, size=0, ellipse=seq(.1,.9,.10))
style()
# bubble plot with size determined by the value of Pre
# display the value for the bubbles with values of min, median and max
XY(Years, Salary, size=Pre, size_cut=3)
# variables in a data frame not the default d
# plot 0.6 and 0.9 data ellipses with partially transparent points
# change color theme to gold with black background
# c(.6, .9) requests both levels; seq(.6, .9) steps by 1 and so
# would return only 0.6
style("gold", sub_theme="black")
XY(eruptions, waiting, transparency=.5, ellipse=c(.6, .9), data=faithful)
# scatterplot with two x-variables, plotted against Salary
# define a new style, then back to default
style(window_fill=rgb(247,242,230, maxColorValue=255),
      panel_fill="off", panel_color="off", pt_fill="black", transparency=0,
      lab_color="black", axis_text_color="black",
      axis_y_color="off", grid_x_color="off", grid_y_color="black",
      grid_lty="dotted", grid_lwd=1)
XY(c(Pre, Post), Salary)
style()
# increase span (smoothing) from default of .7 to 1.25
# span is a loess parameter, which generates a caution that can be
# ignored that it is not a graphical parameter -- we know that
# display confidence intervals about best-fit line at
# 0.95 confidence level
# NOTE(review): the call below passes no confidence-level argument --
#   confirm whether the interval is drawn by default
XY(Years, Salary, fit="loess", span=1.25)
# 2-D kernel density (more useful for larger sample sizes)
XY(Years, Salary, type="smooth")
#------------------------------------------------------
# scatterplot matrix from a vector of numeric variables
#------------------------------------------------------
# the same three variables are supplied as x and as y,
# and fit = "lm" requests the least-squares fit line
XY(
  c(Salary, Years, Pre),
  c(Salary, Years, Pre),
  fit = "lm"
)
#--------------------------------------------------------------
# Trellis graphics and by for groups with two numeric variables
#--------------------------------------------------------------
# Trellis plot conditioned on a single variable
# lessR factors() can optionally replace R's default alphabetical
# ordering of the levels (it also converts multiple variables at once);
# always assign its result back to the full data frame
d <- factors(Gender, levels = c("M", "W"))
XY(Years, Salary, facet = Gender)
# read the Employee data again
d <- Read("Employee", quiet = TRUE)
# three categorical variables: two for facet, one for by
XY(
  Years, Salary,
  facet = c(Dept, Gender),
  by = Plan,
  n_axis_y_skip = 1
)
# a least-squares fit line for each group, with both
# shape and color varied
style(color = c("darkgreen", "brown"))
XY(
  Years, Salary,
  facet = Gender,
  fit = "lm",
  shape = c("W", "M"),
  size = .8
)
style("gray")
# Salary against Years worked for the men and the women,
# with an ellipse for each group
XY(Years, Salary, by = Gender, ellipse = .50)
#------------
# time charts
#------------
# run chart, with default fill area
XY(.Index, Salary, ts_area_fill="on")
# two run charts in same panel
# or could do a multivariate time series
XY(.Index, c(Pre, Post))
# Trellis graphics run chart with custom line width, no points
XY(.Index, Salary, facet=Gender, line_width=3, size=0)
# daily time series plot
# create the daily time series from R built-in data set airquality
oz.ts <- ts(airquality$Ozone, start=c(1973, 121), frequency=365)
XY(oz.ts)
# multiple time series plotted from dates and stacked
# black background with translucent areas, then reset theme to default
style(sub_theme="black", color="steelblue2", transparency=.55,
      window_fill="gray10", grid_color="gray25")
date <- seq(as.Date("2013/1/1"), as.Date("2016/1/1"), by="quarter")
x1 <- rnorm(13, 100, 15)
x2 <- rnorm(13, 100, 15)
x3 <- rnorm(13, 100, 15)
df <- data.frame(date, x1, x2, x3)
# remove the workspace copies so that only the columns of df remain
rm(date); rm(x1); rm(x2); rm(x3)
XY(date, x1:x3, data=df)
style()
# aggregate monthly data to plot by quarter
n.q <- 42
# length.out is spelled out in full rather than relying on partial
# matching of the abbreviated argument name length
month <- seq(as.Date("2013/1/1"), length.out=n.q, by="months")
x <- rnorm(n.q, 100, 15)
XY(month, x, ts_unit="quarters")
# trigger a time series with a Date variable specified first
# stock prices for three companies by month: Apple, IBM, Intel
d <- rd("StockPrice")
# only plot Apple
XY(Month, Price, filter=(Company=="Apple"))
# Trellis plots, one for each company
XY(Month, Price, facet=Company, n_col=1)
# all three plots on the same panel, three shades of blue
XY(Month, Price, by=Company, color="blues")
# exponential smoothing forecast for next 12 months,
# aggregate monthly data by mean over quarters
# NOTE(review): with ts_unit="quarters", ts_ahead=12 may count quarters
#   rather than months -- confirm the forecast horizon
XY(Month, Price, ts_ahead=12, ts_unit="quarters")
#------------------------------------------
# analysis of a single categorical variable
#------------------------------------------
# read the Employee data into d
d <- rd("Employee")
# with all defaults: a 1-D bubble plot of the frequencies,
# which replaces the bar chart
XY(Dept)
# the frequency of each category (level) plotted via stat_x,
# which also replaces the bar chart
XY(Dept, stat_x = "count")
#----------------------------------------------------
# scatterplot of numeric against categorical variable
#----------------------------------------------------
# chart with the mean of each level plotted
# the x-axis labels are rotated and offset from the axis,
# then the style is reset
style(rotate_x = 45, offset = 1)
XY(Dept, Salary)
style()
#-------------------
# Cleveland dot plot
#-------------------
# Salary plotted against the row names, with segments_y turned off
XY(Salary, row_names, segments_y = FALSE)
# two x-variables, Pre and Post, plotted against the row names
XY(c(Pre, Post), row_names)
#------------
# annotations
#------------
# add text at the one location specified by x1 and y1
XY(Years, Salary, add="Hi There", x1=12, y1=80000)
# add text at three different specified locations
XY(Years, Salary, add="Hi", x1=c(12, 16, 18), y1=c(80000, 100000, 60000))
# add three different text blocks at three different specified locations
XY(Years, Salary, add=c("Hi", "Bye", "Wow"), x1=c(12, 16, 18),
   y1=c(80000, 100000, 60000))
# add an 0.95 data ellipse and horizontal and vertical lines through the
# respective means
XY(Years, Salary, ellipse=0.95, add=c("v_line", "h_line"),
   x1="mean_x", y1="mean_y")
# can be done also with the following short-hand
XY(Years, Salary, ellipse=0.95, add="means")
# a rectangle requires two points, four coordinates, <x1,y1> and <x2,y2>
style(add_trans=.8, add_fill="gold", add_color="gold4", add_lwd=0.5)
XY(Years, Salary, add="rect", x1=12, y1=80000, x2=16, y2=115000)
# the first object, a rectangle, requires all four coordinates
# the vertical line at x=2 requires only an x1 coordinate, listed 2nd
XY(Years, Salary, add=c("rect", "v_line"), x1=c(10, 2),
   y1=80000, x2=12, y2=115000)
# two different rectangles with different locations, fill colors and translucence
# x1 and x2 are positions on the x-axis (Years), so the second rectangle
# runs from Years 2 to 4; y1 and y2 are positions on the y-axis (Salary)
style(add_fill=c("gold3", "green"), add_trans=c(.8,.4))
XY(Years, Salary, add=c("rect", "rect"),
   x1=c(10, 2), y1=c(60000, 45000), x2=c(12, 4), y2=c(80000, 55000))
# put style back to default
style()
#----------------------------------------------------
# analysis of two categorical variables (Likert data)
#----------------------------------------------------
# read the Mach4 Likert data, responses from 0 to 5
d <- Read("Mach4", quiet = TRUE)
# plot one item against another
XY(m06, m07)
#---------------
# function curve
#---------------
x <- seq(10,50,by=2)
y1 <- sqrt(x)
# ^ is the idiomatic R power operator; ** is accepted only as a synonym
y2 <- x^.33
# x is sorted with equal intervals so run chart by default
XY(x, y1)
# multiple plots from variable vectors need to have the variables
# in a data frame
d <- data.frame(x, y1, y2)
# if variables are in the user workspace and in a data frame
# with the same names, the user workspace versions are used,
# which do not work with vectors of variables, so remove
rm(x, y1, y2)
XY(x, c(y1, y2))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.