#' @export
#' @title Spread-location and spread-level plots
#'
#' @description The \code{eda_sl} function generates William Cleveland's
#' spread-location plot for univariate and bivariate data. The function will also
#' generate Tukeys' spread-level plot.
#'
#' @param dat Dataframe of univariate data or a linear model.
#' @param x Continuous variable column (ignored if \code{dat} is a linear model).
#' @param fac Categorical variable column (ignored if \code{dat} is a linear
#' model).
#' @param type s-l plot type. \code{"location"} = spread-location,
#' \code{"level"} = spread-level (only for univariate data).
#' \code{"dependence"} = spread-dependence (only for bivariate model input).
#' @param p Power transformation to apply to variable. Ignored if input is a
#' linear model.
#' @param tukey Boolean determining if a Tukey transformation should be adopted
#' (FALSE adopts a Box-Cox transformation).
#' @param sprd Choice of spreads used in the spread-versus-level plot (i.e.
#' when \code{type = "level"}). Either
#' interquartile, \code{sprd = "IQR"} or
#' fourth-spread, \code{sprd = "frth"} (default).
#' @param jitter Jittering parameter for the spread-location plot. A fraction of
#' the range of location values.
#' @param robust Boolean indicating if robust regression should be used on the
#' spread-level plot.
#' @param loess.d Arguments passed to the internal loess function. Applies only
#' to the bivariate model s-l plots and the spread-level plot.
#' @param loe.col LOESS curve color.
#' @param label Boolean determining if group labels are to be added to the
#' spread-location plot.
#' @param plot Boolean determining if plot should be generated.
#' @param equal Boolean determining if axes lengths should match (i.e. square
#' plot).
#' @param grey Grey level to apply to plot elements (0 to 1 with 1 = black).
#' @param pch Point symbol type.
#' @param p.col Color for point symbol.
#' @param p.fill Point fill color passed to \code{bg} (Only used for \code{pch}
#' ranging from 21-25).
#' @param size Point size (0-1).
#' @param alpha Point transparency (0 = transparent, 1 = opaque). Only
#' applicable if \code{rgb()} is not used to define point colors.
#' @param xlab X label for output plot.
#' @param ylab Y label for output plot.
#' @param label.col Color assigned to group labels (only applicable if
#' \code{type = location}).
#' @param labelxbuff Buffer to add to the edges of the plot to make room for
#' the labels in a spread-location plot. Value is a fraction of the plot width.
#' @param labelybuff Buffer to add to the top of the plot to make room for
#' the labels in a spread-location plot. Value is a fraction of the plot width.
#' @param show.par Boolean determining if the power transformation applied to
#' the data should be displayed.
#'
#' @return Returns a dataframe of level and spread values.
#'
#' @details
#' The function generates a few variations of the spread-location/spread-level
#' plots depending on the data input type and parameter passed to the
#' \code{type} argument. The residual spreads are mapped to the y-axis and the
#' levels are mapped to the x-axis. Their values are computed as follows:
#' \itemize{
#'
#' \item \code{type = "location"} (univariate data):\cr\cr
#' William Cleveland's spread-location plot applied to univariate
#' data.\cr
#' \eqn{\ spread = \sqrt{|residuals|}} \cr
#' \eqn{\ location = medians}
#'
#' \item \code{type = "level"} (univariate data):\cr\cr
#' Tukey's spread-level plot (aka spread-versus-level plot, Hoaglin et al.,
#' p 260). If the pattern is close to linear, the plot can help find a power
#' transformation that will help stabilize the spread in the data by
#' subtracting one from the fitted slope. This option outputs the slope of
#' the fitted line in the console. A loess is added to assess linearity.
#' By default, the fourth spread is used to define the spread. Alternatively,
#' the IQR can be used by setting \code{spread = "IQR"}. The output will be
#' nearly identical except for small datasets where the two methods may
#' diverge slightly in output.\cr
#' \eqn{\ spread = log(fourth\ spread(residuals))} \cr
#' \eqn{\ location = log(medians)}
#'
#' \item \code{type = "location"} if input is a model of class \code{lm},
#' \code{eda_lm} or \code{eda_rline}:\cr\cr
#' William Cleveland's spread-location plot (aka scale-location plot)
#' applied to residuals of a linear model.\cr
#' \eqn{\ spread = \sqrt{|residuals|}} \cr
#' \eqn{\ location = fitted\ values}
#'
#' \item \code{type = "dependence"} if input is a model of class \code{lm},
#' \code{eda_lm} or \code{eda_rline}:\cr\cr
#' William Cleveland's spread-location plot applied to residuals of
#' a linear model.\cr
#' \eqn{\ spread = \sqrt{|residuals|}} \cr
#' \eqn{\ dependence = x\ variable}
#' }
#'
#' @references
#' \itemize{
#' \item Understanding Robust and Exploratory Data Analysis, Hoaglin,
#' David C., Frederick Mosteller, and John W. Tukey, 1983.
#' \item William S. Cleveland. Visualizing Data. Hobart Press (1993)
#' }
#'
#' @examples
#' cars <- MASS::Cars93
#' # Cleveland's spread-location plot applied to univariate data
#' eda_sl(cars, MPG.city, Type)
#'
#' # You can specify the exact form of the spread on the y-axis
#' # via the ylab argument
#' eda_sl(cars, MPG.city, Type, ylab = expression(sqrt(abs(residuals))) )
#'
#' # The function can also generate Tukey's spread-level plot to identify a
#' # power transformation that can stabilize spread across fitted values
#' # following power = 1 - slope
#' eda_sl(cars, MPG.city, Type, type = "level")
#'
#' # A slope of around 3 is computed from the s-l plot, therefore, a suggested
#' # power is 1 - 3 = -2. We can apply a power transformation within the
#' # function via the p argument. By default, a Box-Cox transformation method
#' # is adopted.
#' eda_sl(cars, MPG.city, Type, p = -2)
#'
#' # Spread-location plot can also be generated from residuals of a linear model
#' M1 <- lm(mpg ~ hp, mtcars)
#' eda_sl(M1)
#'
#' # Spread can be compared to X instead of fitted value
#' eda_sl(M1, type = "dependence")
eda_sl <- function(dat, x=NULL, fac=NULL, type = "location", p = 1, tukey = FALSE,
sprd = "frth", jitter = 0.01, robust = TRUE,
loess.d = list(family = "symmetric", degree=1, span = 1),
loe.col = rgb(.3, .3, 1, 1),
label = TRUE, label.col = "lightsalmon", plot = TRUE, equal = TRUE,
grey = 0.6, pch = 21, p.col = "grey50", p.fill = "grey80",
size = 0.8, alpha = 0.8, xlab = NULL, ylab = NULL, labelxbuff = 0.05,
labelybuff = 0.05, show.par = TRUE) {
# Check that input is either an eda_lm model or a dataframe
if (! (inherits(dat,"data.frame") |
(inherits(dat,"eda_lm") |
inherits(dat, "lm") |
inherits(dat, "eda_rline"))))
stop("The input object must of class eda_lm or a data.frame.")
# Parameters check
if (!sprd %in% c("frth", "IQR"))
stop("Argument \"sprd\" must be one of \"frth\" or \"IQR\".",
call. = FALSE)
# Check that type is properly specified
if(!type %in% c("location", "level", "dependence"))
stop("type argument is invalid.")
# Initialize some variables
ylim = NULL
xlim = NULL
# Extract data
if(inherits(dat,"data.frame")){ # Univariate input
dtype <- "univariate"
equal <- FALSE
x <- eval(substitute(x), dat)
fac <- eval(substitute(fac), dat)
if(is.factor(fac)) fac <- droplevels(fac)
# Remove missing values from the data
which_na <- which(is.na(x))
if(length(which_na > 0)){
x <- x[-which_na]
fac <- fac[-which_na]
if(is.factor(fac)) fac <- droplevels(fac)
warning(cat(length(which_na),"rows were removed due to NAs being present.\n"))
}
# Check that each group has at least two values (only applies to univariate
# data). Remove groups with less than 2 records.
group_n <- table(fac)
x <- x[fac %in% names(group_n[group_n > 1])]
fac <- fac[fac %in% names(group_n[group_n > 1])]
if( any(group_n < 2) )
warning(paste("One or more groups was removed from the dataset",
"for having less than two observations:",
names(group_n[group_n < 2]), "\n"))
} else { # Model input
y <- dat$residuals
if (type == "dependence"){
if(inherits(dat, "lm")){
x <- dat$model[,2]
} else {
x <- dat$x
}
} else {
x <- dat$fitted.values
}
dtype <- "model"
}
# Get labels
if(is.null(xlab)){
if (dtype == "model" & type == "location"){
xlab = "Fitted values"
} else if (dtype == "model" & type == "dependence"){
xlab = "X"
} else {
xlab = "Location"
}
}
if(is.null(ylab)){
if (dtype == "model"){
ylab <- expression(sqrt(abs(residuals)))
} else {
ylab <- "Spread"
}
}
# Re-express data if required (only applies to univariate data)
if(inherits(dat,"data.frame")){
x <- eda_re(x, p = p, tukey = tukey)
x.nan <- is.na(x)
if( any(x.nan)){
x <- x[!x.nan]
fac <- fac[!x.nan]
warning(paste("\nRe-expression produced NaN values. These observations will",
"be removed from output. This will result in fewer points",
"in the ouptut."))
}
}
# Set plot elements color
plotcol <- rgb(1-grey, 1-grey, 1-grey)
# Set point color parameters.
if(!is.null(alpha)){
if(p.col %in% colors() & p.fill %in% colors() ){
p.col <- adjustcolor( p.col, alpha.f = alpha)
p.fill <- adjustcolor( p.fill, alpha.f = alpha)
}
}
# Custom function
frth_sprd <- function(x) {
lsum <- eda_lsum(x, l=2)
return(lsum[2,6]) # Returns spread
}
# Spread-location plot options
if(type == "location" & dtype == "univariate"){ # univariate spread-location
meds <- tapply(x, fac, median)
level <- meds[as.character(fac)]
spread <- sqrt(abs(x - level))
level <- jitter(level, amount = jitter * diff(range(level)))
spread_med <- tapply(spread, fac, median)
df4 <- data.frame(Level = level, Spread = spread, grp = fac)
ylim <- c(0, max(spread) + max(spread) * labelybuff )
rangex <- range(level)
xlim <- c(rangex[1] - diff(rangex) * labelxbuff,
rangex[2] + diff(rangex) * labelxbuff)
} else if (type == "level" & dtype == "univariate"){ # univariate spread-level
# Split data into groups
x_fac <- split(x, fac)
level <- log(unlist(lapply(x_fac, median)))
if( sprd == "frth"){
spread <- log(unlist(lapply(x_fac, frth_sprd)))
} else if(sprd == "IQR") {
spread <- log(unlist(lapply(x_fac, IQR)))
}
df4 <- data.frame(Level = level, Spread = spread)
} else { # linear model spread-level
level <- x
spread <- sqrt(abs(y))
df4 <- data.frame(Level = level, Spread = spread)
}
# Generated plot (if requested)
if(plot == TRUE){
# Get lines-to-inches ratio
in2line <- ( par("mar") / par("mai") )[2]
# Create a dummy plot to extract y-axis labels
pdf(NULL)
plot(x = level, y = spread, type = "n", xlab = "", ylab = "", xaxt = "n",
yaxt='n', main = NULL, xlim=xlim, ylim=ylim)
y.wid <- max( strwidth( axTicks(2), units="inches")) * in2line + 1.2
dev.off()
# Compute the margin width (returned in inches before converting to lines)
# y.wid <- max( strwidth( y.labs[1], units="inches"),
# strwidth( y.labs[2], units="inches")) * in2line + 1
# .pardef <- par(col = plotcol, mar = c(3,y.wid,3,1))
if(equal == TRUE ){
.pardef <- par(mar = c(3,y.wid,3,1), col = plotcol, pty = "s")
} else {
.pardef <- par(mar = c(3,y.wid,3,1), col = plotcol)
}
on.exit(par(.pardef))
plot( x=level, y=spread , ylab=NA, las=1, yaxt='n', xaxt='n', xlab=NA,
col.lab=plotcol, pch = pch, col = p.col, bg = p.fill, cex = size,
ylim = ylim, xlim = xlim)
box(col=plotcol)
axis(1,col=plotcol, col.axis=plotcol, labels=TRUE, padj = -0.5)
axis(2,col=plotcol, col.axis=plotcol, labels=TRUE, las=1, hadj = 0.9, tck = -0.02)
#mtext(ylab, side=3, adj= -0.06 , col=plotcol, padj = -1.2, cex = par("cex"))
# Y-label
# Get plot specs
lbl_width <- strwidth(ylab, units = "inches")
mar_width <- par("mai")[2]
loc <- par("usr") # in XY coordinates
xscl <- (loc[2] - loc[1]) / par("pin")[1]
# Place y-label
if(lbl_width/2 > mar_width * 0.6){
xloc <- loc[1] + (lbl_width/2 - mar_width * 0.6) * xscl
} else {
xloc <- loc[1]
}
text(xloc, loc[4], labels = ylab, col=plotcol, cex = par("cex"), xpd = TRUE, pos = 3, offset = 1)
if (type == "location" & dtype == "univariate"){
title(xlab = xlab, line = 1.8, col.lab=plotcol)
lines(sort(meds),spread_med[order(meds)], col = rgb(1, 0.5, 0.5, 0.9), lw = 2)
points(meds, spread_med, col = rgb(1, 0.5, 0.5, 0.8), pch = 15)
if (label == TRUE){
with(df4, label_placement(Level, Spread, grp, label.col))
}
if(show.par == TRUE){
mtext(side = 3, text=paste0("p=",p), adj=1, cex = 0.65)
}
} else if (type == "level" & dtype == "univariate") {
if(robust == TRUE){
Mlevel <- MASS::rlm(Spread ~ Level, df4)
} else {
Mlevel <- lm(Spread ~ Level, df4)
}
abline(Mlevel, col = rgb(1, 0.5, 0.5, 0.9), lw = 2)
loess.l <- modifyList(list(), loess.d)
lines( do.call( "loess.smooth",c( list(x=df4$Level,y=df4$Spread), loess.l)),
col=loe.col, lw = 1.5, lt = 2 )
cat("Slope = ", Mlevel$coefficients[2])
title(xlab = xlab, line = 1.8, col.lab=plotcol)
if(show.par == TRUE){
mtext(side = 3, text=paste0("p=",p), adj=1, cex = 0.65)
}
} else {
loess.l <- modifyList(list(), loess.d)
lines( do.call( "loess.smooth",c( list(x=df4$Level,y=df4$Spread), loess.l)),
col=loe.col, lw = 1.5, lt = 1 )
title(xlab = xlab, line = 1.8, col.lab=plotcol)
}
par(.pardef)
}
invisible(df4)
}
label_placement <- function(x,y,grp, label.col){
df <- data.frame(x,y,grp)
# Get range of values for each group
ranges <- tapply(y, grp, FUN = function(x)diff(range(x)))
# Calculate label positions
label_positions <- aggregate(cbind(x, y) ~ grp, df,
FUN = function(x) c(mean = mean(x), max = max(x)))
# Loop through each group
for (i in 1:nrow(label_positions)) {
group <- label_positions$grp[i]
mean_level <- label_positions$x[i, "mean"]
max_spread <- label_positions$y[i, "max"]
# Place the text above the highest point in the cluster
text(x = mean_level, y = max_spread + 0.07 * ranges[i], labels = group,
col = label.col, cex = 0.8)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.