#' @export
#' @import grDevices
#' @importFrom utils modifyList
#' @title Least Squares regression plot (with optional LOESS fit)
#'
#' @description \code{eda_lm} generates a scatter plot with a fitted regression
#' line. A loess line can also be added to the plot for model comparison. The
#' axes are scaled such that their respective standard deviations match axes
#' unit length.
#'
#' @param dat Data frame.
#' @param x Column assigned to the x axis.
#' @param y Column assigned to the y axis.
#' @param px Power transformation to apply to the x-variable.
#' @param py Power transformation to apply to the y-variable.
#' @param tukey Boolean determining if a Tukey transformation should be adopted
#' (FALSE adopts a Box-Cox transformation).
#' @param show.par Boolean determining if power transformation should be
#' displayed in the plot.
#' @param reg Boolean indicating whether a least squares regression line should
#' be plotted.
#' @param w Weight to pass to regression model.
#' @param sd Boolean determining if standard deviation lines should be plotted.
#' @param mean.l Boolean determining if the x and y mean lines should be added
#' to the plot.
#' @param grey Grey level to apply to plot elements (0 to 1 with 1 = black).
#' @param pch Point symbol type.
#' @param p.col Color for point symbol.
#' @param p.fill Point fill color passed to \code{bg} (Only used for \code{pch}
#' ranging from 21-25).
#' @param size Point size (0-1).
#' @param alpha Point transparency (0 = transparent, 1 = opaque). Only
#' applicable if \code{rgb()} is not used to define point colors.
#' @param q Boolean determining if grey quantile boxes should be plotted.
#' @param q.val F-values to use to define the quantile box parameters. Defaults
#' to mid 68% of values. If more than 2 f-values are defined, the first two
#' are used to generate the box.
#' @param q.type Quantile type. Defaults to 5 (Cleveland's f-quantile
#' definition).
#' @param loe Boolean indicating if a loess curve should be fitted.
#' @param lm.col Regression line color.
#' @param loe.col LOESS curve color.
#' @param stats Boolean indicating if regression summary statistics should be
#' displayed.
#' @param stat.size Text size of stats output in plot.
#' @param loess.d A list of parameters passed to the \code{loess.smooth}
#' function. A robust loess is used by default.
#' @param xlab X label for output plot.
#' @param ylab Y label for output plot.
#' @param ... Not used.
#'
#' @details The function will plot an OLS regression line and, if requested, a
#' loess fit. The plot will also display the +/- 1 standard deviations as
#' dashed lines. In theory, if both x and y values follow a perfectly Normal
#' distribution, roughly 68 percent of the points should fall in between these
#' lines.
#' \cr \cr
#' The true 68 percent of values can be displayed as grey rectangles by setting
#' \code{q=TRUE}. This uses the \code{quantile} function to compute the upper
#' and lower bounds defining the inner 68 percent of values. If the data
#' follow a Normal distribution, the grey rectangle edges should coincide
#' with the +/- 1SD dashed lines.\cr
#' If you wish to show the interquartlie ranges (IQR) instead of the inner
#' 68 percent of values, simply set \code{q.val = c(0.25,0.75)}.
#' \cr \cr
#' The plot has the option to re-express the values via the \code{px} and
#' \code{py} arguments. But note that if the re-expression produces \code{NaN}
#' values (such as if a negative value is logged) those points will be removed
#' from the plot. This will result in fewer observations being plotted. If
#' observations are removed as result of a re-expression a warning message
#' will be displayed in the console. \cr
#' The re-expression powers are shown in the upper right side of the plot.
#' To suppress the display of the re-expressions set \code{show.par = FALSE}.
#'
#' @return Returns residuals, intercept and slope from an OLS fit.
#'
#' \itemize{
#' \item \code{residuals}: Regression model residuals
#' \item \code{a}: Intercept
#' \item \code{b}: Slope}
#'
#' @seealso \code{\link[graphics]{plot}} and \code{\link[stats]{loess.smooth}}
#' functions
#'
#' @examples
#'
#' # Add a regular (OLS) regression model and loess smooth to the data
#' eda_lm(mtcars, wt, mpg, loe = TRUE)
#'
#' # Add the inner 68% quantile to compare the true 68% of data to the SD
#' eda_lm(mtcars, wt, mpg, loe = TRUE, q = TRUE)
#'
#' # Show the IQR box
#' eda_lm(mtcars, wt, mpg, loe = TRUE, q = TRUE, sd = FALSE, q.val = c(0.25,0.75))
#'
#' # Fit an OLS to the Income for Female vs Male
#' df2 <- read.csv("https://mgimond.github.io/ES218/Data/Income_education.csv")
#' eda_lm(df2, x=B20004013, y = B20004007, xlab = "Female", ylab = "Male",
#' loe = TRUE)
#'
#' # Add the inner 68% quantile to compare the true 68% of data to the SD
#' eda_lm(df2, x = B20004013, y = B20004007, xlab = "Female", ylab = "Male",
#' q = TRUE)
#'
#' # Apply a transformation to x and y axes: x -> 1/3 and y -> log
#' eda_lm(df2, x = B20004013, y = B20004007, xlab = "Female", ylab = "Male",
#' px = 1/3, py = 0, q = TRUE, loe = TRUE)
#'
eda_lm <- function(dat, x, y, xlab = NULL, ylab = NULL, px = 1, py = 1,
tukey = FALSE, show.par = TRUE, reg = TRUE, w=NULL,
sd = TRUE, mean.l = TRUE, grey = 0.6, pch = 21,
p.col = "grey50", p.fill = "grey80", size = 0.8, alpha = 0.8,
q = FALSE, q.val = c(0.16,0.84), q.type = 5, loe = FALSE,
lm.col = rgb(1, 0.5, 0.5, 0.8), loe.col = rgb(.3, .3, 1, 1),
stats=FALSE, stat.size = 0.8,
loess.d=list(family = "symmetric", span=0.7, degree=1), ...){
if(is.null(xlab)){
xlab = as.character(substitute(x))
}
if(is.null(ylab)){
ylab = as.character(substitute(y))
}
# Re-express data if required
x <- eda_re(eval(substitute(x), dat), p = px, tukey = tukey)
x.nan <- is.na(x)
y <- eda_re(eval(substitute(y), dat), p = py, tukey = tukey)
y.nan <- is.na(y)
# Re-expression may produce NaN values. Output warning if TRUE
if( any(x.nan, y.nan) ) {
warning(paste("\nRe-expression produced NaN values. These observations will",
"be removed from output. This will result in fewer points",
"in the ouptut."))
bad <- x.nan | y.nan
x <- x[!bad]
y <- y[!bad]
}
# Set plot elements color
plotcol <- rgb(1-grey, 1-grey, 1-grey)
# Set point color parameters.
if(!is.null(alpha)){
if(p.col %in% colors() & p.fill %in% colors() ){
p.col <- adjustcolor( p.col, alpha.f = alpha)
p.fill <- adjustcolor( p.fill, alpha.f = alpha)
}
}
# Add to default plot list parameters
loess.l <- modifyList(list(span = 0.5), loess.d)
# Run regression model
M <- lm(y ~ x, weights = w)
# Generate plots ----
# Get lines-to-inches ratio
in2line <- ( par("mar") / par("mai") )[2]
# Create a dummy plot to extract y-axis labels
pdf(NULL)
plot(x = x, y = y, type = "n", xlab = "", ylab = "", xaxt = "n",
yaxt='n', main = NULL)
#y.labs <- range(axTicks(2))
y.wid <- max( strwidth( axTicks(2), units="inches")) * in2line + 1.2
dev.off()
# Compute the margin width (returned in inches before converting to lines)
# y.wid <- max( strwidth( y.labs[1], units="inches"),
# strwidth( y.labs[2], units="inches")) * in2line + 1
.pardef <- par(pty = "s", col = plotcol, mar = c(3,y.wid,3,1))
on.exit(par(.pardef))
sd.x <- sd(x, na.rm=T)
sd.y <- sd(y, na.rm=T)
mean.x <- mean(x, na.rm=T)
mean.y <- mean(y, na.rm=T)
plot( x=x, y=y , asp=sd.x/sd.y, ylab=NA, las=1, yaxt='n', xaxt='n', xlab=NA,
col.lab=plotcol, pch = pch, col = p.col, bg = p.fill, cex = size)
box(col=plotcol)
axis(1,col=plotcol, col.axis=plotcol, labels=TRUE, padj = -0.5)
axis(2,col=plotcol, col.axis=plotcol, labels=TRUE, las=1, hadj = 0.9,
tck = -0.02)
mtext(ylab, side=3, adj= -0.06 ,col=plotcol, padj = -1.2)
sq <- par("usr") # get plot corners
if (sd == TRUE){
ysd1 <- (mean.y + sd.y)
ysd2 <- (mean.y - sd.y)
text( label="+1sd", x= sq[2] - diff(sq[1:2]) * 0.03, y= ysd1 + diff(sq[3:4])*0.02,
srt=0, col="grey70", cex=0.7)
text( label="-1sd", x= sq[2] - diff(sq[1:2]) * 0.03, y= ysd2 + diff(sq[3:4])*0.02,
srt=0, col="grey70", cex=0.7)
text( label="+1sd", y= sq[4] - diff(sq[3:4]) * 0.01, x= (mean.x + sd.x) ,
srt=0, col="grey70", cex=0.7)
text( label="-1sd", y= sq[4] - diff(sq[3:4]) * 0.01, x= (mean.x - sd.x) ,
srt=0, col="grey70", cex=0.7)
}
title(xlab = xlab, line =1.8, col.lab=plotcol)
if(reg == TRUE) abline(M, lw = 2, col = lm.col )
if (mean.l == TRUE){
abline(v=mean.x,lty=1,col="grey70")
abline(h=mean.y, lty=1, col="grey70")
}
if (sd == TRUE){
abline(v= mean.x + c(-sd.x,sd.x) ,lty=2,col="grey80")
abline(h=mean.y + c(-sd.y,sd.y), lty=2, col="grey80")
}
if(loe == TRUE) lines( do.call( "loess.smooth",c( list(x=x,y=y), loess.l)),
col=loe.col,lw=2 , lty=2)
if(stats == TRUE){
st <- summary(M)
mtext( sprintf("R-sq = %0.2f Beta= %g P(beta) = %0.3f", st$r.sq,
st$coef[2,1] , st$coef[2,4] ),
side=3, col="blue", cex=stat.size )
}
if(q == TRUE){
# st <- summary(M)
qy <- quantile(y, q.val, type = q.type, na.rm=TRUE)
qx <- quantile(x, q.val, type = q.type, na.rm=TRUE)
rect(xleft = qx[1], xright = qx[2], ybottom=sq[3],ytop=sq[4],
col = rgb(0,0,0,0.1), border = NA)
rect(xleft = sq[1], xright = sq[2], ybottom=qy[1],ytop=qy[2],
col = rgb(0,0,0,0.1), border = NA)
}
if(show.par == TRUE){
params <- paste0("px=",round(px,2),"\n py=",round(py,2))
mtext(side = 3, text=params, adj=1, cex = 0.65)
}
# Reset plot parameters
par(.pardef)
print(coef(M))
invisible(list(residuals = residuals(M), a = coef(M)[1], b = coef(M)[2]))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.