R/eda_boxls.R

Defines functions eda_boxls

Documented in eda_boxls

#' @export
#' @import lattice
#' @title Boxplots equalized by level and spread
#'
#' @description \code{eda_boxls} creates boxplots conditioned on one variable
#'   while providing the option to spreads levels and/or levels.
#'
#' @param dat  Data frame
#' @param x    Column name assigned to the values
#' @param fac  Column name assigned to the factor the values are to be
#'   conditioned on
#' @param p  Power transformation to apply to variable
#' @param tukey Boolean determining if a Tukey transformation should be adopted
#'   (FALSE adopts a Box-Cox transformation)
#' @param outlier Boolean indicating if outliers should be plotted
#' @param out.txt Column whose values are to be used to label outliers
#' @param type Plot type. "none" = no equalization ; "l" = equalize by level;
#'   "ls" = equalize by both level and spread
#' @param notch Boolean determining if notches should be added.
#' @param  horiz  plot horizontally (TRUE) or vertically (FALSE)
#' @param  outliers  plot outliers (TRUE) or not (FALSE)
#' @param xlab X label for output plot
#' @param ylab Y label for output plot
#' @param grey Grey level to apply to plot elements (0 to 1 with 1 = black)
#' @param reorder Boolean determining if factors have to be reordered based
#'   on median, upper quartile or lower quartile (set in \code{reorder.type}).
#' @param reorder.stat Statistic to reorder level by if \code{reorder} is set to
#'   \code{TRUE}. Either \code{"median"}, \code{"upper"} (for upper quartile) or
#'   \code{"lower"} (for lower quartile). If \code{type} is set to a value other
#'   than \code{"none"}, the this argument is ignored and the stat defaults to
#'   \code{"median"}.
#'
#' @return {No values are returned}
#'
#' @details
#'  \itemize{
#'   \item By default, the boxplots are re-ordered by their median values.
#'   \item If the outlier text to be displayed is its own value, it will not be
#'    modified if the data are equalized by level or spread.
#'   \item Note that the notch offers a 95 percent test of the null that the true
#'   medians are equal assuming that the distribution of each batch is
#'   approximately normal. If the notches do not overlap, we can assume that
#'   medians are significantly different at a 0.05 level. Note that the notches
#'   do not correct for multiple comparison issues when three or more batches are
#'   plotted.}
#'
#' @examples
#'
#' # A basic boxplot. The outlier is labeled with the row number by default.
#' eda_boxls(mtcars,mpg, cyl, type="none")
#'
#' # A basic boxplot. The outlier is labeled with its own value.
#' eda_boxls(mtcars,mpg, cyl, type="none", out.txt=mpg )
#'
#' # Boxplot equalized by level. Note that the outlier text is labeled with its
#' # original value.
#' eda_boxls(mtcars,mpg, cyl, type="l", out.txt=mpg )
#'
#' # Boxplots equalized by level and spread
#' eda_boxls(mtcars,mpg, cyl, type="ls", out.txt=mpg )
#'
#' # Hide outlier
#' eda_boxls(mtcars,mpg, cyl, type="ls", out.txt=mpg , outlier=FALSE)
#'
#' # Equalizing level helps visualize increasing spread with increasing
#' # median value
#' food <- read.csv("http://mgimond.github.io/ES218/Data/Food_web.csv")
#' eda_boxls(food, mean.length, dimension, type = "l")
#'
#' # For long factor level names, flip plot
#' eda_boxls(iris, Sepal.Length, Species, out.txt=Sepal.Length , horiz = TRUE)
#'
#' # By default, plots are ordered by their medians.
#' singer <- lattice::singer
#' eda_boxls(singer, height, voice.part, out.txt=height, horiz = TRUE)
#'
#' # To order by top quartile, set reorder.stat to "upper"
#' eda_boxls(singer, height, voice.part, out.txt=height, horiz = TRUE,
#'           reorder.stat = "upper")

eda_boxls <- function(dat, x, fac, p = 1, tukey = FALSE, outlier=TRUE,
                      out.txt = NULL, type="none", notch = FALSE, horiz=FALSE,
                      outliers=TRUE, xlab = NULL, ylab = NULL, grey = 0.6,
                      reorder=TRUE, reorder.stat="median"){

  # Parameters check
  if (!type %in% c("none", "l" , "ls"))
    stop("Argument \"type\" must be one of \"none\", \"l\" or \"ls\".")
  if (!reorder.stat %in% c("median", "upper" , "lower"))
    stop("Argument \"reorder.stat\" must be one of \"median\", \"upper\" or
         \"lower\".")

  # Set plot elements color
  plotcol <- rgb(1-grey, 1-grey, 1-grey)

  # Get axes labels
  if(is.null(xlab)){
    xlab = substitute(fac)
  }
  if(is.null(ylab)){
    ylab = substitute(x)
  }

  # Get values and factors
  x   <- eval(substitute(x), dat)
  fac <- as.factor(eval(substitute(fac), dat))

  # Reorder levels if requested
  if(reorder == TRUE){
    if(reorder.stat == "lower") {
      stat <- tapply(x, fac, function(x) quantile(x,probs=0.25))
    } else if (reorder.stat == "upper"){
      stat <- tapply(x, fac, function(x) quantile(x,probs=0.75))
    } else {
      stat <- tapply(x, fac, median )
    }
    new_levels <- names(sort(stat))
    fac <- factor(fac, levels = new_levels)
  }

  # If custom outlier label is desired, grab text
  if(!missing(out.txt)) {out.txt <- eval(substitute(out.txt), dat)}

  # Re-express data if required
  x <- eda_re(x, p = p, tukey = tukey)
  x.nan <- is.na(x)
  if( any(x.nan)){
    x <- x[!x.nan]
    fac <- fac[!x.nan]
    warning(paste("\nRe-expression produced NaN values. These observations will",
                  "be removed from output. This will result in fewer points",
                  "in the ouptut."))
  }

  # Extract boxplot parameters
  bx  <- boxplot(x ~ fac, outline=outlier, plot=FALSE)

  # Which rows have outliers
  if( outlier == TRUE){
    # pst.dat <- paste(fac, x, sep=",")
    # pst.out <- paste(bx$names[bx$group], bx$out, sep=",")
    # out.row <- which(pst.dat %in% pst.out)
    out.row <- sapply(seq_along(bx$out), function(i)
      which(x == bx$out[i] & as.integer(fac) == bx$group[i]))
  }

  # Equalize levels
  if (type =="l" | type == "ls"){
    med <- bx$stats[3,]
    bx$stats <- sweep(bx$stats, 2, med, FUN="-")
    bx$conf  <- sweep(bx$conf, 2, med, FUN="-")
    bx$out <- bx$out - med[bx$group]
  }

  # Equalize levels and spreads (standardize using IQR)
  if (type == "ls"){
    sprd <- bx$stats[4,] - bx$stats[2,]
    bx$stats <- sweep(bx$stats, 2, sprd, FUN="/")
    bx$conf  <- sweep(bx$conf, 2, sprd, FUN="/")
    bx$out <- bx$out / sprd[bx$group]
  }

  # Generate plots ----

  # Get lines-to-inches ratio
  in2line <- ( par("mar") / par("mai") )[2]

  # Create a dummy plot to extract y-axis labels
  pdf(NULL)
  bxp(bx, pch=20, outline=outlier, horizontal=horiz)
  if(horiz == TRUE){
    fac.names <- unique(as.character(fac))
    fac.min <- which.min(nchar(fac.names))
    fac.max <- which.max(nchar(fac.names))
    y.labs <- c(fac.names[fac.min], fac.names[fac.max])
    y.wid <- max(strwidth( y.labs[1], units="inches"),
                 strwidth( y.labs[2], units="inches")) * in2line + 1.2
  } else {
    # y.labs <- range(axTicks(2))
    y.wid <- max( strwidth( axTicks(2), units="inches")) * in2line + 1.2
  }
  dev.off()

  # Compute the margin width (returned in inches before converting to lines)
  # y.wid <- max( strwidth( y.labs[1], units="inches"),
  #               strwidth( y.labs[2], units="inches")) * in2line + 1.2


 # .pardef <- par(pty = "s", col = plotcol, mar = c(3,y.wid,3.2,1))
  .pardef <- par(pty = "m", col = plotcol, mar = c(3,y.wid,3.2,1))
  on.exit(par(.pardef), add = TRUE)


  # Generate boxplot
  bxp(bx, pch=20, outline=outlier, horizontal=horiz, border=NA, notch = notch,
      boxfill="grey70", whiskcol="grey40", whisklty=1, staplecol="grey40",
      medcol="grey40",medlwd=4,outcol="grey40",outpch=20, yaxt='n', xaxt='n',
      pars=list(las=1, col.axis =plotcol,  col.lab="grey50"))
  box(col=plotcol)

    if (horiz == TRUE){
    xlab2 <- xlab
    xlab <- ylab
    ylab <- xlab2
  }

  if (horiz == TRUE){
    axis(1, col=plotcol, col.axis=plotcol, labels=TRUE, padj = -0.5)
    axis(2, at=1:length(levels(fac)), col=plotcol, col.axis=plotcol,
         labels=levels(fac), las=1, hadj = 0.9, tck = -0.02)
  } else {
    axis(1, at=1:length(levels(fac)), col=plotcol, col.axis=plotcol,
         labels=levels(fac), padj = -0.5)
    axis(2, col=plotcol, col.axis=plotcol, labels=TRUE, las=1, hadj = 0.9,
         tck = -0.02)
  }

#  mtext(ylab, side=3, adj= -0.06 ,col=plotcol,  padj = -1.2)
  mtext(ylab, side=3, adj= -0.01 ,col=plotcol,  padj = -1.1)
  title(xlab = xlab, line =1.8, col.lab=plotcol)

  # Add 0 line
  if(type != "none"){
    if(horiz == FALSE){
      abline(h=0,col="grey90",lty=2,lw=0.5)
    } else {
      abline(v=0,col="grey90",lty=2,lw=0.5)
    }
  }

  # Add outlier name if desired
  if(outlier == TRUE & length(bx$out) > 0 ){
    if (missing(out.txt)){
      if(horiz == FALSE){
        text(bx$group, bx$out, as.character(out.row),pos=4, cex=0.7)
      } else {
        text(bx$out, bx$group, as.character(out.row),pos=3, cex=0.7)
      }
    } else{
      if(horiz == FALSE){
        text(bx$group, bx$out, as.character(out.txt[out.row]), pos=4, cex=0.7)
      } else {
        text(bx$out, bx$group, as.character(out.txt[out.row]), pos=3, cex=0.7)
      }
    }
  }
  par(.pardef)

  if(type != "none"){
    message(paste0("========================\n",
                   "Note that the data have been equalized with \"type\" set ",
                   "to \"",type,"\".\n",
                   "========================\n"))
  }
}
mgimond/tukeyedar documentation built on March 19, 2024, 8:44 a.m.