R/eda_qq.R

Defines functions eda_qq

Documented in eda_qq

#' @export
#' @import grDevices
#' @import lattice
#' @importFrom utils modifyList
#' @title QQ and MD plots
#'
#' @description \code{eda_qq} Generates an empirical or Normal QQ plot as well
#' as a Tukey mean-difference plot.
#'
#' @param x  Vector for first variable or a dataframe.
#' @param y  Vector for second variable or column defining the continuous
#'   variable if \code{x} is a dataframe.
#' @param fac Column defining the grouping variable if \code{x} is a dataframe.
#' @param norm Boolean determining if a Normal QQ plot is to be generated.
#' @param p  Power transformation to apply to both sets of values.
#' @param tukey Boolean determining if a Tukey transformation should be adopted
#'   (FALSE adopts a Box-Cox transformation).
#' @param q.type An integer between 1 and 9 selecting one of the nine quantile
#'   algorithms. (See \code{quantile}tile function).
#' @param md Boolean determining if Tukey mean-difference plot should be
#'   generated.
#' @param fx Formula to apply to x variable. This is computed after any
#'   transformation is applied to the x variable.
#' @param fy Formula to apply to y variable. This is computed after any
#'   transformation is applied to the y variable.
#' @param plot Boolean determining if plot should be generated.
#' @param show.par Boolean determining if parameters such as power
#'   transformation or formula should be displayed.
#' @param grey Grey level to apply to plot elements (0 to 1 with 1 = black).
#' @param pch Point symbol type.
#' @param p.col Color for point symbol.
#' @param p.fill Point fill color passed to \code{bg} (Only used for \code{pch}
#'   ranging from 21-25).
#' @param size Point size (0-1)
#' @param alpha Point transparency (0 = transparent, 1 = opaque). Only
#'   applicable if \code{rgb()} is not used to define point colors.
#' @param q Boolean determining if grey quantile boxes should be plotted.
#' @param b.val Quantiles to define the quantile box parameters. Defaults to the
#'   IQR. Two values are needed.
#' @param l.val Quantiles to define the quantile line parameters. Defaults to
#'   the mid 75\% of values. Two values are needed.
#' @param xlab X label for output plot. Ignored if \code{x} is a dataframe.
#' @param ylab Y label for output plot. Ignored if \code{x} is a dataframe.
#' @param title Title to add to plot.
#' @param t.size Title size.
#' @param ... Not used
#'
#' @details When the function is used to generate an empirical QQ plot, the plot
#'   will displays the IQR via grey boxes for both x and y values. The box
#'   widths can be changed via the  \code{b.val} argument. The plot will also
#'   display the mid 75\% of values via light colored dashed lines. The line
#'   positions can be changed via the \code{l.val} argument. The middle dashed
#'   line represents each batch's median value. Console output prints the
#'   suggested multiplicative and additive offsets. See the QQ plot vignette for
#'   an introduction on its use and interpretation.\cr
#'   \cr
#'   The function can also be used to generate a Normal QQ plot when the
#'   \code{norm} argument is set to  \code{TRUE}. In such a case, the line
#'   parameters \code{l.val} are overridden and are set to +/- 1 standard
#'   deviations. Note that the "suggested offsets" output is disabled, nor
#'   can you generate an M-D version of the Normal QQ plot. Also note
#'   that the formula argument is ignored in this mode.
#'
#'
#' @returns Returns a list with the following components:
#'
#' \itemize{
#'   \item \code{x}: X values. May be interpolated to smallest quantile batch.
#'   Values will reflect power transformation defined in \code{p}.
#'   \item \code{b}: Y values. May be interpolated to smallest quantile batch.
#'   Values will reflect power transformation defined in \code{p}.
#'   \item \code{p}: Re-expression applied to original values.
#'   \item \code{fx}: Formula applied to x variable.
#'   \item \code{fy}: Formula applied to y variable.}
#'
#'
#' @examples
#'
#' # Passing data as a dataframe
#'  singer <- lattice::singer
#'  dat <- singer[singer$voice.part  %in% c("Bass 2", "Tenor 1"), ]
#'  eda_qq(dat, height, voice.part)
#'
#' # Passing data as two separate vector objects
#'  bass2 <- subset(singer, voice.part == "Bass 2", select = height, drop = TRUE )
#'  tenor1 <- subset(singer, voice.part == "Tenor 1", select = height, drop = TRUE )
#'
#'  eda_qq(bass2, tenor1)
#'
#'  # There seems to be an additive offset of about 2 inches
#'  eda_qq(bass2, tenor1, fx = "x - 2")
#'
#'  # We can fine-tune by generating the Tukey mean-difference plot
#'  eda_qq(bass2, tenor1, fx = "x - 2", md = TRUE)
#'
#'  # An offset of another 0.5 inches seems warranted
#'  # We can sat that overall, bass2 singers are 2.5 inches taller than  tenor1.
#'  # The offset is additive.
#'  eda_qq(bass2, tenor1, fx = "x - 2.5", md = TRUE)
#'
#'  # Example 2: Sepal width
#'  setosa <- subset(iris, Species == "setosa", select = Petal.Width, drop = TRUE)
#'  virginica <- subset(iris, Species == "virginica", select = Petal.Width, drop = TRUE)
#'
#'  eda_qq(setosa, virginica)
#'
#'  # The points are not completely parallel to the  1:1 line suggesting a
#'  # multiplicative offset. The slope may be difficult to eyeball. The function
#'  # outputs a suggested slope and intercept. We can start with that
#'  eda_qq(setosa, virginica, fx = "x *  1.7143")
#'
#'  # Now let's add the suggested additive offset.
#'  eda_qq(setosa, virginica, fx = "x *  1.7143  + 1.6286")
#'
#'  # We can confirm this value via the mean-difference plot
#'  # Overall, we have both a multiplicative and additive offset between the
#'  # species' petal widths.
#'  eda_qq(setosa, virginica, fx = "x *  1.7143 + 1.6286", md = TRUE)
#'
#'  # Function can also generate a Normal QQ plot
#'  eda_qq(bass2, norm = TRUE)


eda_qq <- function(x, y=NULL, fac = NULL, norm = FALSE, p = 1L, tukey = FALSE,
                   md = FALSE, q.type = 5, fx = NULL, fy = NULL, plot = TRUE,
                   show.par = TRUE, grey = 0.6, pch = 21, p.col = "grey50",
                   p.fill = "grey80", size = 0.8, alpha = 0.8, q = TRUE,
                   b.val = c(0.25,0.75), l.val = c(0.125, 0.875), xlab = NULL,
                   ylab = NULL, title = NULL, t.size = 1.2, ...) {

  # Parameters check
  if (length(b.val)!= 2) stop("The b.val argument must have two values.")
  if (length(l.val)!= 2) stop("The b.val argument must have two values.")
  if ("data.frame" %in% class(x) & norm == TRUE)
    stop("x needs to be a vector if norm=TRUE")
  if(norm == TRUE & md == TRUE)
    stop("A rotated version of Normal QQ plot is not yet implemented.")

  # Extract data ----
  if("data.frame" %in% class(x)){
    val <- eval(substitute(y), x)
    fac <- eval(substitute(fac), x)
    if (is.null(fac))
      stop("You need to pass a valid factor column to the fac parameter")
    g <- unique(fac)
    if( length(g) != 2){
      stop(paste("Column", fac, "has", length(g),
                 "unique values. It needs to have two exactly."))
    }
    x <- val[fac == g[1]]
    y <- val[fac == g[2]]
    xlab <- g[1]
    ylab <- g[2]

  } else {

    if(is.null(xlab)){
      if( norm == FALSE) {
        xlab = as.character(substitute(x))
      } else {
        xlab = "Normal quantile"
      }
    }

    if(is.null(ylab) & norm == FALSE){
      ylab = as.character(substitute(y))
    } else if (is.null(ylab) & norm == TRUE){
      ylab = substitute(x)
    }
  }

  # Re-express data if required
  x <- eda_re(x, p = p, tukey = tukey)
  x.isna <- is.na(x)
  rm.nan <- ifelse( any(x.isna), 1 , 0)
  if(norm == FALSE) {
    y <- eda_re(y, p = p, tukey = tukey)
    y.isna <- is.na(y)
    rm.nan <- ifelse( any(y.isna), 1 , 0) + rm.nan
  }

  # Re-expression may produce NaN values. Output warning if TRUE
  if( rm.nan > 0 ) {
    warning(paste("\nRe-expression produced NaN values. These observations will",
                  "be removed from output. This will result in fewer points",
                  "in the ouptut."))
    if( norm == FALSE){
      bad <- x.isna | y.isna
      x <- x[!bad]
      y <- y[!bad]
    } else {
      x <- x[!x.isna]
    }
  }


  # Compute x and y values ----
  if(norm == FALSE){
    zl <- qqplot(x, y, plot.it = FALSE, qtype = q.type) # Interpolate (if needed)
  } else {
    zl <- qqnorm(x, plot.it = FALSE)  # Get Normal quantiles
  }

  zd <- data.frame(y = zl$y - zl$x, x = zl$x)
  zd$x <- zd$x - median(zd$x); zd$y <- zd$y - median(zd$y)

  # Get suggested multiplier and offset (only for empirical data)
  if(!norm == TRUE){
    z <- eda_rline(zd,x,y)
    x.multi <-  1 + z$b
    x.add <- median(zl$y - sort(zl$x * x.multi))
  }


  # Apply formula if present
  if(!is.null(fx) & !is.null(fy) & norm == FALSE)
    warning(paste("You should apply a formula to just one axis.\n",
                  "You are applying the formula", fx,"to the x-axis",
                  "and the formula",fy ,"to the y-axis."))
  if(!is.null(fx) & norm == FALSE){
    fx <- tolower(fx)
    if(!grepl("x", fx)) stop("Formula fx does not contain \"x\" variable.")
    x <- eval(parse(text=fx))
  }
  if(!is.null(fy) & norm == FALSE){
    fy <- tolower(fy)
    if(!grepl("y", fy)) stop("Formula fx does not contain \"y\" variable.")
    y <- eval(parse(text=fy))
  }
  if( (!is.null(fx) | !is.null(fy)) & norm == TRUE)
    warning("Formula is ignored when generating a Normal QQ plot")

  # Set plot elements color
  plotcol <- rgb(1-grey, 1-grey, 1-grey)

  # Set point color parameters.
  if(!is.null(alpha)){
    if(p.col %in% colors() & p.fill %in% colors() ){
      p.col  <- adjustcolor( p.col,  alpha.f = alpha)
      p.fill <- adjustcolor( p.fill, alpha.f = alpha)
    }
  }

  # Generate qqplot using base function
  if(norm != TRUE){
    qq <- qqplot(x,y, plot.it = FALSE, qtype = q.type)
  } else {
    qq <- qqnorm(x, plot.it = FALSE)
  }

  x <- qq$x
  y <- qq$y

  # Generate plots ----

  # Get lines-to-inches ratio
  in2line <- ( par("mar") / par("mai") )[2]

  # Create a dummy plot to extract y-axis labels
  pdf(NULL)
  plot(x = x, y = y, type = "n", xlab = "", ylab = "", xaxt = "n",
       yaxt='n', main = NULL)
#  y.labs <- range(axTicks(2))
  y.wid <- max( strwidth( axTicks(2), units="inches")) * in2line + 1.2
  dev.off()

  # Compute the margin width (returned in inches before converting to lines)
  # y.wid <- max( strwidth( y.labs[1], units="inches"),
  #               strwidth( y.labs[2], units="inches")) * in2line + 1

  # Set plot parameters
  .pardef <- par(pty = "s", col = plotcol, mar = c(3,y.wid,3,1))
  on.exit(par(.pardef))

  # QQ plot ----
  if(plot == TRUE & md == FALSE){


    # Get quantile parameters
    qx <- quantile(x, b.val, qtype = q.type)
    qy <- quantile(y, b.val, qtype = q.type)
    if(norm != TRUE){
      lx <- quantile(x, l.val, qtype = q.type)
      ly <- quantile(y, l.val, qtype = q.type)
    } else {
      lx <- c(-1,1)
      ly <- c(1,-1) * sd(y) + mean(y)
    }

    medx <- median(x)
    medy <- median(y)

    # Generate plot
    xylim <- range(x,y)

    # QQ plot: Empirical ----
    if(norm == FALSE){
      plot( x=x, y=y,  ylab=NA, las=1, yaxt='n', xaxt='n', xlab=NA,
            col.lab=plotcol, pch = pch, col = p.col, bg = p.fill, cex = size,
            xlim = xylim, ylim = xylim)

      # QQ plot: Normal ----
    } else {
      plot( x=x, y=y,  ylab=NA, las=1, yaxt='n', xaxt='n', xlab=NA,
            col.lab=plotcol, pch = pch, col = p.col, bg = p.fill, cex = size)
    }

    box(col=plotcol)
    axis(1,col=plotcol, col.axis=plotcol, labels=TRUE, padj = -0.5)
    axis(2,col=plotcol, col.axis=plotcol, labels=TRUE, las=1, hadj = 0.9,
         tck = -0.02)

    mtext(ylab, side=3, adj= -0.06 ,col=plotcol,  padj = -1.2)
    title(xlab = xlab, line =1.8, col.lab=plotcol)

    if(!is.null(title)){
      title(main = title, line =1.2, col.main=plotcol, cex.main=t.size)
    }

    # Add empirical QQ line ----
    if(norm != TRUE){
      abline(0, 1, col = plotcol)

      # Add Normal QQ line ----
    } else {
      probs = c(0.25, 0.75)
      yy <- as.vector(quantile(y, probs, names = FALSE, type = q.type))
      xx <- qnorm(probs)
      slope <- diff(yy) / diff(xx)
      int <- yy[[1L]] - slope * xx[[1L]]
      abline(int, slope, col = plotcol)
    }

    # Add quantile boundary lines ----
    abline(v = medx, col = "grey80", lty = 2)
    abline(h = medy, col = "grey80", lty = 2)

    # Add core boxes ----
    sq <- par("usr") # get plot corners
    if(q == TRUE){
      rect(xleft = qx[1], xright = qx[2], ybottom=sq[3],ytop=sq[4],
           col = rgb(0,0,0,0.05), border = NA)
      rect(xleft = sq[1], xright = sq[2], ybottom=qy[1],ytop=qy[2],
           col = rgb(0,0,0,0.05), border = NA)
      abline(v = lx, lty = 3, col = "grey90")
      abline(h = ly, lty = 3, col = "grey90")
    }

    # Add power/formula parameters to plot
    if(norm != TRUE & show.par == TRUE){
      params <- gsub(";\\s*;?\\s*$", "",  paste0("p=", p,"; ",fx,"; ",fy))
      params <- gsub("\\; \\;", ";", params)
      mtext(side = 3, text=params, adj=1, cex = 0.65)
    } else if (show.par == TRUE) {
      mtext(side = 3, text=paste0("p=",p), adj=1, cex = 0.65)
    }


    #  M-D plot ----
  } else if(plot == TRUE & md == TRUE & norm == FALSE) {

    # Generate labels
    xlab2 <- paste("Mean of", xlab, "and", ylab)
    ylab2 <- paste(ylab,"-", xlab)

    # Compute m-d variables
    md.y  <- (y - x)
    md.x  <- (y + x) * 0.5

    # Get quantile parameters
    qy <- quantile(md.y, b.val, qtype = q.type)
    ly <- quantile(md.y, l.val, qtype = q.type)
    lx <- quantile(md.x, l.val, qtype = q.type)
    medx <- median(md.x)
    medy <- median(md.y)

    # Generate plot
    ylim <- range(md.y, 0)

    plot( x=md.x, y=md.y,  ylab=NA, las=1, yaxt='n', xaxt='n', xlab=NA,
          col.lab=plotcol, pch = pch, col = p.col, bg = p.fill, cex = size,
          ylim = ylim)
    box(col=plotcol)
    axis(1,col=plotcol, col.axis=plotcol, labels=TRUE, padj = -0.5)
    axis(2,col=plotcol, col.axis=plotcol, labels=TRUE, las=1, hadj = 0.7)
    mtext(ylab2, side=3, adj= -0.06 ,col=plotcol,  padj = -1.1)
    title(xlab = xlab2, line =1.8, col.lab=plotcol)
    if(!is.null(title)){
      title(main = title, line =1.2, col.main=plotcol, cex.main=t.size)
    }

    abline(h = 0,  col = plotcol)
    abline(v = medx, col = "grey80", lty = 2)
    abline(h = medy, col = "grey80", lty = 2)
    abline(h = ly, col = "grey90", lty = 3)
    abline(v = lx, col = "grey90", lty = 3)
    sq <- par("usr") # get plot corners
    if(q == TRUE){
      rect(xleft = sq[1], xright = sq[2], ybottom=qy[1],ytop=qy[2],
           col = rgb(0,0,0,0.05), border = NA)
    }
    if(show.par == TRUE){
      params <- gsub(";\\s*;?\\s*$", "",  paste0("p=", p,"; ",fx,"; ",fy))
      params <- gsub("\\; \\;", ";", params)
      mtext(side = 3, text=params, adj=1, cex = 0.65)
    }

  }

  # Reset plot parameters and  output values
  par(.pardef)
  if(!norm == TRUE){
    print(paste0("Suggested offsets:", "y = ", "x * ", round(x.multi,4),
                 " + (", round(x.add,4),")"))
  }

  invisible(list(x = x, y = y, p = p, fx = fx, fy = fy))

}
mgimond/tukeyedar documentation built on March 19, 2024, 8:44 a.m.