tukeyedar: Tukey Inspired Exploratory Data Analysis Functions

Documented in eda_viol

#' @export
#' @import grDevices
#' @importFrom utils modifyList
#' @title Violin plot.
#'
#' @description \code{eda_viol} generates a violin plot.
#'
#' @param dat Single continuous variable vector, or a dataframe.
#' @param x  Continuous variable if \code{dat} is a dataframe, ignored otherwise.
#' @param grp Categorical Variable if \code{dat} is a dataframe, ignored
#'   otherwise.
#' @param p  Power transformation to apply to all values.
#' @param tukey Boolean determining if a Tukey transformation should be adopted
#'   (\code{TRUE}) or if a Box-Cox transformation should be adopted (\code{FALSE}).
#' @param show.par Boolean determining if the power transformation used with the
#'   data should be displayed in the plot's upper-right corner.
#' @param sq Boolean determining if the plot should be square.
#' @param inner Fraction of values that should be captured by the inner color
#'   band of the normal and density plots. Defaults to 0.6826 (inner 68\% of
#'   values).
#' @param bw Bandwidth parameter passed to the \code{density()} function.
#' @param kernel Kernel parameter passed to the \code{density()}
#'   function.
#' @param stat Statistical summary to display in the plot. Choice of "median",
#' "mean", "both" or "none".
#'   Defaults to both.
#' @param pch Point symbol type.
#' @param size Point size.
#' @param alpha Fill transparency (0 = transparent, 1 = opaque). Only applicable
#'   if \code{rgb()} is not used to define fill colors.
#' @param p.col Color for point symbol.
#' @param p.fill Point fill color passed to \code{bg} (Only used for \code{pch}
#'   ranging from 21-25).
#' @param grey Grey level to apply to plot elements such as axes, labels, etc...
#'   (0 to 1 with 1 = black).
#' @param col.ends Fill color for tail-ends of the distribution.
#' @param col.mid Fill color for mid-portion of the distribution.
#' @param col.ends.dens Fill color for tail-ends of the density distribution.
#' @param col.mid.dens Fill color for mid-portion of the density distribution.
#' @param offset A value (in x-axis units) that defines the gap between left and
#'   right side plots. Ignored if \code{dens} is \code{FALSE}.
#' @param tsize Size of plot title.
#' @param reorder Boolean determining if factors have to be reordered based
#'   on \code{reoder.stat}.
#' @param reorder.stat Choice of summary statistic to use for reordering plots.
#'  \code{reorder.stat} can be either \code{mean} or \code{median}.
#' @param xlab X variable label.
#' @param ylab Y variable label.
#' @param ... Note used.
#'
#' @return Does not return a value.
#'
#' @details This function will generate a violin plot from the data.  It
#'   implements the \code{stats::density} function when generating the shape of the
#'   plots. As such the \code{bw} and \code{kernel} arguments are passed on to
#'   the \code{stats::density} function.\cr
#'   \cr
#'   The plots have two fill colors: one for the inner band and the other for
#'   the outer band. The inner band shows the area of the curve that encompasses
#'   the desired fraction of mid-values defined by \code{inner}. By default, this
#'   value is 0.6826, or 68.26\% (this is roughly the percentage of values
#'   covered by +/- 1 standard deviations of a Normal distribution). The range
#'   is computed from the actual values and not from a fitted normal
#'   distribution. \cr
#'   \cr
#'   Measures of centrality are added to the plot. By default, both the mean
#'   (dashed line) and the median (solid line) are added to the plot. \cr
#'
#' @seealso \code{\link[stats]{density}}
#'
#' @examples
#'
#' # Explore a skewed distribution
#' set.seed(132)
#' x <- rnbinom(500, 10, .5)
#'
#' # Generate violin plot
#' eda_viol(x)
#'
#' # The inner band's range can be modified. Here, we view the interquartile
#' # range, the +/- 1 standard deviation range and the inner 95% range)
#' OP <- par(mfrow = c(1,3))
#' invisible(sapply(c(0.5, 0.6826, 0.95),
#'       function(prop) eda_viol(x, inner = prop, tsize = 1,
#'                                  ylab = paste(prop*100,"% of values"))))
#' par(OP)
#'
#' # The bandwidth selector can also be specified
#' OP <- par(mfrow=c(2,3))
#'  invisible(sapply(c("SJ-dpi", "nrd0", "nrd", "SJ-ste", "bcv", "ucv" ),
#'        function(band) eda_viol(x, bw = band, tsize=0.9, size=0, offset=0.005,
#'                                   ylab = band)))
#' par(OP)
#'
#' # The bandwidth argument can also be passed a numeric value
#' # (bw = 0.75, 0.5 and 0.3)
#' OP <- par(mfrow=c(1,3))
#' invisible(sapply(c(0.75, 0.5, 0.3 ),
#'        function(band) eda_viol(x, bw = band, tsize=1,size=.5, offset=0.01,
#'                                   ylab = band)))
#' par(OP)
#'
#' # Examples of a few kernel options
#' OP <- par(mfrow=c(1,3))
#' invisible(sapply(c("gaussian", "optcosine", "rectangular" ),
#'       function(k) eda_viol(x, kernel = k, tsize=1, size=.5, offset=0.01,
#'                                ylab = k)))
#' par(OP)
#'
#' # Another example where data are passed as a dataframe
#' set.seed(540)
#' dat <- data.frame(value = rbeta(20, 1, 50),
#'                  grp = sample(letters[1:3], 100, replace = TRUE))
#' eda_viol(dat, value, grp)
#'
#' # Points can be removed and the gap rendered narrower
#' eda_viol(dat, value, grp, size = 0, offset = 0.01)
#'
#' # Gap can be removed all together
#' eda_viol(dat, value, grp, size = 0, offset = 0)
#'
#' # Remove both mean and medians
#' eda_viol(dat, value, grp, size = 0, offset = 0, stat = "none")
#'
#' # Color can be modified. Here we modify the density plot  fill colors
#' eda_viol(dat, value, grp, size = 0, offset = 0.01,
#'             col.ends.dens = "#A1D99B", col.mid.dens = "#E5F5E0")
#'
#' # A power transformation can be applied to the data. Here
#' # we'll apply a log transformation
#' eda_viol(dat, value, grp, p = 0)


eda_viol <- function(dat, x=NULL, grp=NULL, p = 1,  tukey = FALSE,
                        show.par = TRUE, sq = FALSE, inner = 0.6826,
                        bw = "SJ-dpi", kernel = "gaussian", stat = "both",
                        pch = 16, size = 0.8, alpha = 0.3, p.col = "grey50",
                        p.fill = "grey80",grey = 0.6,
                        col.ends = "grey90", col.mid = "#EBC89B",
                        col.ends.dens = "grey90" , col.mid.dens = "#EBC89B",
                        offset = 0.02, tsize=1.5, reorder = FALSE,
                        reorder.stat = median,
                        xlab = NULL, ylab = NULL, ...){

  # Check for invalid arguments
  input <- names(list(...))
  check <- input %in% names(formals(cat))
  if (any(!check)) warning(sprintf("%s is not a valid argument.",
                                   paste(input[!check], collapse = ", ")))

  # Prep the data if input is dataframe
  if("data.frame" %in% class(dat)) {
    if(is.null(xlab)){
      xlab = substitute(grp)
    }
    grp <- eval(substitute(grp), dat)
  }

  # Prep the data if input is a vector
  if (is.vector(dat)) {
    grp <- rep(1, length(dat))
    x   <- dat
    if(is.null(ylab)){
      ylab = substitute(dat)
    }
  } else if (is.data.frame(dat) & is.null(grp)) {
    if(is.null(x)) stop("You must must specify an x variable from the dataframe.")
    if(is.null(ylab)){
      ylab = substitute(x)
    }
    grp <- rep(1, length(dat))
    x <- eval(substitute(x), dat)
  } else if (is.data.frame(dat) & !is.null(grp)){
    if(is.null(ylab)){
      ylab = substitute(x)
    }
    grp <- eval(substitute(grp), dat)
    x   <- eval(substitute(x), dat)
  } else {
    stop("You must specify a vector or a data frame.")
  }

  # Remove missing values from the data
  which_na <- which(is.na(x))
  if(length(which_na > 0)){
    x <- x[-which_na]
    grp <- grp[-which_na]
    warning(cat(length(which_na),"rows were removed due to NAs being present.\n"))
  }

  if( !stat %in% c("none", "mean", "median", "both"))
    stop("The stat argument must be either \"none\", \"median\", \"mean\", or \"both\"")

  # Set plot elements color
  plotcol <- rgb(1-grey, 1-grey, 1-grey)

  # Set point color parameters.
  if(!is.null(alpha)){
    if(p.col %in% colors() & p.fill %in% colors() ){
      p.col  <- adjustcolor( p.col,  alpha.f = alpha)
      p.fill <- adjustcolor( p.fill, alpha.f = alpha)
    }
  }

  # Re-express data if required
  if(p != 1L){
    x <- eda_re(x, p = p, tukey = tukey)
    x.nan <- is.na(x)
    if( any(x.nan)){
      x <- x[!x.nan]
      grp <- grp[!x.nan]
      warning(paste("\nRe-expression produced NaN values. These observations will",
                    "be removed from output. This will result in fewer points",
                    "in the ouptut."))
    }

  }

  # Remove groups that have just one value
  sngl_val <- ave(1:length(x), grp, FUN = function(x) length(x) == 1)
  x <- x[!sngl_val]
  grp <- grp[!sngl_val]
  if(TRUE %in% sngl_val){
    warning("\nSome groups were removed for having just one value.")
  }

  # Get quantiles from inner range
  lower <- (1 - inner) / 2
  upper <- 1 -lower

  grp <- factor(grp)

  # Reorder levels if requested
  if(reorder == TRUE && length(levels(grp)) > 1){
    ord.stat <- tapply(x, grp, reorder.stat )
    new_levels <- names(sort(ord.stat))
    grp <- factor(grp, levels = new_levels)
  }

  # Get rank number for factor
  fac.order <- as.numeric(grp)

  # Get group means
  means <- sapply(split(x,grp), function(x) mean(x, na.rm = TRUE))

  # Create list of density distributions by group
#  grp_unique <- unique(grp)
  grp_unique <- levels(grp)
  xi <- split(x, grp)
  out2 <- lapply(split(x, grp), function(x)
  {dnsty <- stats::density(x, n=120, bw = bw, kernel = kernel)
  densx <- dnsty$x
  densy <- dnsty$y
  data.frame(densx,densy)})

  # Get median
  m <- lapply(split(x, grp), function(x) median(x, na.rm = TRUE) )

  # Get upper/lower values from quantiles
  q <- lapply(split(x, grp), function(x) quantile(x,c(lower, upper) ))

  # Get axes range
  dx_rng <- range(unlist(lapply(out2, function(x) x$densx)))
  dn_rng <- range(unlist(lapply(out2, function(x) x$densx)))
  dy2_rng <- range(unlist(lapply(out2, function(x) x$densy)))

  out2 <- lapply(out2, function(df){ df$densy <- (df$densy / dy2_rng[2] ) * 0.45
                return(df)})

  # Generate plots ----

  # Get lines-to-inches ratio
  in2line <- ( par("mar") / par("mai") )[2]

  # Create a dummy plot to extract y-axis labels
  pdf(NULL)
  plot(x = NULL, y = NULL, type = "n", xlab = "", ylab = "", xaxt = "n",
       xlim=c(1 - 0.4, length(grp_unique) + 0.4), ylim = dx_rng,  yaxt='n',
       main = NULL)
  #  y.labs <- range(axTicks(2))
  y.wid <- max( strwidth( axTicks(2), units="inches")) * in2line + 1.2
  dev.off()

  # Set plot parameters
  if (sq == TRUE){
    .pardef <- par(pty = "s", col = plotcol, mar = c(3,y.wid,3.3,1))
  } else {
    .pardef <- par( col = plotcol, mar = c(3,y.wid,3.3,1))
  }
  on.exit(par(.pardef))

  # Base plot
  plot(x = 1:length(grp_unique), y = NULL, type = "n", xlab = "", ylab = "", xaxt = "n",
       xlim=c(1 - 0.45, length(grp_unique) + 0.45), ylim = dx_rng,  yaxt='n',
       main = NULL)

  # Add y-label and title
  #mtext(ylab,  side=3, line = 1, col=plotcol, adj = 0, padj = 0.5, cex=tsize - 0.3)

  # Get plot specs
  lbl_width <- strwidth(ylab, units = "inches", cex=tsize - 0.3)
  mar_width <- par("mai")[2]
  loc <- par("usr") # in XY coordinates
  xscl <- (loc[2] - loc[1]) / par("pin")[1]
  # Place y-label
  if(lbl_width/2 > mar_width * 0.6){
    xloc <- loc[1] + (lbl_width/2 - mar_width * 0.6) * xscl
  } else {
    xloc <- loc[1]
  }
  text(xloc, loc[4], labels = ylab, col=plotcol, cex=tsize - 0.3,
       xpd = TRUE, pos = 3, offset = 1)


  # Add x-axes labels if more than 1 group
  if (length(grp_unique) > 1 ){
    axis(1,col=plotcol, col.axis=plotcol, at = 1:length(grp_unique), padj = -0.8,
         grp_unique)
  }

  # Add x-axis title
  title(xlab = xlab, line =1.8, col.lab=plotcol)

  # Add y-axis values
  axis(2,col=plotcol, col.axis=plotcol, labels=TRUE, las=1, hadj = 0.9,
       tck = -0.02)

  # Add polygons and points
  for(i in 1:length(out2)){
    # x2  <- out[[i]]$dn
    # y2  <- out[[i]]$dx
    x3  <- out2[[i]]$densy
    y3  <- out2[[i]]$densx
    xi2  <- xi[[i]]


    # Plot polygons
    polygon(c(-x3 + i - offset, rep(i- offset, length(x3))), c(y3, rev(y3)),
            col = col.ends.dens, border = NA)
    polygon(c(x3 + i + offset, rep(i+ offset, length(x3))), c(y3, rev(y3)),
            col = col.ends.dens, border = NA)

    # Get inner range of density plot
    q1 <- q[[i]][1] ; q2 <- q[[i]][2]
    xsd2 <- x3[(y3 >= q1) & (y3 <= q2)]
    ysd2 <- y3[(y3 >= q1) & (y3 <= q2) ]

    # Get mean and median line ends for density plot
    xmu <- approx(y3,x3, means[[i]][1])$y
    xm <- approx(y3,x3, m[[i]][1])$y # median

    # Plot inner range
    polygon(c(-xsd2 + i -offset, rep(i -offset, length(xsd2))), c(ysd2, rev(ysd2)),
            col = col.mid.dens, border = NA)
    polygon(c(xsd2 + i + offset, rep(i +offset, length(xsd2))), c(ysd2, rev(ysd2)),
            col = col.mid.dens, border = NA)

    # Plot points
    points(rep(i,length(xi2)), xi2, pch = pch, col = p.col, bg = p.fill, cex = size)

    # Plot central values
    if(stat == "both"){
      lines(x=c(0, xmu)+ i +offset, y=c(means[i], means[i]), lty = 2) # Mean
      lines(x=c(0, -xmu)+ i -offset, y=c(means[i], means[i]), lty = 2) # Mean
      lines(x=c(0, +xm) + i +offset, y=c(m[[i]][1], m[[i]][1])) # Median
      lines(x=c(0, -xm) + i -offset, y=c(m[[i]][1], m[[i]][1])) # Median
    } else if (stat == "mean") {
      lines(x=c(0, xmu)+ i +offset, y=c(means[i], means[i]), lty = 2) # Mean
      lines(x=c(0, -xmu)+ i -offset, y=c(means[i], means[i]), lty = 2) # Mean
    } else if (stat == "median") {
      lines(x=c(0, +xm) + i +offset, y=c(m[[i]][1], m[[i]][1])) # Median
      lines(x=c(0, -xm) + i -offset, y=c(m[[i]][1], m[[i]][1])) # Median
    }

  }

  if(show.par == TRUE){
    mtext(side = 3, text=paste0("p=",p), adj=1, cex = 0.65)
  }

  # Reset plot parameters and  output values
  par(.pardef)

}