stat0002: Introduction to Probability and Statistics

Documented in box_plot qqexp scatter scatter_hist

# ================================ box_plot ===============================

#' Box plots
#'
#' Produce box-and-whisker plot(s) of the given (grouped) values, with an
#' option (using the argument \code{type}) to change the estimator used
#' to estimate the quartiles.  The only argument described below is
#' \code{type}.  For details of the other arguments see the
#' \strong{Arguments} section of \code{\link[graphics]{boxplot}}.
#'
#' @section \strong{Usage}:
#'   See the \strong{Usage} section of the \code{\link[graphics]{boxplot}}
#'   documentation.
#'
#' @param type an integer between 1 and 9 selecting one of the nine quantile
#'   algorithms detailed in the \strong{Details} section of the
#'   \code{\link[stats]{quantile}} documentation.  The default is
#'   \code{type = 6}.
#' @details See the \strong{Details} section of the \code{\link{boxplot}}
#'   documentation.
#' @return A list with the same contents as described in the \strong{Value}
#'   section of the \code{\link{boxplot}} documentation.
#' @examples
#' birth_times <- ox_births[, "time"]
#' box_plot(birth_times)
#' box_plot(birth_times, horizontal = TRUE)
#' @name box_plot
NULL
## NULL

#' @export
box_plot <- function(x, ...) {
  UseMethod("box_plot")
}

#' @export
box_plot.default <- function (x, ..., range = 1.5, width = NULL,
                              varwidth = FALSE, notch = FALSE, outline = TRUE,
                              names, plot = TRUE, border = graphics::par("fg"),
                              col = NULL, log = "",
                              pars = list(boxwex = 0.8, staplewex = 0.5,
                                          outwex = 0.5), horizontal = FALSE,
                              add = FALSE, at = NULL, type = 6) {
  args <- list(x, ...)
  namedargs <- if (!is.null(attributes(args)$names))
    attributes(args)$names != ""
  else rep_len(FALSE, length(args))
  groups <- if (is.list(x))
    x
  else args[!namedargs]
  if (0L == (n <- length(groups)))
    stop("invalid first argument")
  if (length(class(groups)))
    groups <- unclass(groups)
  if (!missing(names))
    attr(groups, "names") <- names
  else {
    if (is.null(attr(groups, "names")))
      attr(groups, "names") <- 1L:n
    names <- attr(groups, "names")
  }
  cls <- sapply(groups, function(x) class(x)[1L])
  cl <- if (all(cls == cls[1L]))
    cls[1L]
  else NULL
  for (i in 1L:n) groups[i] <- list(boxplot_stats(unclass(groups[[i]]),
                                                  range, type = type))
  stats <- matrix(0, nrow = 5L, ncol = n)
  conf <- matrix(0, nrow = 2L, ncol = n)
  ng <- out <- group <- numeric(0L)
  ct <- 1
  for (i in groups) {
    stats[, ct] <- i$stats
    conf[, ct] <- i$conf
    ng <- c(ng, i$n)
    if ((lo <- length(i$out))) {
      out <- c(out, i$out)
      group <- c(group, rep.int(ct, lo))
    }
    ct <- ct + 1
  }
  if (length(cl) && cl != "numeric")
    oldClass(stats) <- cl
  z <- list(stats = stats, n = ng, conf = conf, out = out,
            group = group, names = names)
  if (plot) {
    if (is.null(pars$boxfill) && is.null(args$boxfill))
      pars$boxfill <- col
    do.call("bxp", c(list(z, notch = notch, width = width,
                          varwidth = varwidth, log = log, border = border,
                          pars = pars, outline = outline,
                          horizontal = horizontal, add = add, at = at),
                     args[namedargs]))
    invisible(z)
  }
  else z
}

#' @export
box_plot.matrix <- function (x, use.cols = TRUE, ...) {
  groups <- if (use.cols) {
    split(c(x), rep.int(1L:ncol(x), rep.int(nrow(x), ncol(x))))
  }
  else split(c(x), seq(nrow(x)))
  if (length(nam <- dimnames(x)[[1 + use.cols]]))
    names(groups) <- nam
  invisible(box_plot(groups, ...))
}

#' @export
box_plot.formula <- function (formula, data = NULL, ..., subset,
                              na.action = NULL) {
  if (missing(formula) || (length(formula) != 3L))
    stop("'formula' missing or incorrect")
  m <- match.call(expand.dots = FALSE)
  if (is.matrix(eval(m$data, parent.frame())))
    m$data <- as.data.frame(data)
  m$... <- NULL
  m$na.action <- na.action
  m[[1L]] <- quote(stats::model.frame)
  mf <- eval(m, parent.frame())
  response <- attr(attr(mf, "terms"), "response")
  box_plot(split(mf[[response]], mf[-response]), ...)
}

#' Box Plot Statistics
#'
#' A copy of \code{\link[grDevices]{boxplot.stats}} that adds the argument
#' \code{type} to be passed to \code{\link[stats]{quantile}} when calculating
#' sample quantiles.
#' @param x a numeric vector for which the boxplot will be constructed
#'   (\code{\link{NA}}s and \code{\link{NaN}}s are allowed and omitted).
#' @param coef this determines how far the plot whiskers extend out from the
#'   box. If coef is positive, the whiskers extend to the most extreme data
#'   point which is no more than coef times the length of the box away from
#'   the box. A value of zero causes the whiskers to extend to the data
#'   extremes (and no outliers be returned).
#' @param do.conf,do.out logicals; if FALSE, the conf or out component
#'   respectively will be empty in the result.
#' @param type Argument \code{type} to \code{\link[stats]{quantile}}.
#' @export
boxplot_stats <- function (x, coef = 1.5, do.conf = TRUE, do.out = TRUE,
                           type = 6){
  if (coef < 0)
    stop("'coef' must not be negative")
  nna <- !is.na(x)
  n <- sum(nna)
  stats <- five_number(x, na.rm = TRUE, type = type)
  iqr <- diff(stats[c(2, 4)])
  if (coef == 0)
    do.out <- FALSE
  else {
    out <- if (!is.na(iqr)) {
      x < (stats[2L] - coef * iqr) | x > (stats[4L] + coef * iqr)
    }
    else !is.finite(x)
    if (any(out[nna], na.rm = TRUE))
      stats[c(1, 5)] <- range(x[!out], na.rm = TRUE)
  }
  conf <- if (do.conf)
    stats[3L] + c(-1.58, 1.58) * iqr / sqrt(n)
  list(stats = stats, n = n, conf = conf,
       out = if (do.out) x[out & nna] else numeric())
}

# ================================ scatter ===============================

#' Scatter plot with five number summary
#'
#' Produces a scatter plot in which the axes are labelled with
#' the respective five number summaries of the horizontal and vertical
#' axis data.
#'
#' @param x The argument \code{x} of \code{\link[graphics:plot.default]{plot}}.
#' @param y The argument \code{x} of \code{\link[graphics:plot.default]{plot}}.
#' @param ndec A numeric vector.  The numbers of decimal places to
#'   which to round the five number summaries.  If \code{ndec} is
#'   a scalar then this value is used for both axes.
#' @param type Argument \code{type} used in the call to
#'   \code{\link{five_number}} to estimate the 25\%, 50\% and 75\% quantiles.
#' @param na.rm A logical scalar.  If true, any \code{\link{NA}} and NaNs
#'   are removed before the sample quantiles are computed.
#' @param ... Further arguments to be passed to \code{plot}.
#' @return Nothing, just the plot.
#' @examples
#' x <- rnorm(100)
#' y <- rnorm(100)
#' scatter(x, y)
#' @export
scatter <- function(x, y, ndec = 1, type = 6, na.rm = FALSE, ...) {
  # Make sure that ndec has length 2
  ndec <- rep_len(ndec, 2)
  # Calculate five number summaries of x and y.
  fx <- five_number(x, type = type, na.rm = na.rm)
  fy <- five_number(y, type = type, na.rm = na.rm)
  # Round the five number summaries.
  lx <- round(fx, ndec)
  ly <- round(fy, ndec)
  # Produce the plot, but with no axes.
  graphics::plot(x, y, axes = FALSE, ...)
  # Add axes with only the five number summaries labelled.
  graphics::axis(1, at = fx, labels = lx)
  graphics::axis(2, at = fy, labels = ly)
  invisible()
}


# ============================== scatter_hist =============================

#' Scatter plot with marginal histograms
#'
#' Produces a scatter plot in which the axes are supplemented by
#' histograms of the marginal horizontal and vertical axis data.
#'
#' @param x The argument \code{x} of \code{\link[graphics:plot.default]{plot}}.
#' @param y The argument \code{x} of \code{\link[graphics:plot.default]{plot}}.
#' @param xbreaks A numeric vector.  Optional argument \code{breaks}
#'   to \code{\link[graphics]{hist}} when plotting the histogram
#'   on the horizontal axis.
#' @param ybreaks A numeric vector.  Optional argument \code{breaks}
#'   to \code{\link[graphics]{hist}} when plotting the histogram
#'   on the vertical axis.
#' @param ... Further arguments to be passed to \code{plot}.
#' @return Nothing, just the plot.
#' @examples
#' x <- rnorm(100)
#' y <- rnorm(100)
#' scatter_hist(x, y)
#' @export
scatter_hist <- function(x, y, xbreaks = NULL, ybreaks = NULL, ...) {
  # save default, for resetting...
  old_par <- graphics::par(no.readonly = TRUE)
  on.exit(graphics::par(old_par))
  # Extract any arguments supplied in ....
  user_args <- list(...)
  # If the user wants a log-scale on an axis make a transformation of
  # the data so that the histogram matches the scatterplot.
  if (!is.null(user_args$log)) {
    if (user_args$log == "x") {
      xh <- log(x)
      yh <- y
    }
    if (user_args$log == "y") {
      yh <- log(y)
      xh <- x
    }
    if (user_args$log == "xy") {
      xh <- log(x)
      yh <- log(y)
    }
  } else {
    xh <- x
    yh <- y
  }
  # create a layout to contain the main scatter plot and the histograms.
  nf <- graphics::layout(matrix(c(2, 0, 1, 3), 2, 2, byrow = TRUE), c(3, 1),
                         c(1, 3), TRUE)
  graphics::par(mar = c(4.5, 4.5, 1, 1), oma = c(0, 0, 0, 0))
  # Produce the scatter plot.
  graphics::plot(x, y, ...)
  graphics::par(mar = c(0, 3, 1, 1))
  # Produce the histogram for the horizontal axis.
  if (is.null(xbreaks)) {
    xhist <- graphics::hist(xh, plot = FALSE)
  } else {
    xhist <- graphics::hist(xh, breaks = xbreaks, plot = FALSE)
  }
  graphics::barplot(xhist$count, axes = FALSE, space = 0)
  graphics::par(mar = c(3, 0, 1, 1))
  # Produce the histogram for the vertical axis.
  if (is.null(ybreaks)) {
    yhist <- graphics::hist(yh, plot = FALSE)
  } else {
    yhist <- graphics::hist(yh, breaks = ybreaks, plot = FALSE)
  }
  graphics::barplot(yhist$count, axes = FALSE, space = 0, horiz = TRUE)
  invisible()
}


# ================================== qqexp ====================================

#' Exponential Quantile-Quantile plots
#'
#' Produces a QQ plot to compare ordered sample data to corresponding
#' quantiles of an exponential distribution fitted to these data.
#'
#' @param y Sample data
#' @param statistic A character scalar. Selects the summary statistic used to
#'   estimate \eqn{\lambda}, either the sample mean or sample median.
#' @param type An integer scalar. If \code{statistic = "median"} then
#'   this is passed to \code{\link[stats]{quantile}} to select the type of
#'   sample quantile used to estimate the sample median. The default,
#'   \code{type = 6}, selects the estimator defined in the STAT002 notes.
#' @param envelopes Determines whether or not simulation envelopes should be
#'   added to the plot.  If \code{envelopes = FALSE} then no envelopes are
#'   added. If \code{envelopes} is a positive integer (a common choice is 19)
#'   then simulation envelopes based on this many simulated datasets are added.
#'   The limits of of the envelopes are indicated using short horizontal lines.
#' @param ... Optional arguments to be passed to
#'   \code{\link[graphics:plot.default]{plot}} such as \code{xlab},
#'   \code{ylab}, \code{main} and/or
#'   \code{\link[graphics:par]{graphical parameters}} such as \code{pch},
#'   \code{lty} and \code{lwd}, to control the appearance of the main plot.
#'   To change the appearance of the line of equality use the argument
#'   \code{line}.
#' @param line Determines whether or not a line of equality is superimposed on
#'   the plot.  If a line is required then must be a list, which can contain
#'   \code{\link[graphics:par]{graphical parameters}}
#'   passed to \code{\link[graphics]{abline}}, such as \code{col}, \code{lty}
#'   and \code{lwd} to control the appearance of the line. If \code{line} is
#'   not a list, for example, \code{line = 0}, then no line is superimposed.
#' @details The rate parameter \eqn{\lambda} of the exponential distribution
#'   is estimated using \code{1/mean(y, na.rm = TRUE)} if
#'   \code{statistic = "mean"} and
#'   \code{log(2)/quantile(y, probs = 0.5, na.rm = TRUE)} if
#'   \code{statistic = "median"}.  The ordered sample data are plotted against
#'   quantiles of this fitted exponential distribution. Specifically, the
#'   \eqn{i}th smallest sample observation is plotted against the
#'   \eqn{100 i / (n + 1)\%} theoretical exponential quantile, where \eqn{n} is
#'   the sample size. The plot is constrained to be square. A line of equality
#'   is superimposed on the plot.
#' @return The estimate of \eqn{\lambda}.
#' @seealso \code{\link[stats]{qqnorm}} to produce a normal QQ plot.
#' @examples
#' ## Australian Birth Times Data
#'
#' # Calculate the waiting times until each birth
#' waits <- diff(c(0, aussie_births[, "time"]))
#'
#' # Estimating lambda using the sample mean
#' qqexp(waits)
#'
#' # Change the appearance of the points and line
#' qqexp(waits, pch = 16, line = list(lty = 2, col = "blue", lwd = 2))
#'
#' # Add simulation envelopes
#' qqexp(waits, envelopes = 19)
#'
#' # Estimating lambda using the sample median
#' qqexp(waits, statistic = "median", envelopes = 19)
#' @export
qqexp <- function(y, statistic = c("mean", "median"), type = 6,
                  envelopes = FALSE, ...,
                  line = list(col = "black", lty = 1, lwd = 1)) {
  # Save default, for resetting...
  old_par <- graphics::par(pty = "s", no.readonly = TRUE)
  on.exit(graphics::par(old_par))
  # Check that a suitable summary statistic has been specified
  statistic <- match.arg(statistic)
  # Sample size
  n <- length(y)
  # Estimate lambda, using the required statistic
  if (statistic == "mean") {
    lambdahat <- 1 / mean(y, na.rm = TRUE)
  } else {
    lambdahat <- log(2) / stats::quantile(y, probs = 0.5, na.rm = TRUE)
  }
  # Exponential quantiles
  exp_quantiles <- stats::qexp((1:n) / (n + 1), rate = lambdahat)
  add_env <- FALSE
  # If envelopes are required then perform the simulation
  if (is.numeric(envelopes)) {
    if (!is.positiveinteger(envelopes)) {
      stop("''envelopes'' must be a positive integer")
    }
    # Create a n by envelopes matrix of simulated values
    sim_values <- matrix(stats::rexp(envelopes * n, rate = lambdahat),
                         ncol = envelopes, nrow = n)
    # Sort each column, i.e., each of the envelopes datasets
    sorted_sim_values <- apply(sim_values, 2, sort)
    # Find the minimum and maximum values for each row
    lower <- apply(sorted_sim_values, 1, min)
    upper <- apply(sorted_sim_values, 1, max)
    add_env <- TRUE
  } else {
    upper <- NULL
  }
  # Produce the plot: ordered data vs exponential quantiles
  my_plot_fn <- function(x, y, ..., xlab = my_xlab, ylab = my_ylab,
                         xlim = axis_range, ylim = axis_range,
                         main = "Exponential QQ plot") {
    graphics::plot(x, y, ..., xlab = xlab, ylab = ylab, xlim = xlim,
                   ylim = ylim, main = main)
    # Add a line of equality
    if (is.list(line)) {
      for_abline <- c(line, a = 0, b = 1)
      do.call(graphics::abline, for_abline)
    }
  }
  my_xlab <- paste0("Theoretical exponential quantiles")
  my_ylab <- "Sample quantiles"
  max_value <- max(exp_quantiles, y, upper)
  axis_range <- c(0, max_value)
  my_plot_fn(x = exp_quantiles, y = sort(y), ...)
  # Add simulation envelopes (if required)
  if (add_env) {
    graphics::points(x = exp_quantiles, y = lower, pch = "_")
    graphics::points(x = exp_quantiles, y = upper, pch = "_")
  }
  # Return the estimate of lambda
  names(lambdahat) <- "estimate of lambda"
  return(lambdahat)
}