R/s216-applets.R

Defines functions regression_bootstrap_CI regression_test two_mean_bootstrap_CI two_mean_test two_proportion_bootstrap_CI two_proportion_test paired_bootstrap_CI paired_test paired_observed_plot one_proportion_bootstrap_CI one_proportion_test add_norm

Documented in add_norm one_proportion_bootstrap_CI one_proportion_test paired_bootstrap_CI paired_observed_plot paired_test regression_bootstrap_CI regression_test two_mean_bootstrap_CI two_mean_test two_proportion_bootstrap_CI two_proportion_test

#' Add normal curve to histogram
#'
#' This function will add a normal curve to a frequency or density histogram
#' using the raw data and histogram object as arguments.
#'
#' @param h Histogram object generated by \code{\link[graphics:hist]{hist}}.
#' @param data Vector of values used to generate the histogram.
#' @param freq Logical value indicating whether y-axis of
#'   histogram is frequency. Defaults to TRUE.
#' @param na.rm Logical value indicating whether NA values should
#'   be stripped before the computation proceeds. Defaults to TRUE.
#' @param ... Other \code{\link[graphics:par]{graphical parameters}}.
#'
#' @examples
#' x <- rnorm(30, 5, 2)
#' hi <- hist(x)
#' add_norm(h = hi, data = x)
#' @export

add_norm <- function(h, data, freq = TRUE, na.rm = FALSE,
                     lty = 1, lwd = 2, col = "purple", ...) {
  x <- seq(min(data), max(data), length = 200)
  y <- dnorm(x, mean(data, na.rm = na.rm), sd(data, na.rm = na.rm))
  if (freq == FALSE) {
    lines(x, y, lty = lty, lwd = lwd, col = col, ...)
  } else {
    y <- y * diff(h$mids[1:2]) * length(data)
    lines(x, y, lty = lty, lwd = lwd, col = col, ...)
  }
}



#' Simulation-based hypothesis test for one proportion
#'
#' This function will run a simulation-based hypothesis test for a
#' single proportion of successes.
#'
#' @param probability_success Value between 0 and 1
#'    representing the null hypothesis value for proportion.
#' @param sample_size Number of trials used to compute proportion.
#' @param number_repetitions Number of simulated samples.
#' @param summary_measure Name of summary measure to return from simulations.
#'    Allowed values are
#'    `"number"` for number of successes or
#'    `"proportion"` for proportion of successes.
#' @param as_extreme_as Value of observed statistic.
#'    Between 0 and 1 if `summary_measure` is `"proportion"`;
#'    an integer between 1 and `sample_size` if `summary_measure` is `"number"`.
#' @param direction Direction of alternative hypothesis.
#'    Allowed values are `"greater"`, `"less"`, or `"two-sided"`.
#' @param add_normal Logical value indicating whether to superimpose a normal
#'   curve on the histogram. Defaults to FALSE.
#'
#' @return Returns plot of distribution of simulated statistics,
#'    with values as or more extreme than specified
#'    value highlighted, and reports
#'    proportion of simulations as or more extreme than specified
#'    as subtitle on plot.
#'
#' @examples
#' one_proportion_test(
#'   probability_success = 0.5,
#'   sample_size = 150,
#'   summary_measure = "proportion",
#'   as_extreme_as = 0.65,
#'   direction = "greater",
#'   number_repetitions = 1000
#' )
#' @export

one_proportion_test <- function(probability_success = 0.5,
                                sample_size = 5,
                                summary_measure = c("number", "proportion"),
                                as_extreme_as,
                                direction = c("greater", "less", "two-sided"),
                                number_repetitions = 1,
                                add_normal = FALSE) {
  if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
    stop("Number of repetitions must be positive and integer valued.")
  }
  if (sample_size < 1 | !(sample_size %% 1 == 0)) {
    stop("Sample size must be positive and integer valued.")
  }
  if (probability_success < 0 | probability_success > 1) {
    stop("Probability of success must be between zero and one.")
  }
  if (!(direction %in% c("greater", "less", "two-sided"))) {
    stop("Direction must be one of 'greater',  'less', or 'two-sided'.")
  }
  if (!(summary_measure %in% c("number", "proportion"))) {
    stop("Summary measure must be either
         'number' or 'proportion' of successes.")
  }
  if (is.null(as_extreme_as)) {
    stop("Must enter cutoff value for 'as_extreme_as'.")
  }

  # Simulate data
  number_heads <- rbinom(
    number_repetitions,
    sample_size,
    probability_success
  )
  range_heads <- max(number_heads) - min(number_heads)
  success_tab <- table(number_heads)

  # Set plot margins
  original_mar <- par()$mar
  par(mar = c(5, 3, 3, 5))

  # Not two-sided:
  if (!(direction == "two-sided")) {
    # Calculate p-value
    if (summary_measure == "number") {
      proportion_extreme <- ifelse(direction == "greater",
        mean(number_heads >= as_extreme_as),
        mean(number_heads <= as_extreme_as)
      )
    } else {
      proportion_extreme <- ifelse(direction == "greater",
        mean(number_heads / sample_size >= as_extreme_as),
        mean(number_heads / sample_size <= as_extreme_as)
      )
    }

    # Plot
    if (summary_measure == "number") { # Number of successes; not two-sided
      plot(0, 0, "n",
        frame.plot = FALSE,
        xlim = c(
          min(
            min(number_heads),
            as_extreme_as - range_heads / 5
          ),
          max(
            max(number_heads),
            as_extreme_as + range_heads / 5
          )
        ),
        ylim = c(0, max(success_tab) + 3),
        xlab = "Number of Successes",
        ylab = "",
        yaxt = "n",
        sub = paste("Proportion of Samples = ",
          proportion_extreme * number_repetitions,
          "/", number_repetitions, " = ",
          round(proportion_extreme, 4),
          sep = ""
        )
      )
      for (i in 1:length(success_tab)) {
        lines(rep(names(success_tab)[i], 2),
          c(0, success_tab[i]),
          lwd = 5,
          col = ifelse(direction == "greater",
            ifelse(as.numeric(names(success_tab[i])) >=
              as_extreme_as, "blue", "black"),
            ifelse(as.numeric(names(success_tab[i])) <=
              as_extreme_as, "blue", "black")
          )
        )
      }
      abline(
        v = ifelse(direction == "greater",
          as_extreme_as - 0.5, as_extreme_as + 0.5
        ),
        col = "blue", lwd = 2
      )

      cutoff_label <- ifelse(direction == "greater",
        paste(">=", as_extreme_as),
        paste("<=", as_extreme_as)
      )

      text(ifelse(direction == "greater",
        as_extreme_as - 0.25, as_extreme_as + 0.25
      ),
      max(success_tab) + 3,
      labels = cutoff_label,
      pos = ifelse(direction == "greater", 4, 2)
      )

      mtext(paste(
        "Mean =",
        round(mean(number_heads, na.rm = T), 3), "\n",
        "SD =",
        round(sd(number_heads, na.rm = T), 3)
      ),
      line = -3, side = 4, las = 2
      )
    } else { # Proportion of successes; not two-sided
      plot(0, 0, "n",
        frame.plot = FALSE,
        xlim = c(
          min(
            min(number_heads) / sample_size,
            as_extreme_as - range_heads / (5 * sample_size)
          ),
          max(
            max(number_heads) / sample_size,
            as_extreme_as + range_heads / (5 * sample_size)
          )
        ),
        ylim = c(0, max(success_tab) + 3),
        xlab = "Proportion of Successes",
        ylab = "",
        yaxt = "n",
        sub = paste("Proportion of Samples = ",
          proportion_extreme * number_repetitions,
          "/", number_repetitions, " = ",
          round(proportion_extreme, 4),
          sep = ""
        )
      )
      for (i in 1:length(success_tab)) {
        lines(rep(as.numeric(names(success_tab)[i]) / sample_size, 2),
          c(0, success_tab[i]),
          lwd = 5,
          col = ifelse(direction == "greater",
            ifelse(as.numeric(names(success_tab[i])) / sample_size
            >= as_extreme_as, "blue", "black"),
            ifelse(as.numeric(names(success_tab[i])) / sample_size
            <= as_extreme_as, "blue", "black")
          )
        )
      }
      abline(
        v = ifelse(direction == "greater",
          as_extreme_as - 0.5 / sample_size,
          as_extreme_as + 0.5 / sample_size
        ),
        col = "blue", lwd = 2
      )

      cutoff_label <- ifelse(direction == "greater",
        paste(">=", as_extreme_as),
        paste("<=", as_extreme_as)
      )
      text(ifelse(direction == "greater",
        as_extreme_as - 0.25 / sample_size,
        as_extreme_as + 0.25 / sample_size
      ),
      max(success_tab) + 3,
      labels = cutoff_label,
      pos = ifelse(direction == "greater", 4, 2)
      )

      mtext(paste(
        "Mean =",
        round(mean(number_heads / sample_size, na.rm = T), 3), "\n",
        "SD =",
        round(sd(number_heads / sample_size, na.rm = T), 3)
      ),
      line = -3, side = 4, las = 2
      )
    }
  } else if (summary_measure == "number") { # Number of successes; two-sided
    if (as_extreme_as > probability_success * sample_size) {
      upper <- as_extreme_as
      lower <- max(0, 2 * probability_success * sample_size - as_extreme_as)
    } else {
      lower <- as_extreme_as
      upper <- min(sample_size, 2 * probability_success * sample_size - as_extreme_as)
    }
    proportion_extreme <- mean(number_heads <= lower | number_heads >= upper)

    plot(0, 0, "n",
      frame.plot = FALSE,
      xlim = c(
        min(min(number_heads), lower - range_heads / 5),
        max(max(number_heads), upper + range_heads / 5)
      ),
      ylim = c(0, max(success_tab) + 3),
      xlab = "Number of Successes",
      ylab = "",
      yaxt = "n",
      sub = paste("Proportion of Samples = ",
        proportion_extreme * number_repetitions,
        "/", number_repetitions, " = ",
        round(proportion_extreme, 4),
        sep = ""
      )
    )
    for (i in 1:length(success_tab)) {
      lines(rep(names(success_tab)[i], 2), c(0, success_tab[i]),
        lwd = 5,
        col = ifelse(as.numeric(names(success_tab[i])) <= lower |
          as.numeric(names(success_tab[i])) >= upper,
        "blue", "black"
        )
      )
    }
    abline(
      v = c(lower + 0.5, upper - 0.5),
      col = "blue", lwd = 2
    )
    cutoff_label <- c(paste("<=", lower), paste(">=", upper))
    text(c(lower - 0.25, upper + 0.25),
      rep(max(success_tab) + 3, 2),
      labels = cutoff_label,
      pos = c(4, 2)
    )
    mtext(paste(
      "Mean =",
      round(mean(number_heads, na.rm = T), 3), "\n",
      "SD =",
      round(sd(number_heads, na.rm = T), 3)
    ),
    line = -3, side = 4, las = 2
    )
  } else { # Proportion of successes; two-sided
    if (as_extreme_as > probability_success) {
      upper <- as_extreme_as
      lower <- max(0, 2 * probability_success - as_extreme_as)
    } else {
      lower <- as_extreme_as
      upper <- min(1, 2 * probability_success - as_extreme_as)
    }
    proportion_extreme <- mean(number_heads <= lower * sample_size |
      number_heads >= upper * sample_size)

    plot(0, 0, "n",
      frame.plot = FALSE,
      xlim = c(
        min(
          min(number_heads) / sample_size,
          lower - range_heads / (5 * sample_size)
        ),
        max(
          max(number_heads) / sample_size,
          upper + range_heads / (5 * sample_size)
        )
      ),
      ylim = c(0, max(success_tab) + 3),
      xlab = "Proportion of Successes",
      ylab = "",
      yaxt = "n",
      sub = paste("Proportion of Samples = ",
        proportion_extreme * number_repetitions,
        "/", number_repetitions, " = ",
        round(proportion_extreme, 4),
        sep = ""
      )
    )
    for (i in 1:length(success_tab)) {
      lines(rep(as.numeric(names(success_tab)[i]) / sample_size, 2),
        c(0, success_tab[i]),
        lwd = 5,
        col = ifelse(as.numeric(names(success_tab[i])) / sample_size <= lower |
          as.numeric(names(success_tab[i])) / sample_size >= upper,
        "blue", "black"
        )
      )
    }
    abline(
      v = c(lower + 0.5 / sample_size, upper - 0.5 / sample_size),
      col = "blue", lwd = 2
    )

    cutoff_label <- c(paste("<=", lower), paste(">=", upper))
    text(c(lower + 0.25 / sample_size, upper - 0.25 / sample_size),
      rep(max(success_tab) + 3, 2),
      labels = cutoff_label,
      pos = c(2, 4)
    )

    mtext(paste(
      "Mean =",
      round(mean(number_heads / sample_size, na.rm = T), 3), "\n",
      "SD =",
      round(sd(number_heads / sample_size, na.rm = T), 3)
    ),
    line = -3, side = 4, las = 2
    )
  }

  # Add normal curve
  if (add_normal == TRUE) {
    if (summary_measure == "number") {
      h <- hist(number_heads,
        breaks = seq(min(number_heads), max(number_heads), by = 1),
        plot = FALSE
      )
      add_norm(h, number_heads)
    } else {
      p <- number_heads / sample_size
      h <- hist(p,
        breaks = seq(min(p), max(p), by = 1 / sample_size),
        plot = FALSE
      )
      add_norm(h, p)
    }
  }

  # Set plot margins back
  par(mar = original_mar)
}


#' Bootstrap confidence interval for one proportion
#'
#' This function will create a bootstrap confidence interval for a
#' single proportion of successes.
#'
#' @param sample_size Number of trials used to compute proportion.
#' @param number_successes How many successes were observed in those trials?
#' @param number_repetitions Number of bootstrapped resamples.
#' @param confidence_level Confidence level for interval in decimal form.
#'   Defaults to 0.95 (95% confidence interval).
#'
#' @return Returns plot of distribution of bootstrapped statistics,
#'   with values as or more extreme than percentile confidence interval range
#'   highlighted, and reports confidence interval as subtitle on plot.
#'
#' @examples
#' one_proportion_bootstrap_CI(
#'   sample_size = 150,
#'   number_successes = 98,
#'   confidence_level = 0.99,
#'   number_repetitions = 1000
#' )
#' @export

one_proportion_bootstrap_CI <- function(sample_size,
                                        number_successes,
                                        confidence_level = 0.95,
                                        number_repetitions = 100) {
  if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
    stop("Number of repetitions must be positive and integer valued.")
  }
  if (number_successes < 1 | !(number_successes %% 1 == 0)) {
    stop("Number of successes must be positive and integer valued.")
  }
  if (sample_size < 1 | !(sample_size %% 1 == 0)) {
    stop("Sample size must be positive and integer valued.")
  }
  if (confidence_level < 0 | confidence_level > 1) {
    stop("Confidence level must be given in decimal form.")
  }

  sim_pop <- rep(0, sample_size)
  sim_pop[1:number_successes] <- 1

  sim_props <- rep(NA, number_repetitions)
  for (i in 1:number_repetitions) {
    sim_props[i] <- mean(sample(sim_pop, sample_size, replace = TRUE))
  }

  success_tab <- table(sim_props)

  low_ci <- quantile(sim_props, (1 - confidence_level) / 2)
  high_ci <- quantile(sim_props, 1 - (1 - confidence_level) / 2)

  plot(0, 0, "n",
    frame.plot = FALSE,
    xlim = c(
      min(
        min(sim_props),
        low_ci - (max(sim_props) - min(sim_props)) / 5
      ),
      max(
        max(sim_props),
        high_ci + (max(sim_props) - min(sim_props)) / 5
      )
    ),
    ylim = c(0, max(success_tab) + 3),
    xlab = "Bootstrapped Proportions",
    ylab = "",
    yaxt = "n",
    sub = paste0(
      "Mean: ", round(mean(sim_props), 3), ", SD: ",
      round(sd(sim_props), 3), ", ", 100 * confidence_level, "% CI: (",
      round(low_ci, 3), ", ", round(high_ci, 3), ")"
    ),
    cex.sub = 0.75
  )
  for (i in 1:length(success_tab)) {
    lines(rep(as.numeric(names(success_tab)[i]), 2), c(0, success_tab[i]),
      lwd = 5,
      col = ifelse(as.numeric(names(success_tab[i])) <= low_ci |
        as.numeric(names(success_tab[i])) >= high_ci, "blue", "black")
    )
  }
  abline(
    v = c(low_ci, high_ci),
    col = "blue", lwd = 2
  )
  cutoff_label <- c(
    paste(round(100 * (1 - confidence_level) / 2, 1), "percentile"),
    paste(round(100 * (1 - (1 - confidence_level) / 2), 1), "percentile")
  )
  text(c(low_ci, high_ci),
    rep(max(success_tab) + 3, 2),
    labels = cutoff_label,
    pos = c(2, 4), cex = .75
  )
}



#' Plots for observed matched pairs data
#'
#' This function will create two plots of a quantitative variable
#' and its differences for matched pairs data.
#'
#' @param data Data frame
#'   with values for each group in the first two columns.
#' @param which_first Which column is first in order of subtraction?
#'   `1` if subtracting second column from first (1 - 2);
#'   `2` if subtracting first column from second (2 - 1). Defaults to `1`.
#'
#' @return Returns plot of distribution of observed values,
#'   with means & standard deviations in groups and paired observations
#'   linked by a line segment, and histogram of differences.
#'
#' @examples
#' set.seed(117)
#' x <- rnorm(25)
#' y <- x + 1 + rnorm(25, 0, 1.8)
#' data <- data.frame(x, y)
#' paired_observed_plot(data, which_first = 1)
#' @export

paired_observed_plot <- function(data,
                                 which_first = 1) {
  if (ncol(data) < 2) {
    stop("Data should have at least two columns.")
  }

  data <- as.data.frame(data[, 1:2])
  differences <- data[, 1] - data[, 2]
  if (which_first == 2) {
    differences <- data[, 2] - data[, 1]
  }

  rg <- max(data) - min(data)
  par(mfrow = c(2, 1), mgp = c(2, .5, 0), mar = c(4, 4, 0, 0) + .1)
  plot(0, 0, "n",
    xlim = c(min(data), max(data) + 0.3 * rg),
    ylim = c(0, 5.5), yaxt = "n", xlab = "Outcomes",
    ylab = ""
  )
  for (i in 1:nrow(data)) {
    lines(c(data[i, ]), c(3, 0.5), col = "grey80")
  }
  points(data[, 1], rep(3, nrow(data)), pch = 15, col = "blue")
  points(data[, 2], rep(0.5, nrow(data)), pch = 15, col = "red")

  leg.loc <- min(data) + 0.95 * (max(data) - min(data))
  legend(leg.loc, 5.5,
    col = "white", bty = "n",
    legend = c(
      dimnames(data)[[2]][1],
      paste("Mean =", round(mean(data[, 1], na.rm = T), 3)),
      paste("SD =", round(sd(data[, 1], na.rm = T), 3))
    ),
    text.col = "blue", cex = 0.75
  )
  legend(leg.loc, 2.5,
    col = "white", bty = "n",
    legend = c(
      dimnames(data)[[2]][2],
      paste("Mean =", round(mean(data[, 2], na.rm = T), 3)),
      paste("SD =", round(sd(data[, 2], na.rm = T), 3))
    ),
    text.col = "red", cex = 0.75
  )

  freqs <- hist(differences, breaks = round(nrow(data) / 3), plot = FALSE)$counts
  hist(differences,
    xlab = paste(
      "Observed Differences (",
      dimnames(data)[[2]][1], "-",
      dimnames(data)[[2]][2], ")"
    ),
    ylab = "Frequency", col = "grey80",
    main = "", breaks = round(nrow(data) / 3),
    ylim = c(0, 1.2 * max(freqs))
  )
  legend("topleft",
    cex = 0.8,
    legend =
      c(
        paste("Mean =", round(mean(differences, na.rm = T), 3)),
        paste("SD =", round(sd(differences, na.rm = T), 3))
      ),
    col = "white", bty = "n"
  )
}



#' Simulation-based hypothesis test for a paired mean difference
#'
#' This function will run a simulation-based hypothesis test for a
#' paired mean difference or paired median difference between two
#' quantitative variables for matched pairs data.
#'
#' @param data Vector of differences or a
#'   two- or three-column data frame with values for each group in last two columns.
#' @param summary_measure Name of summary measure to return from simulations.
#'    Allowed values are
#'    `"mean"` for test of paired mean difference or
#'    `"median"` for test of paired median difference.
#'    Defaults to `"mean"`.
#' @param shift Amount to shift differences for bootstrapping of null distribution.
#' @param number_repetitions Number of simulated samples.
#' @param as_extreme_as Value of observed paired mean difference.
#' @param direction Direction of alternative hypothesis.
#'    Allowed values are `"greater"`, `"less"`, or `"two-sided"`.
#' @param which_first Which column is first in order of subtraction?
#'   `1` if subtracting second column from first (1 - 2);
#'   `2` if subtracting first column from second (2 - 1). Defaults to `1`.
#' @param add_normal Logical value indicating whether to superimpose a normal
#'   curve on the histogram. Defaults to FALSE.
#'
#' @return Returns plot of distribution of simulated statistics,
#'    with values as or more extreme than specified
#'    value highlighted, and reports
#'    proportion of simulations as or more extreme than specified
#'    as subtitle on plot.
#'
#' @examples
#' set.seed(117)
#' x <- rnorm(25)
#' y <- x + 1 + rnorm(25, 0, 1.8)
#' data <- data.frame(x, y)
#' obs_diff <- mean(x - y)
#' paired_test(data,
#'   which_first = 1,
#'   shift = -obs_diff,
#'   as_extreme_as = obs_diff,
#'   direction = "two-sided",
#'   number_repetitions = 1000
#' )
#' @export

paired_test <- function(data,
                        summary_measure = "mean",
                        which_first = 1,
                        shift = 0,
                        as_extreme_as,
                        direction = c("greater", "less", "two-sided"),
                        number_repetitions = 1,
                        add_normal = FALSE) {
  if (!(summary_measure %in% c("mean", "median"))) {
    stop("Summary measure must be either 'mean' or 'median'.")
  }
  if (!(direction %in% c("greater", "less", "two-sided"))) {
    stop("Direction must be 'greater', 'less', or 'two-sided'.")
  }
  if (is.null(as_extreme_as)) {
    stop("Must enter cutoff value for 'as_extreme_as'.")
  }
  if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
    stop("Number of repetitions must be positive and integer valued.")
  }
  if (!(which_first %in% 1:2)) {
    stop("'which_first' must equal 1 or 2 to indicate order of subtraction.")
  }
  if (is.vector(data)) {
    warning("Assuming data entered is vector of differences.")
    differences <- data
    n <- length(data)
  } else {
    if (ncol(data) < 2 | ncol(data) > 3) {
      stop("Data should have two or three variables.")
    }
    if (ncol(data) == 3) {
      data <- data[, 2:3]
    }
    differences <- data[, which_first] - data[, ifelse(which_first == 1, 2, 1)]
    n <- nrow(data)
  }

  sim_diffs <- rep(NA, number_repetitions)

  if (summary_measure == "mean") {
    obs_diff <- mean(differences)
    for (i in 1:number_repetitions) {
      sim_diffs[i] <- mean(sample(x = differences + shift, size = n, replace = TRUE))
    }
  } else {
    obs_diff <- median(differences)
    for (i in 1:number_repetitions) {
      sim_diffs[i] <- median(sample(x = differences + shift, size = n, replace = TRUE))
    }
  }

  count_extreme <- ifelse(direction == "greater", sum(sim_diffs >= as_extreme_as),
    ifelse(direction == "less", sum(sim_diffs <= as_extreme_as),
      sum(sim_diffs <= -abs(as_extreme_as)) +
        sum(sim_diffs >= abs(as_extreme_as))
    )
  )

  h <- hist(sim_diffs, plot = FALSE, breaks = "FD")
  if (direction == "two-sided") {
    cuts <- cut(h$breaks, c(-Inf, -abs(as_extreme_as), abs(as_extreme_as), Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[1]] <- "red"
    col.vec[cuts == levels(cuts)[3]] <- "red"
  } else if (direction == "greater") {
    cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[2]] <- "red"
  } else {
    cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[1]] <- "red"
  }

  range_diffs <- max(sim_diffs) - min(sim_diffs)

  ifelse(summary_measure == "mean",
    xlabel <- "Mean Difference",
    xlabel <- "Median Difference"
  )
  plot(h,
    col = col.vec, main = "",
    ylab = "",
    yaxt = "n",
    xlab = xlabel,
    xlim = c(
      min(
        min(sim_diffs),
        ifelse(direction == "two-sided",
          -abs(as_extreme_as) - range_diffs / 5,
          as_extreme_as - range_diffs / 5
        )
      ),
      max(
        max(sim_diffs),
        ifelse(direction == "two-sided",
          abs(as_extreme_as) + range_diffs / 5,
          as_extreme_as + range_diffs / 5
        )
      )
    ),
    sub = paste("Count = ",
      count_extreme,
      "/", number_repetitions, " = ",
      round(count_extreme / number_repetitions, 4),
      sep = ""
    )
  )

  if (add_normal == TRUE) {
    add_norm(h, sim_diffs)
  }

  legend("topright",
    legend =
      c(
        paste("Mean =", round(mean(sim_diffs, na.rm = T), 3)),
        paste("SD =", round(sd(sim_diffs, na.rm = T), 3))
      ),
    col = "white", bty = "n"
  )
  if (direction == "two-sided") {
    abline(v = abs(as_extreme_as), col = "red", lwd = 2)
    abline(v = -abs(as_extreme_as), col = "red", lwd = 2)
  } else {
    abline(v = as_extreme_as, col = "red", lwd = 2)
  }
}


#' Bootstrap confidence interval for a paired mean difference
#'
#' This function will create a bootstrap confidence interval for a
#' paired mean difference or paired median difference between two
#' quantitative variables for matched pairs data.
#'
#' @param data Vector of differences or a two- or three-column data frame
#'   with values for each group in last two columns.
#' @param summary_measure Name of summary measure to return from simulations.
#'    Allowed values are
#'    `"mean"` for confidence interval for paired mean difference or
#'    `"median"` for confidence interval for paired median difference.
#'    Defaults to `"mean"`.
#' @param number_repetitions Number of bootstrapped resamples.
#' @param which_first Which column is first in order of subtraction?
#'   `1` if subtracting second column from first (1 - 2);
#'   `2` if subtracting first column from second (2 - 1). Defaults to `1`.
#' @param confidence_level Confidence level for interval in decimal form.
#'   Defaults to `0.95` (95% confidence interval).
#'
#' @return Returns plot of distribution of bootstrapped statistics,
#'   with values as or more extreme than percentile confidence interval range
#'   highlighted, and reports confidence interval as subtitle on plot.
#'
#' @examples
#' set.seed(117)
#' x <- rnorm(25)
#' y <- x + 1 + rnorm(25, 0, 1.8)
#' data <- data.frame(x, y)
#' paired_bootstrap_CI(data,
#'   which_first = 1,
#'   confidence_level = 0.9,
#'   number_repetitions = 1000
#' )
#' @export

paired_bootstrap_CI <- function(data,
                                summary_measure = "mean",
                                which_first = 1,
                                confidence_level = 0.95,
                                number_repetitions = 100) {
  if (!(summary_measure %in% c("mean", "median"))) {
    stop("Summary measure must be either 'mean' or 'median'.")
  }
  if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
    stop("Number of repetitions must be positive and integer valued.")
  }
  if (confidence_level < 0 | confidence_level > 1) {
    stop("Confidence level must be given in decimal form.")
  }
  if (!(which_first %in% 1:2)) {
    stop("'which_first' must equal 1 or 2 to indicate order of subtraction.")
  }
  if (is.vector(data)) {
    warning("Assuming data entered is vector of differences.")
    differences <- data
    n <- length(data)
  } else {
    if (ncol(data) < 2 | ncol(data) > 3) {
      stop("Data should have two or three variables.")
    }
    if (ncol(data) == 3) {
      data <- data[, 2:3]
    }
    differences <- data[, which_first] - data[, ifelse(which_first == 1, 2, 1)]
    n <- nrow(data)
  }

  sim_diffs <- rep(NA, number_repetitions)

  if (summary_measure == "mean") {
    obs_diff <- mean(differences)
    for (i in 1:number_repetitions) {
      boot_samp <- sample(differences, n, replace = TRUE)
      sim_diffs[i] <- mean(boot_samp)
    }
  } else {
    obs_diff <- median(differences)
    for (i in 1:number_repetitions) {
      boot_samp <- sample(differences, n, replace = TRUE)
      sim_diffs[i] <- median(boot_samp)
    }
  }

  low_ci <- quantile(sim_diffs, (1 - confidence_level) / 2)
  high_ci <- quantile(sim_diffs, 1 - (1 - confidence_level) / 2)

  h <- hist(sim_diffs, plot = FALSE, breaks = "FD")

  cuts <- cut(h$breaks, c(-Inf, low_ci, high_ci, Inf))
  col.vec <- rep("grey80", length(cuts))
  col.vec[cuts == levels(cuts)[1]] <- "red"
  col.vec[cuts == levels(cuts)[3]] <- "red"

  break_range <- max(h$breaks) - min(h$breaks)
  ifelse(summary_measure == "mean",
    xlabel <- "Bootstrap Mean Difference",
    xlabel <- "Bootstrap Median Difference"
  )
  plot(h,
    col = col.vec, main = "",
    ylab = "",
    yaxt = "n",
    xlim = c(
      min(min(h$breaks), low_ci - break_range / 6),
      max(max(h$breaks), high_ci + break_range / 6)
    ),
    xlab = xlabel,
    sub = paste0(
      100 * confidence_level, "% CI: (",
      round(low_ci, 3), ", ", round(high_ci, 3), ")"
    )
  )
  abline(v = c(low_ci, high_ci), col = "red", lwd = 2)

  cutoff_label <- c(
    paste(round(100 * (1 - confidence_level) / 2, 1), "percentile"),
    paste(round(100 * (1 - (1 - confidence_level) / 2), 1), "percentile")
  )
  text(c(low_ci, high_ci),
    rep(max(h$counts), 2),
    labels = cutoff_label,
    pos = c(2, 4), cex = .75
  )
}


#' Simulation-based hypothesis test for a difference in proportions
#'
#' This function will run a simulation-based hypothesis test for a
#' difference in proportion of successes between two independent groups.
#'
#' @param formula Formula of the form `response ~ predictor`,
#'   where `predictor` defines the two groups of the explanatory variable and
#'   `response` is binary or a two-level categorical variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param first_in_subtraction Value of predictor variable
#'   that should be first in order of subtraction for computing
#'   difference in proportions.
#' @param response_value_numerator Value of response that corresponds
#'   to "success" when computing proportions.
#' @param number_repetitions Number of simulated samples.
#' @param as_extreme_as Value of observed difference in proportions.
#' @param direction Direction of alternative hypothesis.
#'    Allowed values are `"greater"`, `"less"`, or `"two-sided"`.
#' @param add_normal Logical value indicating whether to superimpose a normal
#'   curve on the histogram. Defaults to FALSE.
#'
#' @return Returns plot of distribution of simulated statistics,
#'    with values as or more extreme than specified
#'    value highlighted, and reports
#'    proportion of simulations as or more extreme than specified
#'    as subtitle on plot.
#'
#' @examples
#' data(pt)
#' pt$twoSeconds <- ifelse(pt$responses >= 2, "Yes", "No")
#' two_proportion_test(twoSeconds ~ brand,
#'   data = pt,
#'   first_in_subtraction = "B1",
#'   response_value_numerator = "Yes",
#'   as_extreme_as = -.4,
#'   direction = "two-sided",
#'   number_repetitions = 1000
#' )
#' @export

two_proportion_test <- function(formula,
                                data,
                                first_in_subtraction,
                                response_value_numerator,
                                as_extreme_as,
                                direction = c("greater", "less", "two-sided"),
                                number_repetitions = 1,
                                add_normal = FALSE) {
  if (!(direction %in% c("greater", "less", "two-sided"))) {
    stop("Direction must be one of 'greater',  'less', or 'two-sided'.")
  }
  if (number_repetitions < 1 | number_repetitions %% 1 != 0) {
    stop("Number of repetitions must be positive and integer valued.")
  }
  resp.name <- all.vars(formula)[1]
  pred.name <- all.vars(formula)[2]
  eval(parse(text = paste0("data$", resp.name, " = factor(data$", resp.name, ")")))
  eval(parse(text = paste0("data$", pred.name, " = factor(data$", pred.name, ")")))
  response <- eval(parse(text = paste0("data$", resp.name)))
  predictor <- eval(parse(text = paste0("data$", pred.name)))

  if (!(first_in_subtraction %in% unique(predictor))) {
    stop("First term in order of subtraction must match values in data.")
  }
  if (!(response_value_numerator %in% unique(response))) {
    stop("Value of response in numerator must match values in data.")
  }

  row.pcts <- prop.table(table(predictor, response), 1)

  obs.diff <- row.pcts[
    eval(parse(text = paste0("'", first_in_subtraction, "'"))),
    eval(parse(text = paste0("'", response_value_numerator, "'")))
  ] -
    row.pcts[
      setdiff(unique(predictor), eval(parse(text = paste0("'", first_in_subtraction, "'")))),
      eval(parse(text = paste0("'", response_value_numerator, "'")))
    ]


  sim_diffs <- rep(NA, number_repetitions)
  for (i in 1:number_repetitions) {
    newResponse <- sample(response)
    row.pcts <- prop.table(table(predictor, newResponse), 1)
    sim_diffs[i] <- row.pcts[
      eval(parse(text = paste0("'", first_in_subtraction, "'"))),
      eval(parse(text = paste0("'", response_value_numerator, "'")))
    ] -
      row.pcts[
        setdiff(unique(predictor), eval(parse(text = paste0("'", first_in_subtraction, "'")))),
        eval(parse(text = paste0("'", response_value_numerator, "'")))
      ]
  }

  count_extreme <- ifelse(direction == "greater", sum(sim_diffs >= as_extreme_as),
    ifelse(direction == "less", sum(sim_diffs <= as_extreme_as),
      sum(sim_diffs <= -abs(as_extreme_as)) +
        sum(sim_diffs >= abs(as_extreme_as))
    )
  )

  h <- hist(sim_diffs, plot = FALSE, breaks = "FD")
  if (direction == "two-sided") {
    cuts <- cut(h$breaks, c(-Inf, -abs(as_extreme_as), abs(as_extreme_as), Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[1]] <- "red"
    col.vec[cuts == levels(cuts)[3]] <- "red"
  } else if (direction == "greater") {
    cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[2]] <- "red"
  } else {
    cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[1]] <- "red"
  }
  range_diffs <- max(sim_diffs) - min(sim_diffs)
  plot(h,
    col = col.vec, main = "",
    ylab = "",
    yaxt = "n",
    xlab = "Difference in Proportions",
    xlim = c(
      min(min(sim_diffs), ifelse(direction == "two-sided",
                                 -abs(as_extreme_as) - range_diffs / 5,
                                 as_extreme_as - range_diffs / 5)),
      max(max(sim_diffs), ifelse(direction == "two-sided",
                                 abs(as_extreme_as) + range_diffs / 5,
                                 as_extreme_as + range_diffs / 5))
    ),
    sub = paste("Count = ",
      count_extreme,
      "/", number_repetitions, " = ",
      round(count_extreme / number_repetitions, 4),
      sep = ""
    )
  )

  if (add_normal == TRUE) {
    add_norm(h, sim_diffs)
  }

  legend("topright",
    legend = c(
      paste("Mean =", round(mean(sim_diffs, na.rm = T), 3)),
      paste("SD =", round(sd(sim_diffs, na.rm = T), 3))
    ),
    col = "white", bty = "n"
  )
  if (direction == "two-sided") {
    abline(v = abs(as_extreme_as), col = "red", lwd = 2)
    abline(v = -abs(as_extreme_as), col = "red", lwd = 2)
  } else {
    abline(v = as_extreme_as, col = "red", lwd = 2)
  }
}


#' Bootstrap confidence interval for a difference in proportions
#'
#' This function will create a bootstrap confidence interval for a
#' difference in proportion of successes between two independent groups.
#'
#' @param formula Formula of the form `response ~ predictor`,
#'   where `predictor` defines the two groups of the explanatory variable and
#'   `response` is binary or a two-level categorical variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param first_in_subtraction Value of predictor variable
#'   that should be first in order of subtraction for computing
#'   difference in proportions.
#' @param response_value_numerator Value of response that corresponds
#'   to "success" when computing proportions.
#' @param number_repetitions Number of bootstrapped resamples.
#' @param confidence_level Confidence level for interval in decimal form.
#'   Defaults to 0.95 (95% confidence interval).
#'
#' @return Returns plot of distribution of bootstrapped statistics,
#'   with values as or more extreme than percentile confidence interval range
#'   highlighted, and reports confidence interval as subtitle on plot.
#'
#' @examples
#' data(pt)
#' pt$twoSeconds <- ifelse(pt$responses >= 2, "Yes", "No")
#' two_proportion_bootstrap_CI(twoSeconds ~ brand,
#'   data = pt,
#'   first_in_subtraction = "B1",
#'   response_value_numerator = "Yes",
#'   confidence_level = 0.95,
#'   number_repetitions = 1000
#' )
#' @export

two_proportion_bootstrap_CI <- function(formula,
                                        data,
                                        first_in_subtraction,
                                        response_value_numerator,
                                        confidence_level = 0.95,
                                        number_repetitions = 100) {
  if (number_repetitions < 1 | number_repetitions %% 1 != 0) {
    stop("Number of repetitions must be positive and integer valued.")
  }
  resp.name <- all.vars(formula)[1]
  pred.name <- all.vars(formula)[2]
  eval(parse(text = paste0("data$", resp.name, " = factor(data$", resp.name, ")")))
  eval(parse(text = paste0("data$", pred.name, " = factor(data$", pred.name, ")")))
  response <- eval(parse(text = paste0("data$", resp.name)))
  predictor <- eval(parse(text = paste0("data$", pred.name)))

  if (!(first_in_subtraction %in% unique(predictor))) {
    stop("First term in order of subtraction must match values in data.")
  }
  if (!(response_value_numerator %in% unique(response))) {
    stop("Value of response in numerator must match values in data.")
  }
  if (confidence_level < 0 | confidence_level > 1) {
    stop("Confidence level must be given in decimal form.")
  }

  row.pcts <- prop.table(table(predictor, response), 1)

  obs.diff <- row.pcts[
    eval(parse(text = paste0("'", first_in_subtraction, "'"))),
    eval(parse(text = paste0("'", response_value_numerator, "'")))
  ] -
    row.pcts[
      setdiff(unique(predictor), eval(parse(text = paste0("'", first_in_subtraction, "'")))),
      eval(parse(text = paste0("'", response_value_numerator, "'")))
    ]

  ng1 <- sum(predictor == eval(parse(text = paste0("'", first_in_subtraction, "'"))))
  ng2 <- sum(predictor != eval(parse(text = paste0("'", first_in_subtraction, "'"))))
  sim_diffs <- rep(NA, number_repetitions)
  for (i in 1:number_repetitions) {
    newResponse <- response # sets up as factor with correct levels
    newResponse[predictor == eval(parse(text = paste0("'", first_in_subtraction, "'")))] <-
      sample(response[predictor == eval(parse(text = paste0("'", first_in_subtraction, "'")))], ng1, replace = TRUE)
    newResponse[predictor != eval(parse(text = paste0("'", first_in_subtraction, "'")))] <-
      sample(response[predictor != eval(parse(text = paste0("'", first_in_subtraction, "'")))], ng2, replace = TRUE)

    row.pcts <- prop.table(table(predictor, newResponse), 1)
    sim_diffs[i] <- row.pcts[
      eval(parse(text = paste0("'", first_in_subtraction, "'"))),
      eval(parse(text = paste0("'", response_value_numerator, "'")))
    ] -
      row.pcts[
        setdiff(unique(predictor), eval(parse(text = paste0("'", first_in_subtraction, "'")))),
        eval(parse(text = paste0("'", response_value_numerator, "'")))
      ]
  }

  low_ci <- quantile(sim_diffs, (1 - confidence_level) / 2)
  high_ci <- quantile(sim_diffs, 1 - (1 - confidence_level) / 2)

  h <- hist(sim_diffs, plot = FALSE, breaks = "FD")

  cuts <- cut(h$breaks, c(-Inf, low_ci, high_ci, Inf))
  col.vec <- rep("grey80", length(cuts))
  col.vec[cuts == levels(cuts)[1]] <- "red"
  col.vec[cuts == levels(cuts)[3]] <- "red"

  break_range <- max(h$breaks) - min(h$breaks)
  plot(h,
    col = col.vec, main = "",
    ylab = "",
    yaxt = "n",
    xlim = c(
      min(min(h$breaks), low_ci - break_range / 6),
      max(max(h$breaks), high_ci + break_range / 6)
    ),
    xlab = "Bootstrap Difference in Proportions",
    sub = paste0(
      "Mean: ", round(mean(sim_diffs), 3),
      " SD: ", round(sd(sim_diffs), 3), " ",
      100 * confidence_level, "% CI: (",
      round(low_ci, 3), ", ", round(high_ci, 3), ")"
    )
  )
  abline(v = c(low_ci, high_ci), col = "red", lwd = 2)

  cutoff_label <- c(
    paste(round(100 * (1 - confidence_level) / 2, 1), "percentile"),
    paste(round(100 * (1 - (1 - confidence_level) / 2), 1), "percentile")
  )
  text(c(low_ci, high_ci),
    rep(max(h$counts), 2),
    labels = cutoff_label,
    pos = c(2, 4), cex = .75
  )
}


#' Simulation-based hypothesis test for a difference in means
#'
#' This function will run a simulation-based hypothesis test for a
#' difference in means or difference in medians of a quantitative variable
#' for two independent groups.
#'
#' @param formula Formula of the form `response ~ predictor`,
#'   where `predictor` defines the two groups of the explanatory variable and
#'   `response` is a quantitative response variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param summary_measure Name of summary measure to return from simulations.
#'    Allowed values are
#'    `"mean"` for test of difference in means or
#'    `"median"` for test of difference in medians.
#'    Defaults to `"mean"`.
#' @param first_in_subtraction Value of predictor variable
#'   that should be first in order of subtraction for computing
#'   difference in means.
#' @param number_repetitions Number of simulated samples.
#' @param as_extreme_as Value of observed difference in means.
#' @param direction Direction of alternative hypothesis.
#'    Allowed values are `"greater"`, `"less"`, or `"two-sided"`.
#' @param add_normal Logical value indicating whether to superimpose a normal
#'   curve on the histogram. Defaults to FALSE.
#'
#' @return Returns plot of distribution of simulated statistics,
#'    with values as or more extreme than specified
#'    value highlighted, and reports
#'    proportion of simulations as or more extreme than specified
#'    as subtitle on plot.
#'
#' @examples
#' data(pt)
#' two_mean_test(responses ~ brand,
#'   data = pt,
#'   first_in_subtraction = "B1",
#'   as_extreme_as = -.4,
#'   direction = "two-sided",
#'   number_repetitions = 1000
#' )
#' @export

two_mean_test <- function(formula,
                          data,
                          summary_measure = "mean",
                          first_in_subtraction,
                          as_extreme_as,
                          direction = c("greater", "less", "two-sided"),
                          number_repetitions = 1,
                          add_normal = FALSE) {
  if (!(summary_measure %in% c("mean", "median"))) {
    stop("Summary measure must be either 'mean' or 'median'.")
  }
  if (!(direction %in% c("greater", "less", "two-sided"))) {
    stop("Direction must be 'greater', 'less', or 'two-sided'.")
  }
  if (is.null(as_extreme_as)) {
    stop("Must enter cutoff value for 'as_extreme_as'.")
  }
  if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
    stop("Number of repetitions must be positive and integer valued.")
  }

  resp.name <- all.vars(formula)[1]
  pred.name <- all.vars(formula)[2]
  eval(parse(text = paste0("data$", pred.name, " = factor(data$", pred.name, ")")))
  response <- eval(parse(text = paste0("data$", resp.name)))
  predictor <- eval(parse(text = paste0("data$", pred.name)))
  if (!(first_in_subtraction %in% unique(predictor))) {
    stop("First term in order of subtraction must match values in data.")
  }
  n <- nrow(data)

  sim_diffs <- rep(NA, number_repetitions)

  if (summary_measure == "mean") {
    obs.diff <- mean(response[predictor ==
      eval(parse(text = paste0(
        "'",
        first_in_subtraction, "'"
      )))]) -
      mean(response[predictor == setdiff(
        unique(predictor),
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))
      )])
    for (i in 1:number_repetitions) {
      newResponse <- sample(response)
      sim_diffs[i] <- mean(newResponse[predictor ==
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))]) -
        mean(newResponse[predictor == setdiff(
          unique(predictor),
          eval(parse(text = paste0(
            "'",
            first_in_subtraction, "'"
          )))
        )])
    }
  } else {
    obs.diff <- median(response[predictor ==
      eval(parse(text = paste0(
        "'",
        first_in_subtraction, "'"
      )))]) -
      median(response[predictor == setdiff(
        unique(predictor),
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))
      )])
    for (i in 1:number_repetitions) {
      newResponse <- sample(response)
      sim_diffs[i] <- median(newResponse[predictor ==
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))]) -
        median(newResponse[predictor == setdiff(
          unique(predictor),
          eval(parse(text = paste0(
            "'",
            first_in_subtraction, "'"
          )))
        )])
    }
  }

  count_extreme <- ifelse(direction == "greater", sum(sim_diffs >= as_extreme_as),
    ifelse(direction == "less", sum(sim_diffs <= as_extreme_as),
      sum(sim_diffs <= -abs(as_extreme_as)) +
        sum(sim_diffs >= abs(as_extreme_as))
    )
  )

  h <- hist(sim_diffs, plot = FALSE, breaks = "FD")
  if (direction == "two-sided") {
    cuts <- cut(h$breaks, c(-Inf, -abs(as_extreme_as), abs(as_extreme_as), Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[1]] <- "red"
    col.vec[cuts == levels(cuts)[3]] <- "red"
  } else if (direction == "greater") {
    cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[2]] <- "red"
  } else {
    cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[1]] <- "red"
  }
  range_diffs <- max(sim_diffs) - min(sim_diffs)
  ifelse(summary_measure == "mean",
    xlabel <- "Difference in Means",
    xlabel <- "Difference in Medians"
  )
  plot(h,
    col = col.vec, main = "",
    ylab = "",
    yaxt = "n",
    xlab = xlabel,
    xlim = c(
      min(
        min(sim_diffs),
        ifelse(direction == "two-sided",
          -abs(as_extreme_as) - range_diffs / 5,
          as_extreme_as - range_diffs / 5
        )
      ),
      max(
        max(sim_diffs),
        ifelse(direction == "two-sided",
          abs(as_extreme_as) + range_diffs / 5,
          as_extreme_as + range_diffs / 5
        )
      )
    ),
    sub = paste("Count = ",
      count_extreme,
      "/", number_repetitions, " = ",
      round(count_extreme / number_repetitions, 4),
      sep = ""
    )
  )

  if (add_normal == TRUE) {
    add_norm(h, sim_diffs)
  }

  legend("topright",
    legend = c(
      paste("Mean =", round(mean(sim_diffs, na.rm = T), 3)),
      paste("SD =", round(sd(sim_diffs, na.rm = T), 3))
    ),
    col = "white", bty = "n"
  )
  if (direction == "two-sided") {
    abline(v = abs(as_extreme_as), col = "red", lwd = 2)
    abline(v = -abs(as_extreme_as), col = "red", lwd = 2)
  } else {
    abline(v = as_extreme_as, col = "red", lwd = 2)
  }
}


#' Bootstrap confidence interval for a difference in means
#'
#' This function will create a bootstrap confidence interval for a
#' difference in means or difference in medians of a quantitative variable
#' for two independent groups.
#'
#' @param formula Formula of the form `response ~ predictor`,
#'   where `predictor` defines the two groups of the explanatory variable and
#'   `response` is a quantitative response variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param summary_measure Name of summary measure to return from simulations.
#'    Allowed values are
#'    `"mean"` for confidence interval for difference in means
#'    `"median"` for confidence interval for difference in medians.
#'    Defaults to `"mean"`.
#' @param first_in_subtraction Value of predictor variable
#'   that should be first in order of subtraction for computing
#'   difference in means.
#' @param number_repetitions Number of bootstrapped resamples.
#' @param confidence_level Confidence level for interval in decimal form.
#'   Defaults to 0.95 (95% confidence interval).
#'
#' @return Returns plot of distribution of bootstrapped statistics,
#'   with values as or more extreme than percentile confidence interval range
#'   highlighted, and reports confidence interval as subtitle on plot.
#'
#' @examples
#' data(pt)
#' two_mean_bootstrap_CI(responses ~ brand,
#'   data = pt,
#'   first_in_subtraction = "B1",
#'   confidence_level = 0.98,
#'   number_repetitions = 1000
#' )
#' @export

two_mean_bootstrap_CI <- function(formula,
                                  data,
                                  summary_measure = "mean",
                                  first_in_subtraction,
                                  confidence_level = 0.95,
                                  number_repetitions = 100) {
  if (!(summary_measure %in% c("mean", "median"))) {
    stop("Summary measure must be either 'mean' or 'median'.")
  }
  if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
    stop("Number of repetitions must be positive and integer valued.")
  }
  resp.name <- all.vars(formula)[1]
  pred.name <- all.vars(formula)[2]
  eval(parse(text = paste0("data$", pred.name, " = factor(data$", pred.name, ")")))
  response <- eval(parse(text = paste0("data$", resp.name)))
  predictor <- eval(parse(text = paste0("data$", pred.name)))
  if (!(first_in_subtraction %in% unique(predictor))) {
    stop("First term in order of subtraction must match values in data.")
  }
  if (confidence_level < 0 | confidence_level > 1) {
    stop("Confidence level must be given in decimal form.")
  }
  n <- nrow(data)

  ng1 <- sum(predictor == eval(parse(text = paste0("'", first_in_subtraction, "'"))))
  ng2 <- sum(predictor != eval(parse(text = paste0("'", first_in_subtraction, "'"))))
  sim_diffs <- rep(NA, number_repetitions)

  if (summary_measure == "mean") {
    obs.diff <- mean(response[predictor ==
      eval(parse(text = paste0(
        "'",
        first_in_subtraction, "'"
      )))]) -
      mean(response[predictor == setdiff(
        unique(predictor),
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))
      )])
    for (i in 1:number_repetitions) {
      newResponse <- rep(NA, length(response))
      newResponse[predictor ==
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))] <-
        sample(response[predictor ==
          eval(parse(text = paste0(
            "'",
            first_in_subtraction, "'"
          )))],
        ng1,
        replace = TRUE
        )
      newResponse[predictor !=
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))] <-
        sample(response[predictor !=
          eval(parse(text = paste0(
            "'",
            first_in_subtraction, "'"
          )))],
        ng2,
        replace = TRUE
        )
      sim_diffs[i] <- mean(newResponse[predictor ==
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))]) -
        mean(newResponse[predictor == setdiff(
          unique(predictor),
          eval(parse(text = paste0(
            "'",
            first_in_subtraction, "'"
          )))
        )])
    }
  } else {
    obs.diff <- median(response[predictor ==
      eval(parse(text = paste0(
        "'",
        first_in_subtraction, "'"
      )))]) -
      median(response[predictor == setdiff(
        unique(predictor),
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))
      )])
    for (i in 1:number_repetitions) {
      newResponse <- rep(NA, length(response))
      newResponse[predictor ==
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))] <-
        sample(response[predictor ==
          eval(parse(text = paste0(
            "'",
            first_in_subtraction, "'"
          )))],
        ng1,
        replace = TRUE
        )
      newResponse[predictor !=
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))] <-
        sample(response[predictor !=
          eval(parse(text = paste0(
            "'",
            first_in_subtraction, "'"
          )))],
        ng2,
        replace = TRUE
        )
      sim_diffs[i] <- median(newResponse[predictor ==
        eval(parse(text = paste0(
          "'",
          first_in_subtraction, "'"
        )))]) -
        median(newResponse[predictor == setdiff(
          unique(predictor),
          eval(parse(text = paste0(
            "'",
            first_in_subtraction, "'"
          )))
        )])
    }
  }

  low_ci <- quantile(sim_diffs, (1 - confidence_level) / 2)
  high_ci <- quantile(sim_diffs, 1 - (1 - confidence_level) / 2)

  h <- hist(sim_diffs, plot = FALSE, breaks = "FD")

  cuts <- cut(h$breaks, c(-Inf, low_ci, high_ci, Inf))
  col.vec <- rep("grey80", length(cuts))
  col.vec[cuts == levels(cuts)[1]] <- "red"
  col.vec[cuts == levels(cuts)[3]] <- "red"

  break_range <- max(h$breaks) - min(h$breaks)
  ifelse(summary_measure == "mean",
    xlabel <- "Bootstrap Difference in Means",
    xlabel <- "Bootstrap Difference in Medians"
  )
  plot(h,
    col = col.vec, main = "",
    ylab = "",
    yaxt = "n",
    xlim = c(
      min(min(h$breaks), low_ci - break_range / 6),
      max(max(h$breaks), high_ci + break_range / 6)
    ),
    xlab = xlabel,
    sub = paste0(
      100 * confidence_level, "% CI: (",
      round(low_ci, 3), ", ", round(high_ci, 3), ")"
    )
  )
  abline(v = c(low_ci, high_ci), col = "red", lwd = 2)

  cutoff_label <- c(
    paste(round(100 * (1 - confidence_level) / 2, 1), "percentile"),
    paste(round(100 * (1 - (1 - confidence_level) / 2), 1), "percentile")
  )
  text(c(low_ci, high_ci),
    rep(max(h$counts), 2),
    labels = cutoff_label,
    pos = c(2, 4), cex = .75
  )
}

#' Simulation-based hypothesis test for regression
#'
#' This function will run a simulation-based hypothesis test for a
#' slope of simple linear regression or correlation
#' between two quantitative variables.
#'
#' @param formula Formula of the form `response ~ predictor`,
#'   where `predictor` defines a quantitative explanatory variable and
#'   `response` is a quantitative response variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param summary_measure Name of summary measure to return from simulations.
#'    Allowed values are
#'    `"slope"` for test of slope or
#'    `"correlation"` for test of correlation.
#' @param as_extreme_as Value of observed slope or correlation.
#' @param direction Direction of alternative hypothesis.
#'    Allowed values are `"greater"`, `"less"`, or `"two-sided"`.
#' @param number_repetitions Number of simulated samples.
#' @param add_normal Logical value indicating whether to superimpose a normal
#'   curve on the histogram. Defaults to FALSE.
#'
#' @examples
#' data(mtfires)
#' mtfires$logHect <- log(mtfires$Hectares)
#' regression_test(logHect ~ Temperature,
#'   data = mtfires,
#'   summary_measure = "correlation",
#'   as_extreme_as = 1.388,
#'   direction = "greater",
#'   number_repetitions = 1000
#' )
#' @export

regression_test <- function(formula,
                            data,
                            summary_measure = c("slope", "correlation"),
                            as_extreme_as,
                            direction = c("greater", "less", "two-sided"),
                            number_repetitions = 1,
                            add_normal = FALSE) {
  if (!(summary_measure %in% c("slope", "correlation"))) {
    stop("Summary measure must be either 'slope' or 'correlation'.")
  }
  if (!(direction %in% c("greater", "less", "two-sided"))) {
    stop("Direction must be 'greater', 'less', or 'two-sided'.")
  }
  if (is.null(as_extreme_as)) {
    stop("Must enter cutoff value for 'as_extreme_as'.")
  }
  if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
    stop("Number of repetitions must be positive and integer valued.")
  }

  n <- nrow(data)

  resp.name <- all.vars(formula)[1]
  pred.name <- all.vars(formula)[2]
  response <- eval(parse(text = paste0("data$", resp.name)))
  predictor <- eval(parse(text = paste0("data$", pred.name)))

  obs.fit <- lm(formula, data = data)
  obs.stat <- obs.fit$coef[2]

  sim_vals <- rep(NA, number_repetitions)
  for (i in 1:number_repetitions) {
    newResponse <- sample(response)
    sim.fit <- lm(newResponse ~ predictor)
    sim_vals[i] <- sim.fit$coef[2]
  }

  if (summary_measure == "correlation") {
    sdRatio <- sd(response) / sd(predictor)
    sim_vals <- sim_vals / sdRatio
    obs.stat <- obs.stat / sdRatio
  }

  count_extreme <- ifelse(direction == "greater",
    sum(sim_vals >= as_extreme_as),
    ifelse(direction == "less",
      sum(sim_vals <= as_extreme_as),
      sum(sim_vals <= -abs(as_extreme_as)) +
        sum(sim_vals >= abs(as_extreme_as))
    )
  )

  h <- hist(sim_vals, plot = FALSE, breaks = "FD")
  if (direction == "two-sided") {
    cuts <- cut(h$breaks, c(-Inf, -abs(as_extreme_as), abs(as_extreme_as), Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[1]] <- "red"
    col.vec[cuts == levels(cuts)[3]] <- "red"
  } else if (direction == "greater") {
    cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[2]] <- "red"
  } else {
    cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
    col.vec <- rep("grey80", length(cuts))
    col.vec[cuts == levels(cuts)[1]] <- "red"
  }
  range_diffs <- max(sim_vals) - min(sim_vals)
  ifelse(summary_measure == "correlation",
    xlabel <- "Correlation",
    xlabel <- "Slope"
  )
  plot(h,
    col = col.vec, main = "",
    ylab = "",
    yaxt = "n",
    xlab = xlabel,
    xlim = c(
      min(
        min(sim_vals),
        ifelse(direction == "two-sided",
          -abs(as_extreme_as) - range_diffs / 5,
          as_extreme_as - range_diffs / 5
        )
      ),
      max(
        max(sim_vals),
        ifelse(direction == "two-sided",
          abs(as_extreme_as) + range_diffs / 5,
          as_extreme_as + range_diffs / 5
        )
      )
    ),
    sub = paste("Count = ",
      count_extreme,
      "/", number_repetitions, " = ",
      round(count_extreme / number_repetitions, 4),
      sep = ""
    )
  )

  if (add_normal == TRUE) {
    add_norm(h, sim_vals)
  }

  legend("topright",
    legend = c(
      paste("Mean =", round(mean(sim_vals, na.rm = T), 3)),
      paste("SD =", round(sd(sim_vals, na.rm = T), 3))
    ),
    col = "white", bty = "n"
  )
  if (direction == "two-sided") {
    abline(v = abs(as_extreme_as), col = "red", lwd = 2)
    abline(v = -abs(as_extreme_as), col = "red", lwd = 2)
  } else {
    abline(v = as_extreme_as, col = "red", lwd = 2)
  }
}


#' Bootstrap confidence interval for regression
#'
#' This function will create a bootstrap confidence interval for a
#' slope of simple linear regression or correlation
#' between two quantitative variables.
#'
#' @param formula Formula of the form `response ~ predictor`,
#'   where `predictor` defines a quantitative explanatory variable and
#'   `response` is a quantitative response variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param summary_measure Name of summary measure to return from simulations.
#'    Allowed values are
#'    `"slope"` for confidence interval for slope or
#'    `"correlation"` for confidence interval for correlation.
#' @param confidence_level Confidence level for interval in decimal form.
#'   Defaults to 0.95 (95% confidence interval).
#' @param number_repetitions Number of bootstrapped resamples.
#'
#' @examples
#' data(mtfires)
#' mtfires$logHect <- log(mtfires$Hectares)
#' regression_bootstrap_CI(logHect ~ Temperature,
#'   data = mtfires,
#'   summary_measure = "correlation",
#'   confidence_level = 0.9,
#'   number_repetitions = 1000
#' )
#' @export

regression_bootstrap_CI <- function(formula,
                                    data,
                                    summary_measure = c("slope", "correlation"),
                                    confidence_level = 0.95,
                                    number_repetitions = 100) {
  if (!(summary_measure %in% c("slope", "correlation"))) {
    stop("Summary measure must be either 'slope' or 'correlation'.")
  }
  if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
    stop("Number of repetitions must be positive and integer valued.")
  }
  if (confidence_level < 0 | confidence_level > 1) {
    stop("Confidence level must be given in decimal form.")
  }

  n <- nrow(data)

  resp.name <- all.vars(formula)[1]
  pred.name <- all.vars(formula)[2]
  response <- eval(parse(text = paste0("data$", resp.name)))
  predictor <- eval(parse(text = paste0("data$", pred.name)))

  sim_vals <- rep(NA, number_repetitions)
  for (i in 1:number_repetitions) {
    sample_obs <- sample(n, n, replace = TRUE)
    newResponse <- response[sample_obs]
    newPred <- predictor[sample_obs]
    sim.fit <- lm(newResponse ~ newPred)
    sim_vals[i] <- sim.fit$coef[2]
  }

  if (summary_measure == "correlation") {
    sdRatio <- sd(response) / sd(predictor)
    sim_vals <- sim_vals / sdRatio
  }

  low_ci <- quantile(sim_vals, (1 - confidence_level) / 2)
  high_ci <- quantile(sim_vals, 1 - (1 - confidence_level) / 2)

  h <- hist(sim_vals, plot = FALSE, breaks = "FD")

  cuts <- cut(h$breaks, c(-Inf, low_ci, high_ci, Inf))
  col.vec <- rep("grey80", length(cuts))
  col.vec[cuts == levels(cuts)[1]] <- "red"
  col.vec[cuts == levels(cuts)[3]] <- "red"

  break_range <- max(h$breaks) - min(h$breaks)
  ifelse(summary_measure == "correlation",
    xlabel <- "Bootstrap Correlation",
    xlabel <- "Bootstrap Slope"
  )
  plot(h,
    col = col.vec, main = "",
    ylab = "",
    yaxt = "n",
    xlim = c(
      min(min(h$breaks), low_ci - break_range / 6),
      max(max(h$breaks), high_ci + break_range / 6)
    ),
    xlab = xlabel,
    sub = paste0(
      100 * confidence_level, "% CI: (",
      round(low_ci, 3), ", ", round(high_ci, 3), ")"
    )
  )
  abline(v = c(low_ci, high_ci), col = "red", lwd = 2)

  cutoff_label <- c(
    paste(round(100 * (1 - confidence_level) / 2, 1), "percentile"),
    paste(round(100 * (1 - (1 - confidence_level) / 2), 1), "percentile")
  )
  text(c(low_ci, high_ci),
    rep(max(h$counts), 2),
    labels = cutoff_label,
    pos = c(2, 4), cex = .75
  )
}
greenwood-stat/catstats documentation built on Aug. 1, 2022, 2:04 p.m.