#' Add normal curve to histogram
#'
#' This function will add a normal curve to a frequency or density histogram
#' using the raw data and histogram object as arguments.
#'
#' @param h Histogram object generated by \code{\link[graphics:hist]{hist}}.
#' @param data Vector of values used to generate the histogram.
#' @param freq Logical value indicating whether y-axis of
#' histogram is frequency. Defaults to TRUE.
#' @param na.rm Logical value indicating whether NA values should
#' be stripped before the computation proceeds. Defaults to TRUE.
#' @param ... Other \code{\link[graphics:par]{graphical parameters}}.
#'
#' @examples
#' x <- rnorm(30, 5, 2)
#' hi <- hist(x)
#' add_norm(h = hi, data = x)
#' @export
add_norm <- function(h, data, freq = TRUE, na.rm = FALSE,
lty = 1, lwd = 2, col = "purple", ...) {
x <- seq(min(data), max(data), length = 200)
y <- dnorm(x, mean(data, na.rm = na.rm), sd(data, na.rm = na.rm))
if (freq == FALSE) {
lines(x, y, lty = lty, lwd = lwd, col = col, ...)
} else {
y <- y * diff(h$mids[1:2]) * length(data)
lines(x, y, lty = lty, lwd = lwd, col = col, ...)
}
}
#' Simulation-based hypothesis test for one proportion
#'
#' This function will run a simulation-based hypothesis test for a
#' single proportion of successes.
#'
#' @param probability_success Value between 0 and 1
#' representing the null hypothesis value for proportion.
#' @param sample_size Number of trials used to compute proportion.
#' @param number_repetitions Number of simulated samples.
#' @param summary_measure Name of summary measure to return from simulations.
#' Allowed values are
#' `"number"` for number of successes or
#' `"proportion"` for proportion of successes.
#' @param as_extreme_as Value of observed statistic.
#' Between 0 and 1 if `summary_measure` is `"proportion"`;
#' an integer between 1 and `sample_size` if `summary_measure` is `"number"`.
#' @param direction Direction of alternative hypothesis.
#' Allowed values are `"greater"`, `"less"`, or `"two-sided"`.
#' @param add_normal Logical value indicating whether to superimpose a normal
#' curve on the histogram. Defaults to FALSE.
#'
#' @return Returns plot of distribution of simulated statistics,
#' with values as or more extreme than specified
#' value highlighted, and reports
#' proportion of simulations as or more extreme than specified
#' as subtitle on plot.
#'
#' @examples
#' one_proportion_test(
#' probability_success = 0.5,
#' sample_size = 150,
#' summary_measure = "proportion",
#' as_extreme_as = 0.65,
#' direction = "greater",
#' number_repetitions = 1000
#' )
#' @export
one_proportion_test <- function(probability_success = 0.5,
sample_size = 5,
summary_measure = c("number", "proportion"),
as_extreme_as,
direction = c("greater", "less", "two-sided"),
number_repetitions = 1,
add_normal = FALSE) {
if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
stop("Number of repetitions must be positive and integer valued.")
}
if (sample_size < 1 | !(sample_size %% 1 == 0)) {
stop("Sample size must be positive and integer valued.")
}
if (probability_success < 0 | probability_success > 1) {
stop("Probability of success must be between zero and one.")
}
if (!(direction %in% c("greater", "less", "two-sided"))) {
stop("Direction must be one of 'greater', 'less', or 'two-sided'.")
}
if (!(summary_measure %in% c("number", "proportion"))) {
stop("Summary measure must be either
'number' or 'proportion' of successes.")
}
if (is.null(as_extreme_as)) {
stop("Must enter cutoff value for 'as_extreme_as'.")
}
# Simulate data
number_heads <- rbinom(
number_repetitions,
sample_size,
probability_success
)
range_heads <- max(number_heads) - min(number_heads)
success_tab <- table(number_heads)
# Set plot margins
original_mar <- par()$mar
par(mar = c(5, 3, 3, 5))
# Not two-sided:
if (!(direction == "two-sided")) {
# Calculate p-value
if (summary_measure == "number") {
proportion_extreme <- ifelse(direction == "greater",
mean(number_heads >= as_extreme_as),
mean(number_heads <= as_extreme_as)
)
} else {
proportion_extreme <- ifelse(direction == "greater",
mean(number_heads / sample_size >= as_extreme_as),
mean(number_heads / sample_size <= as_extreme_as)
)
}
# Plot
if (summary_measure == "number") { # Number of successes; not two-sided
plot(0, 0, "n",
frame.plot = FALSE,
xlim = c(
min(
min(number_heads),
as_extreme_as - range_heads / 5
),
max(
max(number_heads),
as_extreme_as + range_heads / 5
)
),
ylim = c(0, max(success_tab) + 3),
xlab = "Number of Successes",
ylab = "",
yaxt = "n",
sub = paste("Proportion of Samples = ",
proportion_extreme * number_repetitions,
"/", number_repetitions, " = ",
round(proportion_extreme, 4),
sep = ""
)
)
for (i in 1:length(success_tab)) {
lines(rep(names(success_tab)[i], 2),
c(0, success_tab[i]),
lwd = 5,
col = ifelse(direction == "greater",
ifelse(as.numeric(names(success_tab[i])) >=
as_extreme_as, "blue", "black"),
ifelse(as.numeric(names(success_tab[i])) <=
as_extreme_as, "blue", "black")
)
)
}
abline(
v = ifelse(direction == "greater",
as_extreme_as - 0.5, as_extreme_as + 0.5
),
col = "blue", lwd = 2
)
cutoff_label <- ifelse(direction == "greater",
paste(">=", as_extreme_as),
paste("<=", as_extreme_as)
)
text(ifelse(direction == "greater",
as_extreme_as - 0.25, as_extreme_as + 0.25
),
max(success_tab) + 3,
labels = cutoff_label,
pos = ifelse(direction == "greater", 4, 2)
)
mtext(paste(
"Mean =",
round(mean(number_heads, na.rm = T), 3), "\n",
"SD =",
round(sd(number_heads, na.rm = T), 3)
),
line = -3, side = 4, las = 2
)
} else { # Proportion of successes; not two-sided
plot(0, 0, "n",
frame.plot = FALSE,
xlim = c(
min(
min(number_heads) / sample_size,
as_extreme_as - range_heads / (5 * sample_size)
),
max(
max(number_heads) / sample_size,
as_extreme_as + range_heads / (5 * sample_size)
)
),
ylim = c(0, max(success_tab) + 3),
xlab = "Proportion of Successes",
ylab = "",
yaxt = "n",
sub = paste("Proportion of Samples = ",
proportion_extreme * number_repetitions,
"/", number_repetitions, " = ",
round(proportion_extreme, 4),
sep = ""
)
)
for (i in 1:length(success_tab)) {
lines(rep(as.numeric(names(success_tab)[i]) / sample_size, 2),
c(0, success_tab[i]),
lwd = 5,
col = ifelse(direction == "greater",
ifelse(as.numeric(names(success_tab[i])) / sample_size
>= as_extreme_as, "blue", "black"),
ifelse(as.numeric(names(success_tab[i])) / sample_size
<= as_extreme_as, "blue", "black")
)
)
}
abline(
v = ifelse(direction == "greater",
as_extreme_as - 0.5 / sample_size,
as_extreme_as + 0.5 / sample_size
),
col = "blue", lwd = 2
)
cutoff_label <- ifelse(direction == "greater",
paste(">=", as_extreme_as),
paste("<=", as_extreme_as)
)
text(ifelse(direction == "greater",
as_extreme_as - 0.25 / sample_size,
as_extreme_as + 0.25 / sample_size
),
max(success_tab) + 3,
labels = cutoff_label,
pos = ifelse(direction == "greater", 4, 2)
)
mtext(paste(
"Mean =",
round(mean(number_heads / sample_size, na.rm = T), 3), "\n",
"SD =",
round(sd(number_heads / sample_size, na.rm = T), 3)
),
line = -3, side = 4, las = 2
)
}
} else if (summary_measure == "number") { # Number of successes; two-sided
if (as_extreme_as > probability_success * sample_size) {
upper <- as_extreme_as
lower <- max(0, 2 * probability_success * sample_size - as_extreme_as)
} else {
lower <- as_extreme_as
upper <- min(sample_size, 2 * probability_success * sample_size - as_extreme_as)
}
proportion_extreme <- mean(number_heads <= lower | number_heads >= upper)
plot(0, 0, "n",
frame.plot = FALSE,
xlim = c(
min(min(number_heads), lower - range_heads / 5),
max(max(number_heads), upper + range_heads / 5)
),
ylim = c(0, max(success_tab) + 3),
xlab = "Number of Successes",
ylab = "",
yaxt = "n",
sub = paste("Proportion of Samples = ",
proportion_extreme * number_repetitions,
"/", number_repetitions, " = ",
round(proportion_extreme, 4),
sep = ""
)
)
for (i in 1:length(success_tab)) {
lines(rep(names(success_tab)[i], 2), c(0, success_tab[i]),
lwd = 5,
col = ifelse(as.numeric(names(success_tab[i])) <= lower |
as.numeric(names(success_tab[i])) >= upper,
"blue", "black"
)
)
}
abline(
v = c(lower + 0.5, upper - 0.5),
col = "blue", lwd = 2
)
cutoff_label <- c(paste("<=", lower), paste(">=", upper))
text(c(lower - 0.25, upper + 0.25),
rep(max(success_tab) + 3, 2),
labels = cutoff_label,
pos = c(4, 2)
)
mtext(paste(
"Mean =",
round(mean(number_heads, na.rm = T), 3), "\n",
"SD =",
round(sd(number_heads, na.rm = T), 3)
),
line = -3, side = 4, las = 2
)
} else { # Proportion of successes; two-sided
if (as_extreme_as > probability_success) {
upper <- as_extreme_as
lower <- max(0, 2 * probability_success - as_extreme_as)
} else {
lower <- as_extreme_as
upper <- min(1, 2 * probability_success - as_extreme_as)
}
proportion_extreme <- mean(number_heads <= lower * sample_size |
number_heads >= upper * sample_size)
plot(0, 0, "n",
frame.plot = FALSE,
xlim = c(
min(
min(number_heads) / sample_size,
lower - range_heads / (5 * sample_size)
),
max(
max(number_heads) / sample_size,
upper + range_heads / (5 * sample_size)
)
),
ylim = c(0, max(success_tab) + 3),
xlab = "Proportion of Successes",
ylab = "",
yaxt = "n",
sub = paste("Proportion of Samples = ",
proportion_extreme * number_repetitions,
"/", number_repetitions, " = ",
round(proportion_extreme, 4),
sep = ""
)
)
for (i in 1:length(success_tab)) {
lines(rep(as.numeric(names(success_tab)[i]) / sample_size, 2),
c(0, success_tab[i]),
lwd = 5,
col = ifelse(as.numeric(names(success_tab[i])) / sample_size <= lower |
as.numeric(names(success_tab[i])) / sample_size >= upper,
"blue", "black"
)
)
}
abline(
v = c(lower + 0.5 / sample_size, upper - 0.5 / sample_size),
col = "blue", lwd = 2
)
cutoff_label <- c(paste("<=", lower), paste(">=", upper))
text(c(lower + 0.25 / sample_size, upper - 0.25 / sample_size),
rep(max(success_tab) + 3, 2),
labels = cutoff_label,
pos = c(2, 4)
)
mtext(paste(
"Mean =",
round(mean(number_heads / sample_size, na.rm = T), 3), "\n",
"SD =",
round(sd(number_heads / sample_size, na.rm = T), 3)
),
line = -3, side = 4, las = 2
)
}
# Add normal curve
if (add_normal == TRUE) {
if (summary_measure == "number") {
h <- hist(number_heads,
breaks = seq(min(number_heads), max(number_heads), by = 1),
plot = FALSE
)
add_norm(h, number_heads)
} else {
p <- number_heads / sample_size
h <- hist(p,
breaks = seq(min(p), max(p), by = 1 / sample_size),
plot = FALSE
)
add_norm(h, p)
}
}
# Set plot margins back
par(mar = original_mar)
}
#' Bootstrap confidence interval for one proportion
#'
#' This function will create a bootstrap confidence interval for a
#' single proportion of successes.
#'
#' @param sample_size Number of trials used to compute proportion.
#' @param number_successes How many successes were observed in those trials?
#' @param number_repetitions Number of bootstrapped resamples.
#' @param confidence_level Confidence level for interval in decimal form.
#' Defaults to 0.95 (95% confidence interval).
#'
#' @return Returns plot of distribution of bootstrapped statistics,
#' with values as or more extreme than percentile confidence interval range
#' highlighted, and reports confidence interval as subtitle on plot.
#'
#' @examples
#' one_proportion_bootstrap_CI(
#' sample_size = 150,
#' number_successes = 98,
#' confidence_level = 0.99,
#' number_repetitions = 1000
#' )
#' @export
one_proportion_bootstrap_CI <- function(sample_size,
number_successes,
confidence_level = 0.95,
number_repetitions = 100) {
if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
stop("Number of repetitions must be positive and integer valued.")
}
if (number_successes < 1 | !(number_successes %% 1 == 0)) {
stop("Number of successes must be positive and integer valued.")
}
if (sample_size < 1 | !(sample_size %% 1 == 0)) {
stop("Sample size must be positive and integer valued.")
}
if (confidence_level < 0 | confidence_level > 1) {
stop("Confidence level must be given in decimal form.")
}
sim_pop <- rep(0, sample_size)
sim_pop[1:number_successes] <- 1
sim_props <- rep(NA, number_repetitions)
for (i in 1:number_repetitions) {
sim_props[i] <- mean(sample(sim_pop, sample_size, replace = TRUE))
}
success_tab <- table(sim_props)
low_ci <- quantile(sim_props, (1 - confidence_level) / 2)
high_ci <- quantile(sim_props, 1 - (1 - confidence_level) / 2)
plot(0, 0, "n",
frame.plot = FALSE,
xlim = c(
min(
min(sim_props),
low_ci - (max(sim_props) - min(sim_props)) / 5
),
max(
max(sim_props),
high_ci + (max(sim_props) - min(sim_props)) / 5
)
),
ylim = c(0, max(success_tab) + 3),
xlab = "Bootstrapped Proportions",
ylab = "",
yaxt = "n",
sub = paste0(
"Mean: ", round(mean(sim_props), 3), ", SD: ",
round(sd(sim_props), 3), ", ", 100 * confidence_level, "% CI: (",
round(low_ci, 3), ", ", round(high_ci, 3), ")"
),
cex.sub = 0.75
)
for (i in 1:length(success_tab)) {
lines(rep(as.numeric(names(success_tab)[i]), 2), c(0, success_tab[i]),
lwd = 5,
col = ifelse(as.numeric(names(success_tab[i])) <= low_ci |
as.numeric(names(success_tab[i])) >= high_ci, "blue", "black")
)
}
abline(
v = c(low_ci, high_ci),
col = "blue", lwd = 2
)
cutoff_label <- c(
paste(round(100 * (1 - confidence_level) / 2, 1), "percentile"),
paste(round(100 * (1 - (1 - confidence_level) / 2), 1), "percentile")
)
text(c(low_ci, high_ci),
rep(max(success_tab) + 3, 2),
labels = cutoff_label,
pos = c(2, 4), cex = .75
)
}
#' Plots for observed matched pairs data
#'
#' This function will create two plots of a quantitative variable
#' and its differences for matched pairs data.
#'
#' @param data Data frame
#' with values for each group in the first two columns.
#' @param which_first Which column is first in order of subtraction?
#' `1` if subtracting second column from first (1 - 2);
#' `2` if subtracting first column from second (2 - 1). Defaults to `1`.
#'
#' @return Returns plot of distribution of observed values,
#' with means & standard deviations in groups and paired observations
#' linked by a line segment, and histogram of differences.
#'
#' @examples
#' set.seed(117)
#' x <- rnorm(25)
#' y <- x + 1 + rnorm(25, 0, 1.8)
#' data <- data.frame(x, y)
#' paired_observed_plot(data, which_first = 1)
#' @export
paired_observed_plot <- function(data,
which_first = 1) {
if (ncol(data) < 2) {
stop("Data should have at least two columns.")
}
data <- as.data.frame(data[, 1:2])
differences <- data[, 1] - data[, 2]
if (which_first == 2) {
differences <- data[, 2] - data[, 1]
}
rg <- max(data) - min(data)
par(mfrow = c(2, 1), mgp = c(2, .5, 0), mar = c(4, 4, 0, 0) + .1)
plot(0, 0, "n",
xlim = c(min(data), max(data) + 0.3 * rg),
ylim = c(0, 5.5), yaxt = "n", xlab = "Outcomes",
ylab = ""
)
for (i in 1:nrow(data)) {
lines(c(data[i, ]), c(3, 0.5), col = "grey80")
}
points(data[, 1], rep(3, nrow(data)), pch = 15, col = "blue")
points(data[, 2], rep(0.5, nrow(data)), pch = 15, col = "red")
leg.loc <- min(data) + 0.95 * (max(data) - min(data))
legend(leg.loc, 5.5,
col = "white", bty = "n",
legend = c(
dimnames(data)[[2]][1],
paste("Mean =", round(mean(data[, 1], na.rm = T), 3)),
paste("SD =", round(sd(data[, 1], na.rm = T), 3))
),
text.col = "blue", cex = 0.75
)
legend(leg.loc, 2.5,
col = "white", bty = "n",
legend = c(
dimnames(data)[[2]][2],
paste("Mean =", round(mean(data[, 2], na.rm = T), 3)),
paste("SD =", round(sd(data[, 2], na.rm = T), 3))
),
text.col = "red", cex = 0.75
)
freqs <- hist(differences, breaks = round(nrow(data) / 3), plot = FALSE)$counts
hist(differences,
xlab = paste(
"Observed Differences (",
dimnames(data)[[2]][1], "-",
dimnames(data)[[2]][2], ")"
),
ylab = "Frequency", col = "grey80",
main = "", breaks = round(nrow(data) / 3),
ylim = c(0, 1.2 * max(freqs))
)
legend("topleft",
cex = 0.8,
legend =
c(
paste("Mean =", round(mean(differences, na.rm = T), 3)),
paste("SD =", round(sd(differences, na.rm = T), 3))
),
col = "white", bty = "n"
)
}
#' Simulation-based hypothesis test for a paired mean difference
#'
#' This function will run a simulation-based hypothesis test for a
#' paired mean difference or paired median difference between two
#' quantitative variables for matched pairs data.
#'
#' @param data Vector of differences or a
#' two- or three-column data frame with values for each group in last two columns.
#' @param summary_measure Name of summary measure to return from simulations.
#' Allowed values are
#' `"mean"` for test of paired mean difference or
#' `"median"` for test of paired median difference.
#' Defaults to `"mean"`.
#' @param shift Amount to shift differences for bootstrapping of null distribution.
#' @param number_repetitions Number of simulated samples.
#' @param as_extreme_as Value of observed paired mean difference.
#' @param direction Direction of alternative hypothesis.
#' Allowed values are `"greater"`, `"less"`, or `"two-sided"`.
#' @param which_first Which column is first in order of subtraction?
#' `1` if subtracting second column from first (1 - 2);
#' `2` if subtracting first column from second (2 - 1). Defaults to `1`.
#' @param add_normal Logical value indicating whether to superimpose a normal
#' curve on the histogram. Defaults to FALSE.
#'
#' @return Returns plot of distribution of simulated statistics,
#' with values as or more extreme than specified
#' value highlighted, and reports
#' proportion of simulations as or more extreme than specified
#' as subtitle on plot.
#'
#' @examples
#' set.seed(117)
#' x <- rnorm(25)
#' y <- x + 1 + rnorm(25, 0, 1.8)
#' data <- data.frame(x, y)
#' obs_diff <- mean(x - y)
#' paired_test(data,
#' which_first = 1,
#' shift = -obs_diff,
#' as_extreme_as = obs_diff,
#' direction = "two-sided",
#' number_repetitions = 1000
#' )
#' @export
paired_test <- function(data,
summary_measure = "mean",
which_first = 1,
shift = 0,
as_extreme_as,
direction = c("greater", "less", "two-sided"),
number_repetitions = 1,
add_normal = FALSE) {
if (!(summary_measure %in% c("mean", "median"))) {
stop("Summary measure must be either 'mean' or 'median'.")
}
if (!(direction %in% c("greater", "less", "two-sided"))) {
stop("Direction must be 'greater', 'less', or 'two-sided'.")
}
if (is.null(as_extreme_as)) {
stop("Must enter cutoff value for 'as_extreme_as'.")
}
if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
stop("Number of repetitions must be positive and integer valued.")
}
if (!(which_first %in% 1:2)) {
stop("'which_first' must equal 1 or 2 to indicate order of subtraction.")
}
if (is.vector(data)) {
warning("Assuming data entered is vector of differences.")
differences <- data
n <- length(data)
} else {
if (ncol(data) < 2 | ncol(data) > 3) {
stop("Data should have two or three variables.")
}
if (ncol(data) == 3) {
data <- data[, 2:3]
}
differences <- data[, which_first] - data[, ifelse(which_first == 1, 2, 1)]
n <- nrow(data)
}
sim_diffs <- rep(NA, number_repetitions)
if (summary_measure == "mean") {
obs_diff <- mean(differences)
for (i in 1:number_repetitions) {
sim_diffs[i] <- mean(sample(x = differences + shift, size = n, replace = TRUE))
}
} else {
obs_diff <- median(differences)
for (i in 1:number_repetitions) {
sim_diffs[i] <- median(sample(x = differences + shift, size = n, replace = TRUE))
}
}
count_extreme <- ifelse(direction == "greater", sum(sim_diffs >= as_extreme_as),
ifelse(direction == "less", sum(sim_diffs <= as_extreme_as),
sum(sim_diffs <= -abs(as_extreme_as)) +
sum(sim_diffs >= abs(as_extreme_as))
)
)
h <- hist(sim_diffs, plot = FALSE, breaks = "FD")
if (direction == "two-sided") {
cuts <- cut(h$breaks, c(-Inf, -abs(as_extreme_as), abs(as_extreme_as), Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
col.vec[cuts == levels(cuts)[3]] <- "red"
} else if (direction == "greater") {
cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[2]] <- "red"
} else {
cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
}
range_diffs <- max(sim_diffs) - min(sim_diffs)
ifelse(summary_measure == "mean",
xlabel <- "Mean Difference",
xlabel <- "Median Difference"
)
plot(h,
col = col.vec, main = "",
ylab = "",
yaxt = "n",
xlab = xlabel,
xlim = c(
min(
min(sim_diffs),
ifelse(direction == "two-sided",
-abs(as_extreme_as) - range_diffs / 5,
as_extreme_as - range_diffs / 5
)
),
max(
max(sim_diffs),
ifelse(direction == "two-sided",
abs(as_extreme_as) + range_diffs / 5,
as_extreme_as + range_diffs / 5
)
)
),
sub = paste("Count = ",
count_extreme,
"/", number_repetitions, " = ",
round(count_extreme / number_repetitions, 4),
sep = ""
)
)
if (add_normal == TRUE) {
add_norm(h, sim_diffs)
}
legend("topright",
legend =
c(
paste("Mean =", round(mean(sim_diffs, na.rm = T), 3)),
paste("SD =", round(sd(sim_diffs, na.rm = T), 3))
),
col = "white", bty = "n"
)
if (direction == "two-sided") {
abline(v = abs(as_extreme_as), col = "red", lwd = 2)
abline(v = -abs(as_extreme_as), col = "red", lwd = 2)
} else {
abline(v = as_extreme_as, col = "red", lwd = 2)
}
}
#' Bootstrap confidence interval for a paired mean difference
#'
#' This function will create a bootstrap confidence interval for a
#' paired mean difference or paired median difference between two
#' quantitative variables for matched pairs data.
#'
#' @param data Vector of differences or a two- or three-column data frame
#' with values for each group in last two columns.
#' @param summary_measure Name of summary measure to return from simulations.
#' Allowed values are
#' `"mean"` for confidence interval for paired mean difference or
#' `"median"` for confidence interval for paired median difference.
#' Defaults to `"mean"`.
#' @param number_repetitions Number of bootstrapped resamples.
#' @param which_first Which column is first in order of subtraction?
#' `1` if subtracting second column from first (1 - 2);
#' `2` if subtracting first column from second (2 - 1). Defaults to `1`.
#' @param confidence_level Confidence level for interval in decimal form.
#' Defaults to `0.95` (95% confidence interval).
#'
#' @return Returns plot of distribution of bootstrapped statistics,
#' with values as or more extreme than percentile confidence interval range
#' highlighted, and reports confidence interval as subtitle on plot.
#'
#' @examples
#' set.seed(117)
#' x <- rnorm(25)
#' y <- x + 1 + rnorm(25, 0, 1.8)
#' data <- data.frame(x, y)
#' paired_bootstrap_CI(data,
#' which_first = 1,
#' confidence_level = 0.9,
#' number_repetitions = 1000
#' )
#' @export
paired_bootstrap_CI <- function(data,
summary_measure = "mean",
which_first = 1,
confidence_level = 0.95,
number_repetitions = 100) {
if (!(summary_measure %in% c("mean", "median"))) {
stop("Summary measure must be either 'mean' or 'median'.")
}
if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
stop("Number of repetitions must be positive and integer valued.")
}
if (confidence_level < 0 | confidence_level > 1) {
stop("Confidence level must be given in decimal form.")
}
if (!(which_first %in% 1:2)) {
stop("'which_first' must equal 1 or 2 to indicate order of subtraction.")
}
if (is.vector(data)) {
warning("Assuming data entered is vector of differences.")
differences <- data
n <- length(data)
} else {
if (ncol(data) < 2 | ncol(data) > 3) {
stop("Data should have two or three variables.")
}
if (ncol(data) == 3) {
data <- data[, 2:3]
}
differences <- data[, which_first] - data[, ifelse(which_first == 1, 2, 1)]
n <- nrow(data)
}
sim_diffs <- rep(NA, number_repetitions)
if (summary_measure == "mean") {
obs_diff <- mean(differences)
for (i in 1:number_repetitions) {
boot_samp <- sample(differences, n, replace = TRUE)
sim_diffs[i] <- mean(boot_samp)
}
} else {
obs_diff <- median(differences)
for (i in 1:number_repetitions) {
boot_samp <- sample(differences, n, replace = TRUE)
sim_diffs[i] <- median(boot_samp)
}
}
low_ci <- quantile(sim_diffs, (1 - confidence_level) / 2)
high_ci <- quantile(sim_diffs, 1 - (1 - confidence_level) / 2)
h <- hist(sim_diffs, plot = FALSE, breaks = "FD")
cuts <- cut(h$breaks, c(-Inf, low_ci, high_ci, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
col.vec[cuts == levels(cuts)[3]] <- "red"
break_range <- max(h$breaks) - min(h$breaks)
ifelse(summary_measure == "mean",
xlabel <- "Bootstrap Mean Difference",
xlabel <- "Bootstrap Median Difference"
)
plot(h,
col = col.vec, main = "",
ylab = "",
yaxt = "n",
xlim = c(
min(min(h$breaks), low_ci - break_range / 6),
max(max(h$breaks), high_ci + break_range / 6)
),
xlab = xlabel,
sub = paste0(
100 * confidence_level, "% CI: (",
round(low_ci, 3), ", ", round(high_ci, 3), ")"
)
)
abline(v = c(low_ci, high_ci), col = "red", lwd = 2)
cutoff_label <- c(
paste(round(100 * (1 - confidence_level) / 2, 1), "percentile"),
paste(round(100 * (1 - (1 - confidence_level) / 2), 1), "percentile")
)
text(c(low_ci, high_ci),
rep(max(h$counts), 2),
labels = cutoff_label,
pos = c(2, 4), cex = .75
)
}
#' Simulation-based hypothesis test for a difference in proportions
#'
#' This function will run a simulation-based hypothesis test for a
#' difference in proportion of successes between two independent groups.
#'
#' @param formula Formula of the form `response ~ predictor`,
#' where `predictor` defines the two groups of the explanatory variable and
#' `response` is binary or a two-level categorical variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param first_in_subtraction Value of predictor variable
#' that should be first in order of subtraction for computing
#' difference in proportions.
#' @param response_value_numerator Value of response that corresponds
#' to "success" when computing proportions.
#' @param number_repetitions Number of simulated samples.
#' @param as_extreme_as Value of observed difference in proportions.
#' @param direction Direction of alternative hypothesis.
#' Allowed values are `"greater"`, `"less"`, or `"two-sided"`.
#' @param add_normal Logical value indicating whether to superimpose a normal
#' curve on the histogram. Defaults to FALSE.
#'
#' @return Returns plot of distribution of simulated statistics,
#' with values as or more extreme than specified
#' value highlighted, and reports
#' proportion of simulations as or more extreme than specified
#' as subtitle on plot.
#'
#' @examples
#' data(pt)
#' pt$twoSeconds <- ifelse(pt$responses >= 2, "Yes", "No")
#' two_proportion_test(twoSeconds ~ brand,
#' data = pt,
#' first_in_subtraction = "B1",
#' response_value_numerator = "Yes",
#' as_extreme_as = -.4,
#' direction = "two-sided",
#' number_repetitions = 1000
#' )
#' @export
two_proportion_test <- function(formula,
data,
first_in_subtraction,
response_value_numerator,
as_extreme_as,
direction = c("greater", "less", "two-sided"),
number_repetitions = 1,
add_normal = FALSE) {
if (!(direction %in% c("greater", "less", "two-sided"))) {
stop("Direction must be one of 'greater', 'less', or 'two-sided'.")
}
if (number_repetitions < 1 | number_repetitions %% 1 != 0) {
stop("Number of repetitions must be positive and integer valued.")
}
resp.name <- all.vars(formula)[1]
pred.name <- all.vars(formula)[2]
eval(parse(text = paste0("data$", resp.name, " = factor(data$", resp.name, ")")))
eval(parse(text = paste0("data$", pred.name, " = factor(data$", pred.name, ")")))
response <- eval(parse(text = paste0("data$", resp.name)))
predictor <- eval(parse(text = paste0("data$", pred.name)))
if (!(first_in_subtraction %in% unique(predictor))) {
stop("First term in order of subtraction must match values in data.")
}
if (!(response_value_numerator %in% unique(response))) {
stop("Value of response in numerator must match values in data.")
}
row.pcts <- prop.table(table(predictor, response), 1)
obs.diff <- row.pcts[
eval(parse(text = paste0("'", first_in_subtraction, "'"))),
eval(parse(text = paste0("'", response_value_numerator, "'")))
] -
row.pcts[
setdiff(unique(predictor), eval(parse(text = paste0("'", first_in_subtraction, "'")))),
eval(parse(text = paste0("'", response_value_numerator, "'")))
]
sim_diffs <- rep(NA, number_repetitions)
for (i in 1:number_repetitions) {
newResponse <- sample(response)
row.pcts <- prop.table(table(predictor, newResponse), 1)
sim_diffs[i] <- row.pcts[
eval(parse(text = paste0("'", first_in_subtraction, "'"))),
eval(parse(text = paste0("'", response_value_numerator, "'")))
] -
row.pcts[
setdiff(unique(predictor), eval(parse(text = paste0("'", first_in_subtraction, "'")))),
eval(parse(text = paste0("'", response_value_numerator, "'")))
]
}
count_extreme <- ifelse(direction == "greater", sum(sim_diffs >= as_extreme_as),
ifelse(direction == "less", sum(sim_diffs <= as_extreme_as),
sum(sim_diffs <= -abs(as_extreme_as)) +
sum(sim_diffs >= abs(as_extreme_as))
)
)
h <- hist(sim_diffs, plot = FALSE, breaks = "FD")
if (direction == "two-sided") {
cuts <- cut(h$breaks, c(-Inf, -abs(as_extreme_as), abs(as_extreme_as), Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
col.vec[cuts == levels(cuts)[3]] <- "red"
} else if (direction == "greater") {
cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[2]] <- "red"
} else {
cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
}
range_diffs <- max(sim_diffs) - min(sim_diffs)
plot(h,
col = col.vec, main = "",
ylab = "",
yaxt = "n",
xlab = "Difference in Proportions",
xlim = c(
min(min(sim_diffs), ifelse(direction == "two-sided",
-abs(as_extreme_as) - range_diffs / 5,
as_extreme_as - range_diffs / 5)),
max(max(sim_diffs), ifelse(direction == "two-sided",
abs(as_extreme_as) + range_diffs / 5,
as_extreme_as + range_diffs / 5))
),
sub = paste("Count = ",
count_extreme,
"/", number_repetitions, " = ",
round(count_extreme / number_repetitions, 4),
sep = ""
)
)
if (add_normal == TRUE) {
add_norm(h, sim_diffs)
}
legend("topright",
legend = c(
paste("Mean =", round(mean(sim_diffs, na.rm = T), 3)),
paste("SD =", round(sd(sim_diffs, na.rm = T), 3))
),
col = "white", bty = "n"
)
if (direction == "two-sided") {
abline(v = abs(as_extreme_as), col = "red", lwd = 2)
abline(v = -abs(as_extreme_as), col = "red", lwd = 2)
} else {
abline(v = as_extreme_as, col = "red", lwd = 2)
}
}
#' Bootstrap confidence interval for a difference in proportions
#'
#' This function will create a bootstrap confidence interval for a
#' difference in proportion of successes between two independent groups.
#'
#' @param formula Formula of the form `response ~ predictor`,
#' where `predictor` defines the two groups of the explanatory variable and
#' `response` is binary or a two-level categorical variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param first_in_subtraction Value of predictor variable
#' that should be first in order of subtraction for computing
#' difference in proportions.
#' @param response_value_numerator Value of response that corresponds
#' to "success" when computing proportions.
#' @param number_repetitions Number of bootstrapped resamples.
#' @param confidence_level Confidence level for interval in decimal form.
#' Defaults to 0.95 (95% confidence interval).
#'
#' @return Returns plot of distribution of bootstrapped statistics,
#' with values as or more extreme than percentile confidence interval range
#' highlighted, and reports confidence interval as subtitle on plot.
#'
#' @examples
#' data(pt)
#' pt$twoSeconds <- ifelse(pt$responses >= 2, "Yes", "No")
#' two_proportion_bootstrap_CI(twoSeconds ~ brand,
#' data = pt,
#' first_in_subtraction = "B1",
#' response_value_numerator = "Yes",
#' confidence_level = 0.95,
#' number_repetitions = 1000
#' )
#' @export
two_proportion_bootstrap_CI <- function(formula,
data,
first_in_subtraction,
response_value_numerator,
confidence_level = 0.95,
number_repetitions = 100) {
if (number_repetitions < 1 | number_repetitions %% 1 != 0) {
stop("Number of repetitions must be positive and integer valued.")
}
resp.name <- all.vars(formula)[1]
pred.name <- all.vars(formula)[2]
eval(parse(text = paste0("data$", resp.name, " = factor(data$", resp.name, ")")))
eval(parse(text = paste0("data$", pred.name, " = factor(data$", pred.name, ")")))
response <- eval(parse(text = paste0("data$", resp.name)))
predictor <- eval(parse(text = paste0("data$", pred.name)))
if (!(first_in_subtraction %in% unique(predictor))) {
stop("First term in order of subtraction must match values in data.")
}
if (!(response_value_numerator %in% unique(response))) {
stop("Value of response in numerator must match values in data.")
}
if (confidence_level < 0 | confidence_level > 1) {
stop("Confidence level must be given in decimal form.")
}
row.pcts <- prop.table(table(predictor, response), 1)
obs.diff <- row.pcts[
eval(parse(text = paste0("'", first_in_subtraction, "'"))),
eval(parse(text = paste0("'", response_value_numerator, "'")))
] -
row.pcts[
setdiff(unique(predictor), eval(parse(text = paste0("'", first_in_subtraction, "'")))),
eval(parse(text = paste0("'", response_value_numerator, "'")))
]
ng1 <- sum(predictor == eval(parse(text = paste0("'", first_in_subtraction, "'"))))
ng2 <- sum(predictor != eval(parse(text = paste0("'", first_in_subtraction, "'"))))
sim_diffs <- rep(NA, number_repetitions)
for (i in 1:number_repetitions) {
newResponse <- response # sets up as factor with correct levels
newResponse[predictor == eval(parse(text = paste0("'", first_in_subtraction, "'")))] <-
sample(response[predictor == eval(parse(text = paste0("'", first_in_subtraction, "'")))], ng1, replace = TRUE)
newResponse[predictor != eval(parse(text = paste0("'", first_in_subtraction, "'")))] <-
sample(response[predictor != eval(parse(text = paste0("'", first_in_subtraction, "'")))], ng2, replace = TRUE)
row.pcts <- prop.table(table(predictor, newResponse), 1)
sim_diffs[i] <- row.pcts[
eval(parse(text = paste0("'", first_in_subtraction, "'"))),
eval(parse(text = paste0("'", response_value_numerator, "'")))
] -
row.pcts[
setdiff(unique(predictor), eval(parse(text = paste0("'", first_in_subtraction, "'")))),
eval(parse(text = paste0("'", response_value_numerator, "'")))
]
}
low_ci <- quantile(sim_diffs, (1 - confidence_level) / 2)
high_ci <- quantile(sim_diffs, 1 - (1 - confidence_level) / 2)
h <- hist(sim_diffs, plot = FALSE, breaks = "FD")
cuts <- cut(h$breaks, c(-Inf, low_ci, high_ci, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
col.vec[cuts == levels(cuts)[3]] <- "red"
break_range <- max(h$breaks) - min(h$breaks)
plot(h,
col = col.vec, main = "",
ylab = "",
yaxt = "n",
xlim = c(
min(min(h$breaks), low_ci - break_range / 6),
max(max(h$breaks), high_ci + break_range / 6)
),
xlab = "Bootstrap Difference in Proportions",
sub = paste0(
"Mean: ", round(mean(sim_diffs), 3),
" SD: ", round(sd(sim_diffs), 3), " ",
100 * confidence_level, "% CI: (",
round(low_ci, 3), ", ", round(high_ci, 3), ")"
)
)
abline(v = c(low_ci, high_ci), col = "red", lwd = 2)
cutoff_label <- c(
paste(round(100 * (1 - confidence_level) / 2, 1), "percentile"),
paste(round(100 * (1 - (1 - confidence_level) / 2), 1), "percentile")
)
text(c(low_ci, high_ci),
rep(max(h$counts), 2),
labels = cutoff_label,
pos = c(2, 4), cex = .75
)
}
#' Simulation-based hypothesis test for a difference in means
#'
#' This function will run a simulation-based hypothesis test for a
#' difference in means or difference in medians of a quantitative variable
#' for two independent groups.
#'
#' @param formula Formula of the form `response ~ predictor`,
#' where `predictor` defines the two groups of the explanatory variable and
#' `response` is a quantitative response variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param summary_measure Name of summary measure to return from simulations.
#' Allowed values are
#' `"mean"` for test of difference in means or
#' `"median"` for test of difference in medians.
#' Defaults to `"mean"`.
#' @param first_in_subtraction Value of predictor variable
#' that should be first in order of subtraction for computing
#' difference in means.
#' @param number_repetitions Number of simulated samples.
#' @param as_extreme_as Value of observed difference in means.
#' @param direction Direction of alternative hypothesis.
#' Allowed values are `"greater"`, `"less"`, or `"two-sided"`.
#' @param add_normal Logical value indicating whether to superimpose a normal
#' curve on the histogram. Defaults to FALSE.
#'
#' @return Returns plot of distribution of simulated statistics,
#' with values as or more extreme than specified
#' value highlighted, and reports
#' proportion of simulations as or more extreme than specified
#' as subtitle on plot.
#'
#' @examples
#' data(pt)
#' two_mean_test(responses ~ brand,
#' data = pt,
#' first_in_subtraction = "B1",
#' as_extreme_as = -.4,
#' direction = "two-sided",
#' number_repetitions = 1000
#' )
#' @export
two_mean_test <- function(formula,
data,
summary_measure = "mean",
first_in_subtraction,
as_extreme_as,
direction = c("greater", "less", "two-sided"),
number_repetitions = 1,
add_normal = FALSE) {
if (!(summary_measure %in% c("mean", "median"))) {
stop("Summary measure must be either 'mean' or 'median'.")
}
if (!(direction %in% c("greater", "less", "two-sided"))) {
stop("Direction must be 'greater', 'less', or 'two-sided'.")
}
if (is.null(as_extreme_as)) {
stop("Must enter cutoff value for 'as_extreme_as'.")
}
if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
stop("Number of repetitions must be positive and integer valued.")
}
resp.name <- all.vars(formula)[1]
pred.name <- all.vars(formula)[2]
eval(parse(text = paste0("data$", pred.name, " = factor(data$", pred.name, ")")))
response <- eval(parse(text = paste0("data$", resp.name)))
predictor <- eval(parse(text = paste0("data$", pred.name)))
if (!(first_in_subtraction %in% unique(predictor))) {
stop("First term in order of subtraction must match values in data.")
}
n <- nrow(data)
sim_diffs <- rep(NA, number_repetitions)
if (summary_measure == "mean") {
obs.diff <- mean(response[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))]) -
mean(response[predictor == setdiff(
unique(predictor),
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))
)])
for (i in 1:number_repetitions) {
newResponse <- sample(response)
sim_diffs[i] <- mean(newResponse[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))]) -
mean(newResponse[predictor == setdiff(
unique(predictor),
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))
)])
}
} else {
obs.diff <- median(response[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))]) -
median(response[predictor == setdiff(
unique(predictor),
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))
)])
for (i in 1:number_repetitions) {
newResponse <- sample(response)
sim_diffs[i] <- median(newResponse[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))]) -
median(newResponse[predictor == setdiff(
unique(predictor),
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))
)])
}
}
count_extreme <- ifelse(direction == "greater", sum(sim_diffs >= as_extreme_as),
ifelse(direction == "less", sum(sim_diffs <= as_extreme_as),
sum(sim_diffs <= -abs(as_extreme_as)) +
sum(sim_diffs >= abs(as_extreme_as))
)
)
h <- hist(sim_diffs, plot = FALSE, breaks = "FD")
if (direction == "two-sided") {
cuts <- cut(h$breaks, c(-Inf, -abs(as_extreme_as), abs(as_extreme_as), Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
col.vec[cuts == levels(cuts)[3]] <- "red"
} else if (direction == "greater") {
cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[2]] <- "red"
} else {
cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
}
range_diffs <- max(sim_diffs) - min(sim_diffs)
ifelse(summary_measure == "mean",
xlabel <- "Difference in Means",
xlabel <- "Difference in Medians"
)
plot(h,
col = col.vec, main = "",
ylab = "",
yaxt = "n",
xlab = xlabel,
xlim = c(
min(
min(sim_diffs),
ifelse(direction == "two-sided",
-abs(as_extreme_as) - range_diffs / 5,
as_extreme_as - range_diffs / 5
)
),
max(
max(sim_diffs),
ifelse(direction == "two-sided",
abs(as_extreme_as) + range_diffs / 5,
as_extreme_as + range_diffs / 5
)
)
),
sub = paste("Count = ",
count_extreme,
"/", number_repetitions, " = ",
round(count_extreme / number_repetitions, 4),
sep = ""
)
)
if (add_normal == TRUE) {
add_norm(h, sim_diffs)
}
legend("topright",
legend = c(
paste("Mean =", round(mean(sim_diffs, na.rm = T), 3)),
paste("SD =", round(sd(sim_diffs, na.rm = T), 3))
),
col = "white", bty = "n"
)
if (direction == "two-sided") {
abline(v = abs(as_extreme_as), col = "red", lwd = 2)
abline(v = -abs(as_extreme_as), col = "red", lwd = 2)
} else {
abline(v = as_extreme_as, col = "red", lwd = 2)
}
}
#' Bootstrap confidence interval for a difference in means
#'
#' This function will create a bootstrap confidence interval for a
#' difference in means or difference in medians of a quantitative variable
#' for two independent groups.
#'
#' @param formula Formula of the form `response ~ predictor`,
#' where `predictor` defines the two groups of the explanatory variable and
#' `response` is a quantitative response variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param summary_measure Name of summary measure to return from simulations.
#' Allowed values are
#' `"mean"` for confidence interval for difference in means
#' `"median"` for confidence interval for difference in medians.
#' Defaults to `"mean"`.
#' @param first_in_subtraction Value of predictor variable
#' that should be first in order of subtraction for computing
#' difference in means.
#' @param number_repetitions Number of bootstrapped resamples.
#' @param confidence_level Confidence level for interval in decimal form.
#' Defaults to 0.95 (95% confidence interval).
#'
#' @return Returns plot of distribution of bootstrapped statistics,
#' with values as or more extreme than percentile confidence interval range
#' highlighted, and reports confidence interval as subtitle on plot.
#'
#' @examples
#' data(pt)
#' two_mean_bootstrap_CI(responses ~ brand,
#' data = pt,
#' first_in_subtraction = "B1",
#' confidence_level = 0.98,
#' number_repetitions = 1000
#' )
#' @export
two_mean_bootstrap_CI <- function(formula,
data,
summary_measure = "mean",
first_in_subtraction,
confidence_level = 0.95,
number_repetitions = 100) {
if (!(summary_measure %in% c("mean", "median"))) {
stop("Summary measure must be either 'mean' or 'median'.")
}
if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
stop("Number of repetitions must be positive and integer valued.")
}
resp.name <- all.vars(formula)[1]
pred.name <- all.vars(formula)[2]
eval(parse(text = paste0("data$", pred.name, " = factor(data$", pred.name, ")")))
response <- eval(parse(text = paste0("data$", resp.name)))
predictor <- eval(parse(text = paste0("data$", pred.name)))
if (!(first_in_subtraction %in% unique(predictor))) {
stop("First term in order of subtraction must match values in data.")
}
if (confidence_level < 0 | confidence_level > 1) {
stop("Confidence level must be given in decimal form.")
}
n <- nrow(data)
ng1 <- sum(predictor == eval(parse(text = paste0("'", first_in_subtraction, "'"))))
ng2 <- sum(predictor != eval(parse(text = paste0("'", first_in_subtraction, "'"))))
sim_diffs <- rep(NA, number_repetitions)
if (summary_measure == "mean") {
obs.diff <- mean(response[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))]) -
mean(response[predictor == setdiff(
unique(predictor),
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))
)])
for (i in 1:number_repetitions) {
newResponse <- rep(NA, length(response))
newResponse[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))] <-
sample(response[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))],
ng1,
replace = TRUE
)
newResponse[predictor !=
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))] <-
sample(response[predictor !=
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))],
ng2,
replace = TRUE
)
sim_diffs[i] <- mean(newResponse[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))]) -
mean(newResponse[predictor == setdiff(
unique(predictor),
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))
)])
}
} else {
obs.diff <- median(response[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))]) -
median(response[predictor == setdiff(
unique(predictor),
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))
)])
for (i in 1:number_repetitions) {
newResponse <- rep(NA, length(response))
newResponse[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))] <-
sample(response[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))],
ng1,
replace = TRUE
)
newResponse[predictor !=
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))] <-
sample(response[predictor !=
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))],
ng2,
replace = TRUE
)
sim_diffs[i] <- median(newResponse[predictor ==
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))]) -
median(newResponse[predictor == setdiff(
unique(predictor),
eval(parse(text = paste0(
"'",
first_in_subtraction, "'"
)))
)])
}
}
low_ci <- quantile(sim_diffs, (1 - confidence_level) / 2)
high_ci <- quantile(sim_diffs, 1 - (1 - confidence_level) / 2)
h <- hist(sim_diffs, plot = FALSE, breaks = "FD")
cuts <- cut(h$breaks, c(-Inf, low_ci, high_ci, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
col.vec[cuts == levels(cuts)[3]] <- "red"
break_range <- max(h$breaks) - min(h$breaks)
ifelse(summary_measure == "mean",
xlabel <- "Bootstrap Difference in Means",
xlabel <- "Bootstrap Difference in Medians"
)
plot(h,
col = col.vec, main = "",
ylab = "",
yaxt = "n",
xlim = c(
min(min(h$breaks), low_ci - break_range / 6),
max(max(h$breaks), high_ci + break_range / 6)
),
xlab = xlabel,
sub = paste0(
100 * confidence_level, "% CI: (",
round(low_ci, 3), ", ", round(high_ci, 3), ")"
)
)
abline(v = c(low_ci, high_ci), col = "red", lwd = 2)
cutoff_label <- c(
paste(round(100 * (1 - confidence_level) / 2, 1), "percentile"),
paste(round(100 * (1 - (1 - confidence_level) / 2), 1), "percentile")
)
text(c(low_ci, high_ci),
rep(max(h$counts), 2),
labels = cutoff_label,
pos = c(2, 4), cex = .75
)
}
#' Simulation-based hypothesis test for regression
#'
#' This function will run a simulation-based hypothesis test for a
#' slope of simple linear regression or correlation
#' between two quantitative variables.
#'
#' @param formula Formula of the form `response ~ predictor`,
#' where `predictor` defines a quantitative explanatory variable and
#' `response` is a quantitative response variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param summary_measure Name of summary measure to return from simulations.
#' Allowed values are
#' `"slope"` for test of slope or
#' `"correlation"` for test of correlation.
#' @param as_extreme_as Value of observed slope or correlation.
#' @param direction Direction of alternative hypothesis.
#' Allowed values are `"greater"`, `"less"`, or `"two-sided"`.
#' @param number_repetitions Number of simulated samples.
#' @param add_normal Logical value indicating whether to superimpose a normal
#' curve on the histogram. Defaults to FALSE.
#'
#' @examples
#' data(mtfires)
#' mtfires$logHect <- log(mtfires$Hectares)
#' regression_test(logHect ~ Temperature,
#' data = mtfires,
#' summary_measure = "correlation",
#' as_extreme_as = 1.388,
#' direction = "greater",
#' number_repetitions = 1000
#' )
#' @export
regression_test <- function(formula,
data,
summary_measure = c("slope", "correlation"),
as_extreme_as,
direction = c("greater", "less", "two-sided"),
number_repetitions = 1,
add_normal = FALSE) {
if (!(summary_measure %in% c("slope", "correlation"))) {
stop("Summary measure must be either 'slope' or 'correlation'.")
}
if (!(direction %in% c("greater", "less", "two-sided"))) {
stop("Direction must be 'greater', 'less', or 'two-sided'.")
}
if (is.null(as_extreme_as)) {
stop("Must enter cutoff value for 'as_extreme_as'.")
}
if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
stop("Number of repetitions must be positive and integer valued.")
}
n <- nrow(data)
resp.name <- all.vars(formula)[1]
pred.name <- all.vars(formula)[2]
response <- eval(parse(text = paste0("data$", resp.name)))
predictor <- eval(parse(text = paste0("data$", pred.name)))
obs.fit <- lm(formula, data = data)
obs.stat <- obs.fit$coef[2]
sim_vals <- rep(NA, number_repetitions)
for (i in 1:number_repetitions) {
newResponse <- sample(response)
sim.fit <- lm(newResponse ~ predictor)
sim_vals[i] <- sim.fit$coef[2]
}
if (summary_measure == "correlation") {
sdRatio <- sd(response) / sd(predictor)
sim_vals <- sim_vals / sdRatio
obs.stat <- obs.stat / sdRatio
}
count_extreme <- ifelse(direction == "greater",
sum(sim_vals >= as_extreme_as),
ifelse(direction == "less",
sum(sim_vals <= as_extreme_as),
sum(sim_vals <= -abs(as_extreme_as)) +
sum(sim_vals >= abs(as_extreme_as))
)
)
h <- hist(sim_vals, plot = FALSE, breaks = "FD")
if (direction == "two-sided") {
cuts <- cut(h$breaks, c(-Inf, -abs(as_extreme_as), abs(as_extreme_as), Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
col.vec[cuts == levels(cuts)[3]] <- "red"
} else if (direction == "greater") {
cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[2]] <- "red"
} else {
cuts <- cut(h$breaks, c(-Inf, as_extreme_as, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
}
range_diffs <- max(sim_vals) - min(sim_vals)
ifelse(summary_measure == "correlation",
xlabel <- "Correlation",
xlabel <- "Slope"
)
plot(h,
col = col.vec, main = "",
ylab = "",
yaxt = "n",
xlab = xlabel,
xlim = c(
min(
min(sim_vals),
ifelse(direction == "two-sided",
-abs(as_extreme_as) - range_diffs / 5,
as_extreme_as - range_diffs / 5
)
),
max(
max(sim_vals),
ifelse(direction == "two-sided",
abs(as_extreme_as) + range_diffs / 5,
as_extreme_as + range_diffs / 5
)
)
),
sub = paste("Count = ",
count_extreme,
"/", number_repetitions, " = ",
round(count_extreme / number_repetitions, 4),
sep = ""
)
)
if (add_normal == TRUE) {
add_norm(h, sim_vals)
}
legend("topright",
legend = c(
paste("Mean =", round(mean(sim_vals, na.rm = T), 3)),
paste("SD =", round(sd(sim_vals, na.rm = T), 3))
),
col = "white", bty = "n"
)
if (direction == "two-sided") {
abline(v = abs(as_extreme_as), col = "red", lwd = 2)
abline(v = -abs(as_extreme_as), col = "red", lwd = 2)
} else {
abline(v = as_extreme_as, col = "red", lwd = 2)
}
}
#' Bootstrap confidence interval for regression
#'
#' This function will create a bootstrap confidence interval for a
#' slope of simple linear regression or correlation
#' between two quantitative variables.
#'
#' @param formula Formula of the form `response ~ predictor`,
#' where `predictor` defines a quantitative explanatory variable and
#' `response` is a quantitative response variable.
#' @param data Data frame with columns for response and predictor variables.
#' @param summary_measure Name of summary measure to return from simulations.
#' Allowed values are
#' `"slope"` for confidence interval for slope or
#' `"correlation"` for confidence interval for correlation.
#' @param confidence_level Confidence level for interval in decimal form.
#' Defaults to 0.95 (95% confidence interval).
#' @param number_repetitions Number of bootstrapped resamples.
#'
#' @examples
#' data(mtfires)
#' mtfires$logHect <- log(mtfires$Hectares)
#' regression_bootstrap_CI(logHect ~ Temperature,
#' data = mtfires,
#' summary_measure = "correlation",
#' confidence_level = 0.9,
#' number_repetitions = 1000
#' )
#' @export
regression_bootstrap_CI <- function(formula,
data,
summary_measure = c("slope", "correlation"),
confidence_level = 0.95,
number_repetitions = 100) {
if (!(summary_measure %in% c("slope", "correlation"))) {
stop("Summary measure must be either 'slope' or 'correlation'.")
}
if (number_repetitions < 1 | !(number_repetitions %% 1 == 0)) {
stop("Number of repetitions must be positive and integer valued.")
}
if (confidence_level < 0 | confidence_level > 1) {
stop("Confidence level must be given in decimal form.")
}
n <- nrow(data)
resp.name <- all.vars(formula)[1]
pred.name <- all.vars(formula)[2]
response <- eval(parse(text = paste0("data$", resp.name)))
predictor <- eval(parse(text = paste0("data$", pred.name)))
sim_vals <- rep(NA, number_repetitions)
for (i in 1:number_repetitions) {
sample_obs <- sample(n, n, replace = TRUE)
newResponse <- response[sample_obs]
newPred <- predictor[sample_obs]
sim.fit <- lm(newResponse ~ newPred)
sim_vals[i] <- sim.fit$coef[2]
}
if (summary_measure == "correlation") {
sdRatio <- sd(response) / sd(predictor)
sim_vals <- sim_vals / sdRatio
}
low_ci <- quantile(sim_vals, (1 - confidence_level) / 2)
high_ci <- quantile(sim_vals, 1 - (1 - confidence_level) / 2)
h <- hist(sim_vals, plot = FALSE, breaks = "FD")
cuts <- cut(h$breaks, c(-Inf, low_ci, high_ci, Inf))
col.vec <- rep("grey80", length(cuts))
col.vec[cuts == levels(cuts)[1]] <- "red"
col.vec[cuts == levels(cuts)[3]] <- "red"
break_range <- max(h$breaks) - min(h$breaks)
ifelse(summary_measure == "correlation",
xlabel <- "Bootstrap Correlation",
xlabel <- "Bootstrap Slope"
)
plot(h,
col = col.vec, main = "",
ylab = "",
yaxt = "n",
xlim = c(
min(min(h$breaks), low_ci - break_range / 6),
max(max(h$breaks), high_ci + break_range / 6)
),
xlab = xlabel,
sub = paste0(
100 * confidence_level, "% CI: (",
round(low_ci, 3), ", ", round(high_ci, 3), ")"
)
)
abline(v = c(low_ci, high_ci), col = "red", lwd = 2)
cutoff_label <- c(
paste(round(100 * (1 - confidence_level) / 2, 1), "percentile"),
paste(round(100 * (1 - (1 - confidence_level) / 2), 1), "percentile")
)
text(c(low_ci, high_ci),
rep(max(h$counts), 2),
labels = cutoff_label,
pos = c(2, 4), cex = .75
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.