#'Bayesian First Aid Alternative to a Test of Proportions
#'
#'\code{bayes.prop.test} estimates the relative frequency of success for two or
#'more groups using Bayesian estimation and is intended as a replacement for
#'\code{\link{prop.test}}.
#'
#'Given data on the number of successes and failures \code{bayes.prop.test}
#'estimates \ifelse{latex}{\eqn{\theta_{1...m}}}{\eqn{\theta}[1...m]}, the relative
#'frequencies of success for each of the \eqn{m} groups. The following model is
#'assumed for each group:
#'
#'\deqn{x \sim \mathrm{Binom}(\theta, n)}{x ~ Binomial(\theta, n)} \deqn{\theta
#'\sim \mathrm{Beta}(1, 1)}{\theta ~ Beta(1, 1)}
#'
#'
#'Here the prior on the \eqn{\theta}s is a non-informative \eqn{\mathrm{Beta}(1,
#'1)}{Beta(1, 1)} distribution which is identical to a \eqn{\mathrm{Uniform}(0,
#'1)}{Uniform(0, 1)} distribution. By \code{plot}ing and looking at a
#'\code{summary} of the object returned by \code{bayes.prop.test} you can get
#'more information about the shape of the posterior and the posterior predictive
#'distribution. \code{\link{model.code}} prints out the corresponding R code
#'underlying \code{bayes.prop.test} which can be copy-n-pasted into an R script
#'and modified, for example, changing the prior on \eqn{\theta}.
#'
#'The \code{print} and \code{plot} function will only work well with a small
#'number of groups (2 to 6). If you have more groups you might want to run the
#'model with a small number of groups and then print the model code using
#'\code{\link{model.code}} and fit the model using that code. The current model does
#'not assume any dependency between the groups, if this is an unreasonable assumption
#'you might want to modify the model code (from \code{\link{model.code}}) to
#'include a dependency between the groups (see
#'\href{http://lingpipe-blog.com/2009/09/23/bayesian-estimators-for-the-beta-binomial-model-of-batting-ability/}{here}
#'for an example).
#'
#'@param x a vector of counts of successes, a one-dimensional table with two
#' entries, or a two-dimensional table (or matrix) with 2 columns, giving the
#' counts of successes and failures, respectively.
#'@param n a vector of counts of trials; ignored if x is a matrix or a table.
#'@param comp.theta a vector of fixed relative frequencies of success to compare
#' with the estimated relative frequency of success. The length of
#' \code{comp.theta} must be the same as the number of groups specified by
#' \code{x}, and its elements must be greater than 0 and less than 1. This
#' argument fills a similar role as \code{p} in \code{\link{prop.test}}.
#'@param alternative ignored and is only retained in order to mantain
#' compatibility with \code{\link{prop.test}}.
#'@param cred.mass the amount of probability mass that will be contained in
#' reported credible intervals. This argument fills a similar role as
#' \code{conf.level} in \code{\link{prop.test}}.
#'@param correct ignored and is only retained in order to mantain compatibility
#' with \code{\link{prop.test}}
#'@param n.iter The number of iterations to run the MCMC sampling.
#'@param progress.bar The type of progress bar. Possible values are "text",
#' "gui", and "none".
#'@param p same as \code{comp.theta} and is only retained in order to mantain
#' compatibility with \code{\link{prop.test}}.
#'@param conf.level same as \code{cred.mass} and is only retained in order to
#' mantain compatibility with \code{\link{prop.test}}.
#'
#'
#'@return A list of class \code{bayes_prop_test} that contains information about
#' the analysis. It can be further inspected using the functions
#' \code{summary}, \code{plot}, \code{\link{diagnostics}} and
#' \code{\link{model.code}}.
#'
#' @examples
#'
#'
#' # Data from Muller, F. H., Tobakmissbrauch und Lungencarcinom,
#' # Zeit. f. Krebsforsch. 49, 57-85, 1939. One of the early papers
#' # investigating the relation between smoking and lung cancer.
#'
#' # Number of heavy smokers in one group of 86 lung cancer patients
#' # and one group of 86 healthy individuals.
#' no_heavy_smokers <- c(56, 31)
#' no_cases <- c(86, 86)
#'
#' bayes.prop.test(no_heavy_smokers, no_cases)
#'
#' # Save the return value in order to inspect the model result further.
#' fit <- bayes.prop.test(no_heavy_smokers, no_cases)
#' summary(fit)
#' plot(fit)
#'
#' # MCMC diagnostics (should not be necessary for such a simple model)
#' diagnostics(fit)
#'
#' # Print out the R code to run the model. This can be copy'n'pasted into
#' # an R-script and further modified.
#' model.code(fit)
#'
#'
#'@seealso \code{\link{bayes.binom.test}} for when you want to estimate the
#' relative frequency for only one group.
#'
#'@export
bayes.prop.test <- function (x, n, comp.theta = NULL, alternative = NULL, cred.mass = 0.95, correct = NULL, n.iter=15000, progress.bar="none", p, conf.level) {
if(! missing(alternative)) {
warning("The argument 'alternative' is ignored by bayes.prop.test")
}
if(! missing(correct)) {
warning("The argument 'correct' is ignored by bayes.prop.test")
}
if(! missing(p)) {
comp.theta <- p
}
if(! missing(conf.level)) {
cred.mass <- conf.level
}
x_name <- deparse(substitute(x))
n_name <- deparse(substitute(n))
### Begin slightly modified code from prop.test
DNAME <- deparse(substitute(x))
if (is.table(x) && length(dim(x)) == 1L) {
if (dim(x) != 2L)
stop("table 'x' should have 2 entries")
l <- 1
n <- sum(x)
x <- x[1L]
}
else if (is.matrix(x)) {
if (ncol(x) != 2L)
stop("'x' must have 2 columns")
l <- nrow(x)
n <- rowSums(x)
x <- x[, 1L]
}
else {
DNAME <- paste(DNAME, "out of", deparse(substitute(n)))
if ((l <- length(x)) != length(n))
stop("'x' and 'n' must have the same length")
}
OK <- complete.cases(x, n)
x <- x[OK]
n <- n[OK]
if ((k <- length(x)) < 1L)
stop("not enough data")
if (any(n <= 0))
stop("elements of 'n' must be positive")
if (any(x < 0))
stop("elements of 'x' must be nonnegative")
if (any(x > n))
stop("elements of 'x' must not be greater than those of 'n'")
if(length(comp.theta) == 1) {
comp.theta <- rep(comp.theta, length(x))
}
if (is.null(comp.theta) && (k == 1))
comp.theta <- 0.5
if (!is.null(comp.theta)) {
if (length(comp.theta) != l)
stop("'comp.theta' must have the same length as 'x' and 'n' or be a single number")
comp.theta <- comp.theta[OK]
if (any((comp.theta <= 0) | (comp.theta >= 1)))
stop("elements of 'comp.theta' must be in (0,1)")
}
if ((length(cred.mass) != 1L) || is.na(cred.mass) ||
(cred.mass <= 0) || (cred.mass >= 1))
stop("'cred.mass' must be a single number between 0 and 1")
### END code from prop.test
if(length(x) == 1) {
return(bayes.binom.test(x, n, comp.theta, cred.mass = ifelse(is.null(cred.mass), 0.5, cred.mass),
n.iter = n.iter, progress.bar = progress.bar))
}
mcmc_samples <- jags_prop_test(x, n, n.chains = 3, n.iter = ceiling(n.iter / 3) , progress.bar=progress.bar)
if(is.null(comp.theta)) {
temp_comp_val <- 0.5
} else {
temp_comp_val <- comp.theta
}
stats <- mcmc_stats(mcmc_samples, cred_mass = cred.mass, comp_val = temp_comp_val)
diff_stats <- mcmc_stats(create_theta_diff_matrix(as.matrix(mcmc_samples)), cred_mass = cred.mass, comp_val = 0)
stats <- rbind(stats, diff_stats)
bfa_object <- list(x = x, n = n, comp_theta = comp.theta, cred_mass = cred.mass,
x_name = x_name, n_name = n_name, data_name = DNAME,
mcmc_samples = mcmc_samples, stats = stats)
class(bfa_object) <- c("bayes_prop_test", "bayesian_first_aid")
bfa_object
}
create_theta_diff_matrix <- function(samples_mat) {
n_groups <- sum(str_count(colnames(samples_mat), "theta\\["))
combs <- combn(n_groups, 2)
theta_diffs <- sapply(1:ncol(combs), function(comb_i) {
i <- combs[1, comb_i]
j <- combs[2, comb_i]
theta_diff <- samples_mat[,paste0("theta[", i,"]")] - samples_mat[,paste0("theta[", j,"]")]
theta_diff <- matrix(theta_diff, nrow = 1, dimnames = NULL)
theta_diff
})
colnames(theta_diffs) <- apply(combs, 2, function(comb) {paste0("theta_diff[", comb[1], ",", comb[2], "]")})
theta_diffs
}
prop_model_string <- "model {
for(i in 1:length(x)) {
x[i] ~ dbinom(theta[i], n[i])
theta[i] ~ dbeta(1, 1)
x_pred[i] ~ dbinom(theta[i], n[i])
}
}"
jags_prop_test <- function(x, n, n.chains=3, n.iter=5000, progress.bar="none") {
mcmc_samples <- run_jags(prop_model_string, data = list(x = x, n = n), inits = list(theta = (x + 1) / (n + 2)),
params = c("theta", "x_pred"), n.chains = n.chains, n.adapt = 0,
n.update = 0, n.iter = n.iter, thin = 1, progress.bar=progress.bar)
mcmc_samples
}
format_group_diffs <- function(bfa_object) {
s <- bfa_object$stats
n_groups <- length(bfa_object$x)
med_diff_mat <- matrix("", nrow = n_groups, ncol = n_groups)
hdi_diff_mat <- matrix("", nrow = n_groups, ncol = n_groups)
diff_names <- rownames(s)[ str_detect(rownames(s), "theta_diff\\[")]
for(diff_name in diff_names) {
indices_match <- str_match(diff_name, "\\[(\\d+),(\\d+)\\]$")
i <- as.numeric(indices_match[1,2])
j <- as.numeric(indices_match[1,3])
med_diff_mat[i, j] <- as.character(round(s[diff_name, "median"], 2) )
hdi_diff_mat[i, j] <- paste("[", signif(s[diff_name, "HDIlo"], 2), ", ", signif(s[diff_name, "HDIup"], 2), "]", sep="")
}
diff_mat <- matrix("", nrow = n_groups * 2, ncol = n_groups)
for(i in seq_len(n_groups)) {
diff_mat[1 + (i - 1) * 2,] <- med_diff_mat[i,]
diff_mat[2 + (i - 1) * 2,] <- hdi_diff_mat[i,]
}
rownames(diff_mat) <- rep(seq_len(n_groups), each = 2)
rownames(diff_mat)[1:nrow(diff_mat) %% 2 == 0] <- ""
rownames(diff_mat) <- paste0(" ", rownames(diff_mat))
diff_mat <- format(diff_mat, width = max(nchar(diff_mat)), justify = "centre")
colnames(diff_mat) <- format(as.character(1:ncol(diff_mat)), width = max(nchar(diff_mat)), justify = "centre")
diff_mat <- diff_mat[-c(nrow(diff_mat) - 1, nrow(diff_mat)), -1, drop=FALSE]
diff_mat
}
#' @method plot bayes_prop_test
#' @export
plot.bayes_prop_test <- function(x, ...) {
samples <- as.matrix(x$mcmc_samples)
# Throw away everything except the what we want to plot, the theta samples.
samples <- samples[,str_detect(colnames(samples), "^theta\\[")]
n_groups <- length(x$x)
diff_samples <- create_theta_diff_matrix(as.matrix(x$mcmc_samples))
layout_mat <- matrix( 0 , nrow=n_groups, ncol=n_groups)
#layout_mat[,1] <- seq_len(n_groups)
diag(layout_mat) <- seq_len(n_groups)
old_par <- par(no.readonly = TRUE)
layout_mat <- t(layout_mat)
layout_mat[lower.tri(layout_mat)] <- seq(n_groups + 1, by = 2,length.out = (ncol(diff_samples)))
layout_mat <- t(layout_mat)
layout_mat[lower.tri(layout_mat)] <- seq(n_groups + 2, by = 2,length.out = (ncol(diff_samples)))
layout(layout_mat)
par( mar=c(3.5,2,2,2) , mgp=c(2.25,0.7,0) )
post_xlim <- range(apply(samples, 2, quantile, probs = c(0.001, 0.999)))
# Some rules for making the post_xlim nice, with a preference for showing endpoints of the scale
xlim_length <- abs(diff(post_xlim))
if( post_xlim[1] - xlim_length < 0) {
post_xlim[1] <- 0
}
if(post_xlim[2] + xlim_length > 1) {
post_xlim[2] <- 1
}
plotPost(samples[,"theta[1]"], cex.lab = 1.5, xlab=bquote(theta[1]), main=paste("Rel. Freq. Group 1"),
cred_mass= x$cred_mass, col="#5DE293" , show_median=TRUE, comp_val=x$comp_theta[1], xlim=post_xlim)
for(i in 2:n_groups) {
plotPost(samples[,paste0("theta[",i, "]")], cex.lab = 1.5, xlab=bquote(theta[.(i)]), main=paste("Group", i),
cred_mass= x$cred_mass, col="#5DE293" , show_median=TRUE, comp_val=x$comp_theta[i], xlim=post_xlim, show_labels = FALSE)
}
diff_xlim <- range(apply(diff_samples, 2, quantile, probs = c(0.001, 0.999)))
if(all(diff_xlim < 0)) {
diff_xlim[2] <- 0
} else if(all(diff_xlim > 0)) {
diff_xlim[1] <- 0
}
for(i in 1:ncol(diff_samples)) {
diff_name <- colnames(diff_samples)[i]
indices_match <- str_match(diff_name, "\\[(\\d+),(\\d+)\\]$")
group_i <- as.numeric(indices_match[1,2])
group_j <- as.numeric(indices_match[1,3])
plotPost(diff_samples[,i], cex.lab = 1.5, xlab=bquote(theta[.(group_i)] - theta[.(group_j)]),
main="", cred_mass= x$cred_mass, col="skyblue" , show_median=TRUE,
comp_val=0, xlim=diff_xlim, show_labels = FALSE)
plotPost(-diff_samples[,i], cex.lab = 1.5, xlab=bquote(theta[.(group_j)] - theta[.(group_i)]),
main="", cred_mass= x$cred_mass, col="skyblue" , show_median=TRUE,
comp_val=0, xlim=sort(-diff_xlim), show_labels = FALSE)
}
par(old_par)
invisible(NULL)
}
#' @export
print.bayes_prop_test <- function(x, ...) {
s <- format_stats(x$stats)
cat("\n")
cat("\tBayesian First Aid proportion test\n")
cat("\n")
cat("data: ", x$data_name, "\n", sep="")
pad_width <- max(nchar(as.character(c(x$x, x$n)))) + 1
cat("number of successes: ", paste(str_pad(x$x, pad_width), collapse = ","), "\n", sep="")
cat("number of trials: ", paste(str_pad(x$n, pad_width), collapse = ","), "\n", sep="")
cat("Estimated relative frequency of success [", s[1, "HDI%"] ,"% credible interval]:\n", sep="")
for(param_i in which(str_detect(rownames(s), "theta\\["))) {
param <- paste("theta[", param_i, "]", sep="")
cat(" Group ", param_i,": " ,s[param, "median"], " [", paste(s[param, c("HDIlo", "HDIup")], collapse = ", "),"]\n", sep = "")
}
group_diffs <- format_group_diffs(x)
if(ncol(group_diffs) > 1) {
cat("Estimated pairwise group differences (row - column) with", s[1, "HDI%"] ,"% cred. intervals:\n")
cat(format("Group", width = 2 + nchar(rownames(group_diffs)[1]) * 2 + sum(nchar(colnames(group_diffs))),
justify = "centre"), "\n", sep="")
print(format_group_diffs(x), quote=FALSE)
} else {
cat("Estimated group difference (Group 1 - Group 2):\n")
cat(" ", str_trim(group_diffs[1,1]), " ",group_diffs[2,1], "\n", sep="")
}
if(! is.null(x$comp_theta)) {
cat("The prob. that the relative frequency of success is less/more than comp. val:\n")
comp_table <- s[str_detect(rownames(s), "theta\\["), c("comp", "%<comp", "%>comp")]
rownames(comp_table) <- paste(" Group ", 1:nrow(comp_table), ":", sep="")
colnames(comp_table) <- c("comp. val.", " <", " >")
print(format(comp_table, justify="centre"), quote=FALSE)
}
if(ncol(group_diffs) == 1) {
cat("The relative frequency of success is larger for Group 1 by a probability\n")
cat("of", s["theta_diff[1,2]", "%>comp"], "and larger for Group 2 by a probability of", s["theta_diff[1,2]", "%<comp"], ".\n")
}
cat("\n")
invisible(NULL)
}
#' @export
summary.bayes_prop_test <- function(object, ...) {
s <- round(object$stats, 3)
cat(" Data\n")
pad_width <- max(nchar(as.character(c(object$x, object$n)))) + 1
cat("number of successes: ", paste(str_pad(object$x, pad_width), collapse = ","), "\n", sep="")
cat("number of trials: ", paste(str_pad(object$n, pad_width), collapse = ","), "\n", sep="")
cat("\n")
cat(" Model parameters and generated quantities\n")
cat("theta[i]: the relative frequency of success for Group i\n")
cat("x_pred[i]: predicted number of successes in a replication for Group i\n")
cat("theta_diff[i,j]: the difference between two groups (theta[i] - theta[j])\n")
cat("\n")
cat(" Measures\n" )
print(s[, c("mean", "sd", "HDIlo", "HDIup", "%<comp", "%>comp")])
cat("\n")
cat("'HDIlo' and 'HDIup' are the limits of a ", s[1, "HDI%"] ,"% HDI credible interval.\n", sep="")
cat("'%<comp' and '%>comp' are the probabilities of the respective parameter being\n")
cat("smaller or larger than ", s[1, "comp"] ," (except for the theta_diff parameters where\n", sep="")
cat("the comparison value comp is 0.0).\n", sep="")
cat("\n")
cat(" Quantiles\n" )
print(s[, c("q2.5%", "q25%", "median","q75%", "q97.5%")] )
invisible(object$stats)
}
#' @export
diagnostics.bayes_prop_test <- function(fit) {
print_mcmc_info(fit$mcmc_samples)
cat("\n")
print_diagnostics_measures(round(fit$stats, 3))
cat("\n")
cat(" Model parameters and generated quantities\n")
cat("theta: The relative frequency of success\n")
cat("x_pred: Predicted number of successes in a replication\n")
cat("theta_diff[i,j]: the difference between two groups (theta[i] - theta[j])\n")
old_par <- par( mar=c(3.5,2.5,2.5,0.6) , mgp=c(2.25,0.7,0) )
plot(fit$mcmc_samples)
par(old_par)
invisible(NULL)
}
# Model code for the Bayesian First Aid alternative to the test of proportions #
#' @export
model.code.bayes_prop_test <- function(fit) {
cat("### Model code for the Bayesian First Aid ###\n### alternative to the test of proportions ###\n")
cat("require(rjags)\n\n")
cat("# Setting up the data\n")
cat("x <-", deparse(fit$x, ), "\n")
cat("n <-", deparse(fit$n), "\n")
cat("\n")
pretty_print_function_body(prop_model_code)
invisible(NULL)
}
# Not to be run, just to be printed
prop_model_code <- function(x, n) {
# The model string written in the JAGS language
BayesianFirstAid::replace_this_with_model_string
# Running the model
model <- jags.model(textConnection(model_string), data = list(x = x, n = n),
n.chains = 3, n.adapt=1000)
samples <- coda.samples(model, c("theta", "x_pred"), n.iter=5000)
# Inspecting the posterior
plot(samples)
summary(samples)
# You can extract the mcmc samples as a matrix and compare the thetas
# of the groups. For example, the following shows the median and 95%
# credible interval for the difference between Group 1 and Group 2.
samp_mat <- as.matrix(samples)
quantile(samp_mat[, "theta[1]"] - samp_mat[, "theta[2]"], c(0.025, 0.5, 0.975))
}
prop_model_code <- inject_model_string(prop_model_code, prop_model_string)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.