#' Bayes Factor Hypothesis Tests
#'
#' This function allows you to test a hypothesis. Requires prior and posterior samples.
#' Returns the Savage-Dickey Density Ratio (Bayes Factor), Posterior Probabilities,
#' and Credible Interval.
#'
#' @param prior vector of prior samples
#' @param post vector of posterior samples
#' @param H0 the null hypothesis. Defaults to 0.
#' @param plot should the results be plotted? Defaults to FALSE.
#' @export
#' @examples
#' bf_test()
bf_test <- function(prior, post, H0 = 0, plot = FALSE){
xlab = "\u03B8"
ylab = "density"
## Check inputs
post = as.vector(as.matrix(post))
prior = as.vector(as.matrix(prior))
stopifnot(is.logical(plot),
is.numeric(post) | is.integer(post),
is.numeric(prior) | is.integer(prior),
is.numeric(H0) | is.integer(H0))
## Get densities at the null hypothesis
max_prior = EnvStats::demp(H0, prior)
if(max_prior==0){max_prior=.Machine$double.eps*1000}
max_posterior = EnvStats::demp(H0, post)
if(max_posterior==0){max_posterior=.Machine$double.eps*1000}
## Calculate Bayes factors and probabilities
BF01 = max_posterior / max_prior
BF10 = max_prior / max_posterior
pep = BF01 / (1+BF01)
## Make ggplot2 graphs
if (isTRUE(plot)){
label_null = paste("P(Null|\u03B8)=", round(pep,digits=3))
# Gather posterior and prior
results <- data.frame("prior" = prior,
"posterior" = post )
plot_ <- results %>%
tidyr::gather("postprior", "gathered") %>%
ggplot2::ggplot() +
ggplot2::scale_fill_manual(values = c("deepskyblue2", "red3")) +
ggplot2::scale_color_manual(values = c("deepskyblue2", "red3")) +
ggplot2::geom_density(ggplot2::aes(gathered, fill = postprior, color=postprior, y = ..density..), size = .75, alpha = .2) +
ggplot2::labs(x=xlab, y=ylab) +
theme(legend.title=element_blank()) +
annotate("linerange", x = H0, linetype="dotted",size=1.2,
ymin = min(EnvStats::demp(H0,post),EnvStats::demp(H0,prior)),
ymax=max(EnvStats::demp(H0,post),EnvStats::demp(H0,prior)), colour = "black")
probs = cbind.data.frame(m= c("P(null|D)\n","P(alt|D)\n"), prob = c(pep, 1-pep))
probs$m = factor(probs$m, levels(probs$m))
bar <- ggplot(probs, aes(x=m, y=prob,fill=m)) +
geom_col(position = "dodge",width=.35) +
scale_fill_manual(values=c("deepskyblue2", "red3")) +
guides(fill=FALSE) +
coord_cartesian(ylim=c(0,1)) +
ggplot2::labs(x="hypothesis", y="probability")
gridExtra::grid.arrange(bar,plot_,ncol=2,widths=c(.95,2))
## Results
df = c("BF01" = BF01, "BF10" = BF10, "pep"=pep)
return(df)
} else {
df = c("BF01" = BF01, "BF10" = BF10, "pep"=pep)
return(df)
}
}
#' Calculate Equal Tailed or Highest Density Interval
#'
#'
#' @param object vector of posterior samples
#' @param cred.level the confidence level. defaults to .95
#' @param method type of credible interval. One of "ETI" or default "HDI".
#' @export
#' @examples
#' cred_interval()
cred_interval = function (object, cred.level = .95, method="HDI")
{
if (method=="HDI"){
object = as.vector(as.matrix(object))
result <- c(NA_real_, NA_real_)
if (is.numeric(object)) {
attributes(object) <- NULL
x <- sort.int(object, method = "quick")
n <- length(x)
if (n > 0) {
exclude <- n - floor(n * cred.level)
low.poss <- x[1:exclude]
upp.poss <- x[(n - exclude + 1):n]
best <- which.min(upp.poss - low.poss)
result <- c(low.poss[best], upp.poss[best])
names(result) <- c("hdi.low", "hdi.high")
}
}
} else if (method=="ETI"){
object = as.vector(as.matrix(object))
alpha = 1-cred.level
upper = cred.level + (alpha/2)
lower = alpha/2
result <- quantile(object,probs=c(lower,upper), type=5)
names(result) <- c("cred.low", "cred.high")
}
return(result)
}
#' Compute the lowest posterior loss interval
#'
#' @param posterior vector of posterior samples
#' @param prior vector of prior samples
#' @param cred.level the confidence level. defaults to .95
#' @param plot Should the output be plotted? Defaults to FALSE.
#' @export
#' @examples
#' lpl_interval()
lpl_interval = function (prior, posterior, cred.level = .95, plot = FALSE)
{
prior = as.vector(as.matrix(prior))
posterior = as.vector(as.matrix(posterior))
if (missing(prior))
stop("Must input a prior distribution!.")
if (missing(posterior))
stop("Must input a posterior distribution!.")
if (!is.vector(prior))
prior <- as.vector(prior)
if (!is.vector(posterior))
posterior <- as.vector(posterior)
if (length(prior) != length(posterior))
stop("Prior and posterior must have the same number of samples!.")
name <- names(posterior)
if (is.null(name))
name <- "theta"
ord <- order(posterior)
prior <- EnvStats::demp(prior,prior, discrete = TRUE)
prior <- prior[ord]
posterior <- posterior[ord]
loss <- abdisttools:::kld(prior, posterior)[[3]]
x = c(min(posterior), posterior, max(posterior))
y = c(min(loss), loss, min(loss))
coords = cbind.data.frame(x=x,y=y)
postloss = cbind.data.frame(loss=loss,posterior=posterior,prior=prior,ord=ord)
lowest = which(postloss$loss==min(postloss$loss))
lowest = as.vector(postloss$posterior)[lowest]
n <- length(loss)
gap <- max(1, min(n - 1, round(n * cred.level)))
loss.sum <- init <- 1:(n - gap)
for (i in 1:length(init)) {
loss.sum[i] <- sum(loss[init[i]:(init[i] + gap)])
}
min.init <- init[which.min(loss.sum)]
ans <- cbind(posterior[min.init], posterior[min.init + gap])
colnames(ans) <- c("LPL.interval", "LPL.interval")
if (plot == TRUE) {
ab = cbind.data.frame(
a=c(min(posterior[min.init]), posterior[min.init:(min.init +gap)], max(posterior[min.init + gap])),
b=c(min(loss[min.init:(min.init +gap)]), loss[min.init:(min.init + gap)], min(loss[min.init:(min.init + gap)]))
)
loss_curve = ggplot2::ggplot() +
geom_polygon(data=coords,aes(x=coords$x, y=coords$y), fill = "skyblue", colour = "skyblue", alpha = 0.5) +
geom_polygon(data=ab,aes(x=ab$a, y=ab$b), fill = "royalblue3", colour = "royalblue3", alpha = .9) +
ylim(min(loss), max(loss)) +
scale_x_continuous(limits=range(posterior)) +
labs(y="E[intrinsic loss]", x="\u03B8")
postdens =
ggplot2::ggplot(postloss,aes(x = posterior, y=..density..)) +
stat_density(fill="skyblue",alpha = .25, size = .75, adjust = 4, bw = "SJ", kernel = "epanechnikov", n = 2048, trim = FALSE) +
scale_x_continuous(limits=range(posterior)) +
labs(x="\u03B8")
d <- ggplot2::ggplot_build(postdens)$data[[1]]
postdens = postdens +
geom_area(data = subset(d, x > ans[1] & x < ans[2]), aes(x=x, y=y), fill="royalblue3", alpha=.9)
png("NUL")
loss_curve= ggplot_gtable(ggplot_build(loss_curve))
postdens = ggplot_gtable(ggplot_build(postdens))
postdens$widths = loss_curve$widths
dev.off()
gridExtra::grid.arrange(loss_curve,postdens,ncol=1, top=grid::textGrob(paste0("Posterior with", " ", cred.level*100, " ", "%", " " ,"Lowest Posterior Loss Interval"),gp=gpar(fontsize=12,font=2)))
}
ans = as.vector(c(ans,lowest))
names(ans) <- c("LPL.low", "LPL.high", "LPL point estimate")
return(ans)
}
#' Compute the kullback leibler divergence
#'
#' @param a the first distribution
#' @param b the second distribution
#' @param base exponent base, defaults to exp(1)
#' @export
#' @examples
#' kld()
kld = function (a, b, base = exp(1))
{
if (!is.vector(a))
a <- as.vector(a)
if (!is.vector(b))
b <- as.vector(b)
n1 <- length(a)
n2 <- length(b)
if (!identical(n1, n2))
stop("a and b must have the same length.")
if (any(!is.finite(a)) || any(!is.finite(b)))
stop("a and b must have finite values.")
if (any(a <= 0))
a <- exp(a)
if (any(b <= 0))
b <- exp(b)
a[which(a < .Machine$double.xmin)] <- .Machine$double.xmin
b[which(b < .Machine$double.xmin)] <- .Machine$double.xmin
a <- a/sum(a)
b <- b/sum(b)
kld.a.b <- a * (log(a, base = base) - log(b, base = base))
kld.b.a <- b * (log(b, base = base) - log(a, base = base))
sum.kld.a.b <- sum(kld.a.b)
sum.kld.b.a <- sum(kld.b.a)
mean.kld <- (kld.a.b + kld.b.a)/2
mean.sum.kld <- (sum.kld.a.b + sum.kld.b.a)/2
out <- list(kld.a.b = kld.a.b, kld.b.a = kld.b.a,
mean.kld = mean.kld, sum.kld.a.b = sum.kld.a.b, sum.kld.b.a = sum.kld.b.a,
mean.sum.kld = mean.sum.kld, intrinsic.discrepancy = min(sum.kld.a.b,
sum.kld.b.a))
return(out)
}
#' Compute exact p-values for bootstrap or monte carlo samples
#'
#' This uses the correct formula (r+1)/(n+1) for computing exact p-values, instead of
#' r/n (where r is the number of samples greater than the mode of the samples,
#' and n is the total number of samples). See citations.
#'
#' @param samples vector of posterior or bootstrap samples
#' @export
#' @examples
#' exact_pvalue()
#'
exact_pvalue <- function(samples) {
if (length(dim(samples)) == 0) {
std <- backsolve(chol(var(samples)), cbind(0, t(samples)) - mean(samples),
transpose = TRUE)
sqdist <- colSums(std * std)
(sum(sqdist[-1] > sqdist[1]) + 1) / (length(samples) + 1)
} else {
std <- backsolve(chol(var(samples)), cbind(0, t(samples)) - colMeans(samples),
transpose = TRUE)
sqdist <- colSums(std * std)
sum(sqdist[-1] > sqdist[1]) + 1 / (nrow(samples) +1)
}
}
#' Get a basic tidy summary of a bayesian model (no prior samples needed)
#'
#' Get summary including ETI or HDI credible intervals, exact p-values, and
#' Vovk-Selke maximum p-value ratio (VS-MPR).
#'
#' @param x the set of posterior samples to be summarized
#' @param digits number of digits to round estimates to
#' @param estimate.method one of "mean" or "median"
#' @param cred.method one of "HDI" or "ETI"
#' @param cred.level the confidence level. defaults to .95
#' @param cred.int set to FALSE to exclude credible intervals
#' @param pvals set to FALSE to exclude exact p-values
#' @param VSMPR set to FALSE to exclude VS-MPR
#' @param droppars list of parameters to exclude, defaults to c("lp__","hs_c2")
#' @param rhat set to TRUE to include rhat
#' @param ess set to TRUE to include ess
#' @export
#' @examples
#' post_summary()
#'
post_summary =
function (x, pars, digits = 2, estimate.method = "mean", cred.int = TRUE,
pvals = FALSE, ess = TRUE, VSMPR = FALSE, cred.level = 0.95, cred.method = "ETI",
droppars = c("lp__", "hs_c2"), ...)
{
stan <- inherits(x, "stanfit")
ss <- if (stan)
as.matrix(x)
else as.matrix(x)
ss <- ss[, !colnames(ss) %in% droppars, drop = FALSE]
wch = as.numeric(grep("prior_", colnames(ss)))
if (length(wch) == 0) {
ss = ss
}
else {
ss = ss[, -wch]
}
wch = as.numeric(grep("ySim", colnames(ss)))
if (length(wch) == 0) {
ss = ss
}
else {
ss = ss[, -wch]
}
wch = as.numeric(grep("yTest", colnames(ss)))
if(length(wch)==0) {
ss = ss
} else {
ss = ss[,-wch]
}
if (!missing(pars) && !stan) {
if (length(badpars <- which(!pars %in% colnames(ss))) >
0) {
stop("unrecognized parameters: ", pars[badpars])
}
ss <- ss[, pars]
}
if (estimate.method == "mean") {
estimate.method <- match.arg(estimate.method, c("mean",
"median"))
m <- switch(estimate.method, mean = colMeans(ss), median = apply(ss, 2, stats::median))
}
else if (estimate.method == "median") {
estimate.method <- match.arg(estimate.method, c("mean",
"median"))
m <- switch(estimate.method, mean = colMeans(ss), median = apply(ss, 2, stats::median))
}
ret <- data.frame(estimate = m, std.error = apply(ss, 2, sd))
if (cred.int) {
digits = digits
cred.method <- match.arg(cred.method, c("ETI", "HDI"))
SS = as.data.frame(ss)
ci <- switch(cred.method, ETI = t(apply(SS, 2, function(s)
abdisttools:::cred_interval(s, method = "ETI", cred.level = cred.level))),
HDI = t(apply(SS,2, function(s) cred_interval(s, method = "HDI", cred.level = cred.level))))
ci = as.data.frame(ci)
colnames(ci) = c("cred.low", "cred.high")
ret <- data.frame(ret, ci)
}
if (pvals == "TRUE") {
p = apply(ss, 2, abdisttools::exact_pvalue)
ret$p.value <- signif(as.vector(p), 5)
}
if (VSMPR == "TRUE") {
p = apply(ss, 2, abdisttools::exact_pvalue)
vsmpr = VSMPR(as.vector(p))
ret$vs.mpr <- as.vector(vsmpr)
minBF = 1/ret$vs.mpr
ret$`min p.null` <- signif(as.vector(minBF/(1 + minBF)), 5)
}
if (ess == "TRUE"){
ess = apply(ss, 2, coda::effectiveSize)
ret$ess <- ess
}
ret = abdisttools:::post_tibble(ret)
ret$term = factor(ret$term, levels = ret$term)
ret = ret %>% mutate(estimate = signif(estimate, digits = digits),
std.error = signif(std.error, digits = digits))
return(ret)
}
#' Convert posterior summary to tibble
#' @keywords internal
#' @examples
#' post_tibble()
post_tibble = function (x, newnames = NULL, newcol = "term")
{
unrowname = function (x)
{
rownames(x) <- NULL
x
}
if (!is.null(newnames) && length(newnames) != ncol(x)) {
stop("newnames must be NULL or have length equal to number of columns")
}
if (all(rownames(x) == seq_len(nrow(x)))) {
ret <- data.frame(x, stringsAsFactors = FALSE)
if (!is.null(newnames)) {
colnames(ret) <- newnames
}
}
else {
ret <- data.frame(...new.col... = rownames(x), unrowname(x),
stringsAsFactors = FALSE)
colnames(ret)[1] <- newcol
if (!is.null(newnames)) {
colnames(ret)[-1] <- newnames
}
}
dplyr::as_tibble(ret)
}
#' Repeat a row
#'
#' Repeat a row
#' @param x the row
#' @param n the number of times
#' @export
#' @examples
#' duplicateRow()
duplicateRow<-function(x,n){
matrix(rep(x,each=n),nrow=n)
}
#' Repeat a column
#'
#' Repeat a column
#'
#' @param x the column
#' @param n the number of times
#' @export
#' @examples
#' duplicateCol()
duplicateCol<-function(x,n){
matrix(rep(x,each=n), ncol=n, byrow=TRUE)
}
#' Extract posterior samples from a brms object
#' @param x the brms object
#' @param pars specific parameters to include, otherwise all if empty
#' @param droppars parameters to exclude. defaults to c("lp__","hs_c2","temp_Intercept", "log_lik")
#' @export
#' @examples
#' extract_post()
extract_post = function(x,pars,droppars = c("lp__","hs_c2", "temp_Intercept", "log_lik", "ySim", "yTest")){
stan <- inherits(x, "stanfit")
ss <- if (stan)
as.matrix(x, pars = pars)
else as.matrix(x)
ss <- ss[, !colnames(ss) %in% droppars, drop = FALSE]
wch = as.numeric(grep("prior_", colnames(ss)))
if(length(wch)==0) {
ss = ss
} else {
ss = ss[,-wch]
}
wch = as.numeric(grep("ySim", colnames(ss)))
if(length(wch)==0) {
ss = ss
} else {
ss = ss[,-wch]
}
wch = as.numeric(grep("yTest", colnames(ss)))
if(length(wch)==0) {
ss = ss
} else {
ss = ss[,-wch]
}
if (!missing(pars) && !stan) {
if (length(badpars <- which(!pars %in% colnames(ss))) >
0) {
stop("unrecognized parameters: ", pars[badpars])
}
ss <- ss[, pars]
}
abdisttools:::post_tibble(ss)
}
#' Extract posterior predictive samples from an abdiststan model
#' @param x the model
#' @param predname name of the prediction. Default is "ySim"
#' @export
#' @return returns a matrix where the columns are the posterior
#' distributions for each observation in the regression model. Each
#' row is a simulated re-sampling of the data.
#' @examples
#' extract_post_pred()
extract_post_pred = function(x, predname = "ySim"){
pars = predname
stan <- inherits(x, "stanfit")
ss <- if (stan)
as.matrix(x, pars = pars)
else as.matrix(x)
wch = as.numeric(grep("prior_", colnames(ss)))
if (length(wch) == 0) {
ss = ss
}
else {
ss = ss[, -wch]
}
if (!missing(pars) && !stan) {
if (length(badpars <- which(!pars %in% colnames(ss))) >
0) {
stop("unrecognized parameters: ", pars[badpars])
}
ss <- ss[, pars]
}
df = as.data.frame(abdisttools:::post_tibble(ss))
rownames(df) = c(paste0("yRep[", 1:nrow(df), "]" ))
return(df)
}
#' Extract prior samples from a brms object
#' @param x the brms object
#' @param pars specific parameters to include, otherwise all if empty
#' @param droppars parameters to exclude. defaults to c("lp__","hs_c2")
#' @export
#' @examples
#' extract_prior()
#'
extract_prior =
function (x, pars, droppars = c("lp__", "sigma"))
{
stan <- inherits(x, "stanfit")
ss <- if (stan)
as.matrix(x, pars = pars)
else as.matrix(x)
ss <- ss[, !colnames(ss) %in% droppars, drop = FALSE]
wch = as.numeric(grep("prior_", colnames(ss)))
if (length(wch) == 0) {
ss = ss
}
else {
names = colnames(ss)[wch]
ss = as.matrix(ss[, wch])
colnames(ss) = names
}
if (!missing(pars) && !stan) {
if (length(badpars <- which(!pars %in% colnames(ss))) >
0) {
stop("unrecognized parameters: ", pars[badpars])
}
ss <- ss[, pars]
}
abdisttools:::post_tibble(ss)
}
#' Get a summary of the posterior based on the lowest posterior loss relative to prior
#'
#' Get summary including LPL (lowest-posterior loss) intervals and estimates. Parallel offered
#' to speed up calculation for bigger models.
#'
#' @param prior prior samples
#' @param post posterior samples
#' @param estimate.method one of "LPL", "mean", or "median"
#' @param cred.level the confidence level. defaults to .95
#' @param parallel set to TRUE for large models.
#' @export
#' @examples
#' lpl_summary()
lpl_summary = function(prior, post, cred.level= .95, parallel=FALSE){
names = colnames(post)
prior = as.matrix(prior)
post = as.matrix(post)
if (parallel == TRUE) {
varlist = c("prior","post", "cred.level")
cl = as.integer(parallel::detectCores() - 1)
cl = parallel::makeCluster(cl)
parallel::clusterExport(cl, varlist, envir = environment())
intervals =t(parallel::parSapply(cl, 1:ncol(prior), function(i) abdisttools:::lpl_interval(prior[,i], post[,i], cred.level = cred.level)))
estimate = intervals[,3]
intervals = as.data.frame(intervals[,1:2])
colnames(intervals) = c("cred.low", "cred.high")
summary = cbind.data.frame(term = names, estimate = estimate, intervals)
parallel::stopCluster(cl)
} else {
intervals = t(sapply(1:ncol(prior), function(i) abdisttools:::lpl_interval(prior[,i], post[,i], cred.level = cred.level)))
estimate = intervals[,3]
intervals = as.data.frame(intervals[,1:2])
colnames(intervals) = c("cred.low", "cred.high")
summary = cbind.data.frame(term = names, estimate = estimate, intervals)
}
summary = abdisttools:::post_tibble(summary)
return(summary)
}
#' Get savage-dickey bayes factors for multiple parameters
#'
#' @param prior prior samples
#' @param post posterior samples
#' @param H0 null value. defaults to 0.
#' @export
#' @examples
#' bf_summary()
bf_summary = function(prior, post, H0=0){
prior = as.matrix(prior)
post = as.matrix(post)
bf = as.data.frame(t(sapply(1:ncol(prior), function(x) abdisttools:::bf_test(post[,x],prior[,x], H0=H0))))
}
#' Get the Hellinger distance between two probability distributions
#'
#' @param post1 the first posterior distribution
#' @param post2 the second posterior distribution
#' @param plot set to TRUE to plot
#' @export
#' @examples
#' hell.dist()
#'
hell.dist = function (post1, post2, plot = FALSE)
{
post1 = as.vector(as.matrix(post1))
post2 = as.vector(as.matrix(post2))
minx = min(c(post1, post2))
maxx = max(c(post1, post2))
e1 <- EnvStats::demp(post1, post1, discrete=FALSE)
e2 <- EnvStats::demp(post2, post1, discrete=FALSE)
BC = function(a, b) {sum(sqrt((a/sum(a))*(b/sum(b))))}
BCcoef <- BC(e1, e2)
hdist_cont = sqrt(1-BCcoef)
if (plot == TRUE) {
length = length(post1)
distr = c(rep("post1", length), rep("post2", length))
samps = c(post1, post2)
posteriors = cbind.data.frame(distr, samps)
plot = ggplot(posteriors, aes(x = samps, fill = distr)) +
geom_density(alpha = 0.4)
plot(plot)
}
hdist = c(hdist_cont, hdist_cont^2)
names(hdist) = c("Hellinger's Distance", "Squared Hellinger's Distance")
return(hdist)
}
#' Get the Jensen-Shannon divergence between two or more distributions
#'
#' @param m a matrix where each column is a set of distribution samples
#' @param plot set to TRUE to plot
#' @export
#' @examples
#' js.div()
#'
js.div = function (m, plot=FALSE)
{
H <- function(v) {
v <- v[v > 0]
return(sum(-v * log(v)))
}
m = as.matrix(m)
original.m = m
m = apply(m,2,function(x) EnvStats::demp(x, x, discrete = FALSE)/sum(EnvStats::demp(x, x, discrete = FALSE)))
nDistributions <- ncol(m)
nElements <- nrow(m)
w = as.vector(1/ncol(m))
w = rep(w, ncol(m))
wm <- w* m
jsd <- H(rowSums(wm)) -
sum((w* apply(m, 2, H)))
if (plot == TRUE) {
length = nrow(m)
distr = c(sapply(1:ncol(m), function(n) rep(paste("post",
n), length)))
samps = c(original.m)
posteriors = cbind.data.frame(distr, samps)
plot = ggplot(posteriors, aes(x = samps, fill = distr)) +
geom_density(alpha = 0.4)
plot(plot)
}
jsd = c(jsd, sqrt(jsd))
names(jsd) = c("J-S Divergence", "J-S Distance")
return(jsd)
}
#' Get posthoc group comparisons for an ANOVA-type model fit with brms
#'
#' @param brms the model fit
#' @param var the first factor in the interaction
#' @param by the second factor in an interaction if there is one, defaults to "NA"
#' @export
#' @examples
#' bf_robust()
#'
bayes_posthoc = function(brms, var="variable", by="NA") {
if (by=="NA"){
brms %>%
emmeans::emmeans(var) %>%
emmeans::contrast("revpairwise") %>%
gather_emmeans_draws() %>%
spread(key=contrast, value=.value) %>%
mutate(.chain=NULL, .iteration=NULL, .draw=NULL)
} else {
brms %>%
emmeans::emmeans(var, by=by) %>%
emmeans::contrast("revpairwise") %>%
gather_emmeans_draws() %>%
spread(key=contrast, value=.value) %>%
mutate(.chain=NULL, .iteration=NULL, .draw=NULL)
}
}
#' Get the Vovk-Selke Maximum P-value Ratio for a p-value
#'
#' @param p a p-value or vector of p-values
#' @param plot whether to plot; only works for a single p-value. Defaults to FALSE.
#' @export
#' @examples
#' VSMPR()
#'
VSMPR <- function(p, plot=FALSE){
totallynotthevoightkampfftest <- function(p){
vs.mpr <- ifelse(p >= 1/exp(1), 1, 1/(-exp(1)*p*log(p)))
if (any(is.nan(vs.mpr)))
vs.mpr[is.nan(vs.mpr)] <- Inf
return(vs.mpr)
}
vk = totallynotthevoightkampfftest(p)
if (length(p)>1){
plot=FALSE
}
if (plot==TRUE) {
beta_alt = rbeta(200000,min(1,-1/log(p)),1)
beta_null = runif(200000,0,1)
alt_dens = EnvStats::demp(p,beta_alt)
H = c(rep("alt",200000),rep("null",200000))
beta = cbind.data.frame(pvals=c(beta_alt,beta_null),H=H)
title = paste0("H1: beta(\u03BE =",round(min(1,-1/log(p)),2),",","1)"," ", " ","H0: beta(1,1)"," ", " ","VS-MPR="," ", round(vk,digits=2))
gg = ggpubr::ggdensity(as.data.frame(beta),y="..density..", x="pvals", fill="H") +
labs(title=title) +
ggplot2::scale_fill_manual(values = c("deepskyblue2", "red3")) +
annotate("linerange", x = p, linetype="dotted",size=1.2, ymin = 1, ymax=max(alt_dens), colour = "black") + theme_get()
plot(gg)
}
return(vk)
}
#' Get the posterior odds and posterior p-value
#'
#' Evaluates the density at the null hypothesis vs the density at the estimate, and calculates the
#' posterior odds as P(H0 | Data) / P(theta_hat | Data). Taking inspiration from Mills' (2014, 2017)
#' Bayesian Hypothesis Testing approach, the posterior odds are also converted to a probability.
#' The probability is calculated as odds / 1+odds. This provides a p-value type measure without
#' the need to calculate Bayes Factors for conversion to posterior error probabilities. This can
#' be extremely useful when prior samples are not available. Note that Savage-Dickey Density
#' Ratio Bayes Factors evaluate P(H0 | Data) / P(H0) and characterize the change in probability
#' between the null hypothesis in the prior and posterior. Mill's Posterior Odds characterizes
#' the relative posterior probability in the null hypothesis to the point estimate. Hence, the
#' two approaches to calculating odds and p-values differ in what question is answered.
#'
#' @param posterior a vector or data frame of posterior samples
#' @param H0 a single value or a vector of values for the null hypothesis
#' @param method whether the mean (default) or median should be used for the hypothesis test
#' @export
#' @examples
#' posterior_odds()
#'
#'
posterior_odds = function(posterior, H0, method = "mean"){
posterior.odds = function(posterior, H0, method){
if (method == "mean"){
estimate = mean(posterior)
}
else if (method == "median"){
estimate = median(posterior)
}
dens.est = EnvStats::demp(estimate, posterior, discrete = FALSE)
dens.null = EnvStats::demp(H0, posterior, discrete = FALSE)
odds = ( (dens.null) / (dens.est) ) + 2.2e-16
prob = odds / (1+odds)
data.frame(posterior_odds = odds, posterior_pvalue = prob)
}
if (length(H0) > 1) {
out = t(sapply(1:ncol(posterior), function(x) posterior.odds(as.data.frame(posterior)[,x], H0[x], method)))
return(as.data.frame(out))
} else if (length(H0) == 1){
posterior.odds(posterior, H0, method)
}
}
#' One sided (Directional) Bayesian hypothesis testing
#'
#' @param posterior the posterior distribution in question
#' @param H0 The Null Hypothesis. Defaults to 0, but for distributions such as the beta it
#' makes sense to change H0 to 0.50.
#' @param direction "pos" evaluates P(theta > H0 | Data). "neg" evaluates P(theta < H0 | Data).
#' "fsp" (false sign probability) calculates the probability that the estimate is in the wrong
#' direction, ie, min( P(theta > H0 | Data) , P(theta < H0 | Data) ). Defaults to "fsp".
#' @param plot defaults to TRUE
#' @export
#' @examples
#' prob_directed()
#'
prob_directed = function(posterior, H0=0, direction = "fsp", plot = TRUE){
if (direction == "fsp") {
p = min((min(pemp(H0, posterior, discrete = TRUE),
1 - EnvStats::pemp(H0, posterior, discrete = TRUE)) + 2.2e-16), 1)
odds = p / (1-p)
df = data.frame(Odds = odds, false_sign_prob = p)
}
else if (direction == "pos"){
p = min((1 - EnvStats::pemp(H0, posterior, discrete = TRUE) + 2.2e-16) , 1)
odds = p / (1-p)
df = data.frame(Odds = odds, prob_theta_pos = p)
}
else if (direction == "neg"){
p = min((EnvStats::pemp(H0, posterior, discrete = TRUE) + 2.2e-16), 1)
odds = p / (1-p)
df = data.frame(Odds = odds, prob_theta_neg = p)
}
if (plot == TRUE) {
cutoff <- H0
if (direction == "neg"){
p = EnvStats::pemp(H0, posterior, discrete = TRUE)
label = paste(round(p, 4)*100, "\u0025","\u003c", H0)
}
if (direction == "pos"){
p = 1 - EnvStats::pemp(H0, posterior, discrete = TRUE)
label = paste(round(p, 4)*100, "\u0025","\u003e", H0)
}
if (direction == "fsp"){
p1 = EnvStats::pemp(H0, posterior, discrete = TRUE)
p2 = 1 - EnvStats::pemp(H0, posterior, discrete = TRUE)
label = paste(round(p1, 4)*100, "\u0025", "\u003c", H0, "\u003c", round(p2, 4)*100, "\u0025")
}
hist.posterior <- density(posterior, adjust = 3, kernel="gaussian", bw = "SJ", n = length(posterior)) %$%
data.frame(x = x, y = y) %>%
mutate(area = x >= cutoff)
the.plot <- ggplot(data = hist.posterior, aes(x = x, ymin = 0, ymax = y, fill = area)) +
geom_ribbon() +
geom_line(aes(y = y)) +
ggtitle(label = label) +
theme(legend.position="none") +
xlab(label="\u03B8") +
ylab(label=NULL)
if (direction == "neg"){
the.plot = the.plot + scale_fill_manual(values = c("#2CBAFF9B", "#FF00009B"))
}
if (direction == "pos"){
the.plot = the.plot + scale_fill_manual(values = c("#FF00009B", "#2CBAFF9B"))
}
plot(the.plot)
}
return(df)
}
#' Plot the VSMPRs from a summary containing p-values
#'
#' @param summary a summary containing a column named p.value
#' @param text_angle the text angle
#' @param vjust vertical adjustment
#' @export
#' @examples
#' vsmpr_plot()
#'
vsmpr_plot = function(summary, text_angle=90,vjust=.06){
summary =cbind.data.frame(summary)
ggplot(summary, aes(y=forcats::fct_rev(term), x=log(VSMPR(summary$p.value)))) +
geom_point(size=5,color="steelblue3") +
geom_vline(xintercept = 0) +
labs(y="term",x="log(VS-MPR)")+
geom_segment(aes(y=term,
yend=term,
x=0,
xend=log(VSMPR(summary$p.value))),color="steelblue3",size=2)
}
#' make likelihood ratios look nice
#'
#' convert a vector of bayes factors, VSMPRs, or likelihood ratios to easier to read
#' character columns
#'
#' @param ratio a vector of likelihood ratios
#' @param digits number of digits
#' @export
#' @examples
#' tidyRatio()
#'
tidyRatio <- function(ratio, digits="default"){
round.ratio <- function(ratio, digits="default"){
if(digits=="default"){
if(ratio < 1/1000)
result <- "\u2264 1/1000"
if((ratio >= 1/1000) & (ratio <= 1/10))
result <- paste("1/", as.character(round(1/ratio)), sep="")
if((ratio > 1/10) & (ratio < 1))
result <- paste("1/", as.character(round(1/ratio, digits=1)), sep="")
if((ratio < 10) & (ratio >= 1))
result <- as.character(round(ratio, digits=1))
if((ratio >= 10) & (ratio <= 1000))
result <- as.character(round(ratio))
if(ratio > 1000)
result <- "\u2265 1000"
}
else{
if(ratio < 1)
result <- paste("1/", as.character(round(1/ratio, digits=digits)), sep="")
else
result <- as.character(round(ratio, digits=digits))
}
return(result)
}
result <- character()
for(i in 1:length(ratio))
result[i] <- round.ratio(ratio[i], digits=digits)
return(result)
}
#' Add significance stars to probabilities
#'
#' Add significance stars to probabilities
#'
#' @param p a vector of probabilities
#' @param cutpoints in order from most to least significant, a vector of probabilities to demarcate
#' significance levels. Defaults to 0, .002 , .0075, .037, .07, 1. The middle four numbers correspond to
#' the chosen significance levels.
#' @param a vector of symbols to use corresponding to the cutpoints
#' @details The default significance levels, .0075 , .0145, .025, .037 correspond to the p-values
#' obtained when the maximum BF10 (calculated via the VSMPR) is 10, 6, 4, and 3, respectively. These also
#' correspond to minimum posterior probabilities for the null hypothesis of 0.091, 0.143, 0.20, and 0.25.
#' If applying these probabilities to local false discovery rates or posterior null probabilities
#' then one may wish to change the cutpoints to c(0, 0.091, 0.143, 0.20, 0.25, 1).
#' @export
#' @examples
#' tidyProbs()
#'
tidyProbs = function(p, digits = 4, cutpoints = c(0, .0075 , .0145, .025, .037, 1) , symbols = c("***", "**", "*", "\u00B0", " "))
{
paste(format(signif(p,digits=digits), nsmall = digits) ,
unclass(
symnum(p, corr = FALSE, na = FALSE,
cutpoints = cutpoints,
symbols = symbols)
)
)
}
#' Estimate local false discovery rates and q-values
#'
#' Estimate local false discovery rates and q-values (as well as their false sign counterparts) from a
#' vector of p-values. Works best with a large number of p-values, the more the better (in the 100s+ is best,
#' but may work fine with 50+). Using with less than 50 results in an error.
#'
#' @param p a vector of p-values
#' @export
#' @examples
#' est.lfdr()
#'
est.lfdr = function (p)
{
eps = .Machine$double.eps
if (length(p) < 50) {
stop("For lfdr and q-value estimation a large number of p-values are required. Consider instead utilizing the VS-MPR,\n or using lpl_summary to obtain the posterior error probabilities if prior samples are available.")
}
est.xi = function (p)
{
lambda = seq(0.05, .95, 0.025)
rm_na <- !is.na(p)
p <- p[rm_na]
m <- length(p)
lambda <- sort(lambda)
ll <- length(lambda)
if (min(p) < 0 || max(p) > 1) {
stop("p-values not in valid range [0, 1].")
}
if (ll == 1) {
xi0 <- mean(p >= lambda)/(1 - lambda)
xi0.lambda <- xi0
xi0 <- min(xi0, 1)
}
else {
ind <- length(lambda):1
xi0 <- cumsum(tabulate(findInterval(p, vec = lambda))[ind])/(length(p) *
(1 - lambda[ind]))
xi0 <- xi0[ind]
xi0.lambda <- xi0
minxi0 <- quantile(xi0, prob = 0.01,na.rm=TRUE)
W <- sapply(lambda, function(l) sum(p >= l))
mse <- (W/(m^2 * (1 - lambda)^2)) * (1 - W/m) + (xi0 -
minxi0)^2
xi0 <- min(xi0[mse == min(mse)], 1)
}
xi = xi0
names(xi) = "\u03BE"
return(c(xi))
}
lfdr_out <- p
one.sided.p = p/2
rm_na <- !is.na(p)
p <- p[rm_na]
if (min(p) < 0 || max(p) > 1) {
stop("P-values must be in interval [0,1]. Make sure you have not passed along Bayes Factors or VS-MPRs!")
}
xi0 <- est.xi(p)
n <- length(p)
x <- log((p + eps)/(1 - p + eps))
myd <- scdensity::scdensity(x, constraint=c("monotoneLeftTail"), bw="nrd0")
mys <- smooth.spline(x = myd$x, y = myd$y)
y <- predict(mys, x)$y
dx <- exp(x)/(1 + exp(x))^2
lfdr <- (xi0 * dx)/y
lfdr = sapply(lfdr, function(lfdr) min(lfdr, 1))
o <- order(p, decreasing = FALSE)
ro <- order(o)
lfdr <- cummax(lfdr[o])[ro]
lfdr_out[rm_na] <- lfdr
lfsr = lfdr+one.sided.p
lfsr = sapply(lfsr, function(lfsr) min(lfsr, 1))
q.value=data.frame(lfdr_out=lfdr_out)
q.value = q.value %>%
mutate(index=seq(1:nrow(q.value))) %>%
arrange(lfdr_out) %>%
mutate(q.value = cummean(lfdr_out)) %>%
arrange(index) %>%
mutate(index=NULL)
s.value = data.frame(lfsr=lfsr)
s.value = s.value %>%
mutate(index=seq(1:nrow(s.value))) %>%
arrange(lfsr) %>%
mutate(s.value = cummean(lfsr)) %>%
arrange(index) %>%
mutate(index=NULL)
lfdr_out = list(lfdr=lfdr_out, lfsr=lfsr, q.value=q.value$q.value, s.value=s.value$s.value, xi=xi0)
return(lfdr_out)
}
#' Conduct a ROPE test
#'
#' Conduct a test of practical equivalence by defining a region of practical equivalence (ROPE) around the null hypothesis H0.
#'
#' @param x a vector of posterior samples
#' @param ROPE a numeric vector of two numbers. Defaults to c(-.1, .1), assuming a Cohen's d... Adjust to what is reasonable!
#' @param interval.method If cred.limits="NA", then interval method must be one of "HDI", "ETI", or "LPL"
#' @param prior.samps only needed if interval.method="LPL"
#' @param cred.level the confidence level for interval.method. Defaults to .95.
#' @param plot plot the output. defaults to TRUE.
#' @param line.color color marking H0. "white" is default.
#' @param interval.color defaults to purple/violet "#8c00ff"
#' @param density.color defaults to blue "#2CBAFF"
#' @param rope.color defaults to red "#FF0000"
#' @export
#' @examples
#' ROPE_test()
#'
ROPE_test = function (x, ROPE = c(-.1, .1), H0 = 0, prior.samps=NULL, plot = TRUE,
line.color = "white",
interval.color = "#8c00ff",
density.color = "#2CBAFF",
rope.color = "#FF0000",
interval.method = "ETI",
cred.level = .95)
{
x = as.vector(as.matrix(x))
if (cred.level == 1){
cred.level = .99999
}
if (interval.method == "ETI") {
interval <- abdisttools:::cred_interval(as.vector(as.matrix(x)),
cred.level = cred.level, method = "ETI")
names(interval) <- c("lower", "upper")
}
else if (interval.method == "HDI") {
interval <- abdisttools:::cred_interval(as.vector(as.matrix(x)),
cred.level = cred.level, method = "HDI")
names(interval) <- c("lower", "upper")
}
else if (interval.method == "LPL") {
if (is.null(prior.samps)) {
stop("Lowest Posterior Loss intervals require prior samples!",
call. = F) }
else {
interval <- abdisttools:::lpl_interval(prior = prior.samps, posterior = x, plot = FALSE, cred.level = cred.level)[1:2]
names(interval) <- c("lower", "upper")
}
}
if (length(ROPE) != 2)
stop("Argument ROPE needs to be a vector of length two.",
call. = F)
if (ROPE[1] > ROPE[2]) {
tmp <- ROPE[2]
ROPE[2] <- ROPE[1]
ROPE[1] <- tmp
}
original.x = x
x <- sort(x)
x.rope <- dplyr::between(x, interval[1], interval[2])
x <- x[which(x.rope == TRUE)]
r <- dplyr::between(x, ROPE[1], ROPE[2])
rope.pct = sum(r)/length(x)
rope.pct = data.frame(rope = rope.pct)
interval = as.data.frame(t(interval))
rope.pct$outside.rope = 1 - rope.pct$rope
colnames(rope.pct) <- c("inside", "outside")
.neff <- length(x)
result <- dplyr::case_when(interval$lower > ROPE[2] ~ "reject null",
interval$upper < ROPE[1] ~ "reject null", interval$lower >=
ROPE[1] & interval$upper <= ROPE[2] ~ "accept null",
TRUE ~ "undecided")
if (cred.level == .99999){
result = cbind.data.frame(rope.pct, odds = rope.pct$inside / rope.pct$outside,result, stringsAsFactors = FALSE)
names(result) = c("Inside ROPE", "Outside ROPE", "Interval Odds", "Decision")
} else {
result = cbind.data.frame(rope.pct, result, stringsAsFactors = FALSE)
names(result) = c("Inside ROPE", "Outside ROPE", "Decision")
}
if (plot == TRUE) {
xdf = data.frame(posterior.samples = as.vector(original.x))
ROPE = data.frame(t(ROPE))
plot <- xdf %>% ggplot2::ggplot(aes(x = posterior.samples,
y = ..density..)) +
stat_density(fill = density.color, alpha = .90, adjust = 3)
if (result$`Inside ROPE` == 0) {
plot <- plot + theme(legend.title = element_blank()) +
ggplot2::geom_vline(xintercept = H0, color = line.color,
linetype = "dotted", size = 1) +
geom_segment(aes(y = 0, yend = 0, xend = upper, x = lower),
data = interval, size = 2, color = interval.color) +
labs(x = "\u03B8")
}
else {
d <- ggplot2::ggplot_build(plot)$data[[1]]
plot <- plot +
geom_area(data = subset(d, x > ROPE$X1 & x < ROPE$X2), fill = rope.color, aes(x = x, y = y), alpha = 0.90) +
theme(legend.title = element_blank()) +
ggplot2::geom_vline(xintercept = H0, color = line.color,
linetype = "dotted", size = 1) +
geom_segment(aes(y = 0, yend = 0, xend = upper, x = lower), data = interval,size = 2, color = interval.color) +
labs(x = "\u03B8")
}
print(result)
plot
}
else {
return(result)
}
}
#' Convert MCMC chains to tidy for ggplot2
#'
#' @param mcmc.list. an MCMC list
#' @param family. group of parameters
#' with the same name but different numerical value
#' between square brackets (as beta[1], beta[2])
#' @param description. Character vector giving a short descriptive text that identifies the model.
#' @param par_labels. data frame with two colums. One named "Parameter"
#' with the same names of the parameters of the model.
#' @param sort. Logical. When TRUE (the default), parameters are sorted first by
#' family name and then by numerical value.
#' @param inc_warmup. Logical. When dealing with stanfit objects from rstan,
#' logical value whether the warmup samples are included. Defaults to FALSE.
#' @param stan_include_auxiliar. Logical value to include "lp__" parameter in rstan, and "lp__", "treedepth__" and
#' "stepsize__" in stan running without rstan. Defaults to FALSE.
#' @param droppars variables to exclude. Defaults to droppars = c("lp__","hs_c2","nu","sigma").
#' @export
#' @examples
#' tidyGg()
#'
tidyGg = function(GG, family. = NA, description. = NA, burnin. = FALSE, par_labels. = NA,
sort. = TRUE, inc_warmup. = FALSE, stan_include_auxiliar. = FALSE,
droppars = c("lp__","hs_c2","nu","sigma")) {
MCMC_Chain = function (s)
{
n.samples <- dim(s)[1]
iter <- 1:n.samples
d <- data.frame(Iteration = iter, as.matrix(unclass(s)),
check.names = FALSE)
D <- d %>% tidyr::gather(Parameter, value, -Iteration)
D <- dplyr::tbl_df(D)
return(D)
}
MCMC.Sort = function (x)
{
x <- sort(as.character(unique(x)))
family <- gsub("\\[.+\\]", "", x)
Families <- sort(unique(family))
X <- NULL
for (f in Families) {
x.family <- x[family == f]
if (length(grep("\\[", x.family)) > 0) {
index <- gsub("]", "", gsub("(.+)\\[", "", x.family))
if (length(grep(",", index) > 0)) {
idl <- data.frame(x.family = x.family, index = index,
matrix(unlist(strsplit(index, ",")), nrow = length(index),
byrow = TRUE))
for (c in 3:(dim(idl)[2])) {
idl[, c] <- as.numeric(as.character(idl[, c]))
}
command <- paste("dplyr::arrange(idl,", paste(names(idl)[-(c(1,
2))], collapse = ","), ")", sep = "")
idl <- eval(parse(text = command))
x.family <- as.character(idl$x.family)
}
else {
x.family <- x.family[order(as.numeric((gsub("]",
"", gsub("(.+)\\[", "", x.family)))))]
}
X <- c(X, x.family)
}
else {
X <- c(X, x.family)
}
}
return(X)
}
MCMCtoggplot = function (S, family = family., description = description., burnin = burnin., par_labels = par_labels.,
sort = sort., inc_warmup = inc_warmup., stan_include_auxiliar = stan_include_auxiliar.)
{
processed <- FALSE
original.object.class <- class(S)[1]
if (length(class(S)) > 1 & class(S)[1] == "stanreg") {
S <- S$stanfit
}
if (class(S) == "brmsfit") {
S <- S$fit
}
if (class(S) == "stanfit") {
nChains <- S@sim$chains
nThin <- S@sim$thin
mDescription <- S@model_name
D <- NULL
for (l in 1:nChains) {
sdf <- as.data.frame(S@sim$samples[[l]])
names(sdf) <- names(S@sim$samples[[l]])
sdf$Iteration <- 1:dim(sdf)[1]
s <- tidyr::gather(sdf, Parameter, value, -Iteration) %>%
dplyr::mutate(Chain = l) %>% dplyr::select(Iteration,
Chain, Parameter, value)
D <- dplyr::bind_rows(D, s)
}
if (!inc_warmup) {
if (original.object.class == "stanfit") {
D <- dplyr::filter(D, Iteration > (S@sim$warmup/nThin))
D$Iteration <- D$Iteration - (S@sim$warmup/nThin)
}
nBurnin <- S@sim$warmup
}
else {
nBurnin <- 0
}
if (!stan_include_auxiliar) {
D <- dplyr::filter(D, Parameter != "lp__")
}
if (sort) {
D$Parameter <- factor(D$Parameter, levels = MCMC.Sort(D$Parameter))
}
else {
D$Parameter <- factor(D$Parameter)
}
processed <- TRUE
D <- dplyr::tbl_df(D)
}
if (class(S) == "list") {
D <- NULL
for (i in 1:length(S)) {
samples.c <- dplyr::tbl_df(read.table(S[[i]], sep = ",",
header = TRUE, colClasses = "numeric", check.names = FALSE))
D <- dplyr::bind_rows(D, tidyr::gather(samples.c,
Parameter) %>% dplyr::mutate(Iteration = rep(1:(dim(samples.c)[1]),
dim(samples.c)[2]), Chain = i) %>% dplyr::select(Iteration,
Chain, Parameter, value))
}
if (!stan_include_auxiliar) {
D <- D[grep("__$", D$Parameter, invert = TRUE), ]
if (sort) {
D$Parameter <- factor(D$Parameter, levels = MCMC.Sort(D$Parameter))
}
else {
D$Parameter <- factor(D$Parameter)
}
}
nBurnin <- as.integer(gsub("warmup=", "", scan(S[[i]],
"", skip = 12, nlines = 1, quiet = TRUE)[2]))
nThin <- as.integer(gsub("thin=", "", scan(S[[i]], "",
skip = 13, nlines = 1, quiet = TRUE)[2]))
processed <- TRUE
}
if (class(S) == "mcmc.list" | class(S) == "mcmc" | processed) {
if (!processed) {
lS <- length(S)
D <- NULL
if (lS == 1 | class(S) == "mcmc") {
if (lS == 1 & class(S) == "mcmc.list") {
s <- S[[1]]
}
else {
s <- S
}
D <- dplyr::mutate(MCMC_Chain(s), Chain = 1) %>%
dplyr::select(Iteration, Chain, Parameter,
value)
nBurnin <- (attributes(s)$mcpar[1]) - (1 * attributes(s)$mcpar[3])
nThin <- attributes(s)$mcpar[3]
}
else {
for (l in 1:lS) {
s <- S[l][[1]]
D <- dplyr::bind_rows(D, dplyr::mutate(MCMC_Chain(s),
Chain = l))
}
D <- dplyr::select(D, Iteration, Chain, Parameter,
value)
nBurnin <- (attributes(s)$mcpar[1]) - (1 * attributes(s)$mcpar[3])
nThin <- attributes(s)$mcpar[3]
}
if (sort) {
D$Parameter <- factor(D$Parameter, levels = MCMC.Sort(D$Parameter))
}
else {
D$Parameter <- factor(D$Parameter)
}
D <- dplyr::arrange(D, Parameter, Chain, Iteration)
}
attr(D, "nChains") <- length(unique(D$Chain))
attr(D, "nParameters") <- length(unique(D$Parameter))
attr(D, "nIterations") <- max(D$Iteration)
if (is.numeric(burnin) & length(burnin) == 1) {
attr(D, "nBurnin") <- burnin
}
else if (is.logical(burnin)) {
if (burnin) {
attr(D, "nBurnin") <- nBurnin
}
else {
attr(D, "nBurnin") <- 0
}
}
else {
stop("burnin must be either logical (TRUE/FALSE) or a numerical vector of length one.")
}
attr(D, "nThin") <- nThin
if (is.character(description)) {
attr(D, "description") <- description
}
else {
if (!is.na(description)) {
print("description is not a text string. The name of the imported object is used instead.")
}
if (exists("mDescription")) {
attr(D, "description") <- mDescription
}
else {
attr(D, "description") <- as.character(sys.call()[2])
}
}
if (!is.na(family)) {
D <- get_family(D, family = family)
}
if (class(par_labels) %in% c("data.frame", "tbl_df")) {
if (length(which(c("Parameter", "Label") %in% names(par_labels))) ==
2) {
aD <- attributes(D)
levels(D$Parameter)[which(levels(D$Parameter) %in%
par_labels$Parameter)] <- as.character(par_labels$Label[match(levels(D$Parameter)[which(levels(D$Parameter) %in%
par_labels$Parameter)], par_labels$Parameter)])
L <- dplyr::tbl_df(data.frame(Parameter = par_labels$Label,
ParameterOriginal = par_labels$Parameter)) %>%
mutate(Parameter = as.character(Parameter))
D <- suppressWarnings(dplyr::left_join(D, L,
by = "Parameter"))
D <- D %>% dplyr::select(Iteration, Chain, Parameter,
value, ParameterOriginal)
if (class(D$Parameter) == "character") {
if (sort) {
D$Parameter <- factor(D$Parameter, levels = MCMC.Sort(D$Parameter))
}
else {
D$Parameter <- factor(D$Parameter)
}
}
attr(D, "nChains") <- aD$nChains
attr(D, "nParameters") <- aD$nParameters
attr(D, "nIterations") <- aD$nIterations
attr(D, "nBurnin") <- aD$nBurnin
attr(D, "nThin") <- aD$nThin
attr(D, "description") <- aD$description
if (dim(par_labels)[2] > 2) {
aD <- attributes(D)
L.noParameter <- dplyr::tbl_df(par_labels) %>%
dplyr::select(-Parameter) %>% dplyr::mutate(Label = as.character(Label))
D <- suppressWarnings(dplyr::left_join(D, L.noParameter,
by = c(Parameter = "Label")))
if (class(D$Parameter) == "character") {
if (sort) {
D$Parameter <- factor(D$Parameter, levels = MCMC.Sort(D$Parameter))
}
else {
D$Parameter <- factor(D$Parameter)
}
}
}
attr(D, "nChains") <- aD$nChains
attr(D, "nParameters") <- aD$nParameters
attr(D, "nIterations") <- aD$nIterations
attr(D, "nBurnin") <- aD$nBurnin
attr(D, "nThin") <- aD$nThin
attr(D, "description") <- aD$description
}
else {
stop("par_labels must include at least columns called 'Parameter' and 'Label'.")
}
}
else {
if (!is.na(par_labels)) {
stop("par_labels must be a data frame or a tbl_df.")
}
}
return(D)
}
else {
stop(cat(bold(red(("I'm sory Dave. I afraid I can't do that.")))))
}
}
fit.gg = MCMCtoggplot(GG)
fit.gg = fit.gg[!fit.gg$Parameter %in% droppars , ]
return(fit.gg)
}
#' Calculate R2 for MCMC or Bootstrap Regression Samples
#'
#' Calculate R2 for MCMC or Bootstrap Regression Samples
#'
#' @param formula a formula specifying the model.
#' @param data the data
#' @param the samples
#' @export
#' @examples
#' MCMC_R2()
#'
MCMC_R2 = function(formula, data, model.fit){
model.fit = as.matrix(model.fit)
y = which(colnames(data)==as.character(formula)[[2]])
y = data[,y]
Xmat = model.matrix(formula, data)
model = extract_post(model.fit, droppars = c("log_lik", "lp__", "nu", "sigma", "skew", "invsigma2", "lambda"))
wch = colnames(model)
coefs = model[, wch]
fit = as.data.frame(as.matrix(coefs) %*% as.matrix(t(Xmat)))
resid = sweep(fit, 2, as.vector(as.matrix(y)), "-")
var_f = apply(fit, 1, var)
var_e = apply(resid, 1, var)
R2 = var_f/(var_f + var_e)
return(R2)
}
#' Get posterior distributions for predictions from abdiststan fits
#'
#' Generate Posterior Predictive Distributions for Regression Models
#'
#' @param fit the MAPfit or abdiststanfit object
#' @param newdata the data
#' @export
#' @examples
#' post_pred()
#'
post_pred = function(fit, newdata){
formula = attr(fit, "formula")
xmat = model.matrix(formula, newdata)
samples = abdisttools::extract_post(fit)[,1:ncol(xmat)]
t(apply(samples, 1, function(z) as.vector((z %*% t(xmat)))))
}
#' Get point predictions from MAPfit or abdiststanfit models
#'
#' Predict function for abdiststanfit or MAPfit models. If another type of object is accidentally passed to
#' this function it will attempt to pass the object to the generic predict method.
#'
#' @param fit the MAPfit or abdiststanfit object
#' @param newdata the data
#' @export
#' @examples
#' post_linpred()
#'
post_linpred = function(fit, newdata){
if (isTRUE("abdiststanfit" %in% attr(fit, "class"))) {
formula = attr(fit, "formula")
xmat = model.matrix(formula, newdata)
coefs = abdisttools::post_summary(fit, estimate.method = "median")$estimate
coefs = coefs[1:ncol(xmat)]
as.vector((coefs %*% t(xmat)))
} else if (isTRUE("MAPfit" %in% attr(fit, "class"))) {
formula = attr(fit, "formula")
xmat = model.matrix(formula, newdata)
coefs = as.vector(fit)[1:ncol(xmat)]
as.vector((coefs %*% t(xmat)))
} else {
predict(fit, newdata)
}
}
#' Convert Cohen's d or Hedge's g to CLES
#'
#' Convert Cohen's d or Hedge's g to CLES
#'
#' @param x a numeric vector containing 1 or more cohen's d/hedge's g estimates
#' @param the samples
#' @export
#' @examples
#' dtoCLES()
#'
dtoCLES = function(x){pnorm(0, x/sqrt(2), 1, lower.tail = FALSE)}
#' Calculate empirical/non-parametric CLES
#'
#' Calculate empirical/non-parametric CLES
#'
#' @param formula a formula specifying the model.
#' @param data the data
#' @export
#' @examples
#' post_check()
#'
empirical.cles = function(formula, data){
cles <- function(variable, group, baseline, data) {
# Select the observations for group 1
x <- data[data[[group]] == baseline, variable]
# Select the observations for group 2
y <- data[data[[group]] != baseline, variable]
# Matrix with difference between XY for all pairs (Guillaume Rousselet's suggestion)
m <- outer(x,y,FUN="-")
# Convert to booleans; count ties as half true.
m <- ifelse(m==0, 0.5, m>0)
# Return proportion of TRUEs
qxly <- mean(m)
return(qxly)
}
data = as.data.frame(data)
variable = as.character(formula[2])
group = as.character(formula[3])
data$Variable = as.numeric(as.matrix(data)[,which(colnames(as.matrix(data))==variable)])
data$Group <- as.numeric(as.factor(as.matrix(data[,which(colnames(data)==as.character(group))])[,1]))
cles(variable = "Variable", group = "Group", baseline = 1, data = data)
}
#' Bootstrap empirical/non-parametric CLES
#'
#' Bootstrap empirical/non-parametric CLES
#'
#' @param formula a formula specifying the model.
#' @param data the data
#' @param iter defaults to 10000
#' @export
#' @examples
#' post_check()
#'
bootstrap.cles = function(formula, data, iter=10000){
data = as.data.frame(data)
Data = matrix(nrow=nrow(data), ncol = 2)
variable = as.character(formula[2])
group = as.character(formula[3])
Data[,1] <- as.numeric(as.matrix(data)[,which(colnames(as.matrix(data))==variable)])
Data[,2] <- as.numeric(as.factor(as.matrix(data[,which(colnames(data)==as.character(group))])[,1]))
colnames(Data) = c("Variable", "Group")
Data = as.data.frame(Data)
CLES <- function(variable, group, baseline, data) {
# Select the observations for group 1
x <- data[data[[group]] == baseline, variable]
# Select the observations for group 2
y <- data[data[[group]] != baseline, variable]
# Matrix with difference between XY for all pairs (Guillaume Rousselet's suggestion)
m <- outer(x,y,FUN="-")
# Convert to booleans; count ties as half true.
m <- ifelse(m==0, 0.5, m>0)
# Return proportion of TRUEs
qxly <- mean(m)
return(qxly)
}
Cles = vector()
for (i in 1:iter){
Cles[i] <- CLES(variable = "Variable", group = "Group", baseline = 1,data=dplyr::sample_n(Data, size=nrow(Data), replace=TRUE))
}
return(Cles)
}
#' Elicit a prior distribution
#'
#' Prior elicitation
#'
#' @param coins how many coins are you willing to invest? Defaults to 100.
#' @param bins how many bins should the sample space be divided into?
#' @param min.value what is the smallest number you expect to be possible for your sample space?
#' @param max.value what is the largest number you expect to be possible for your sample space?
#' @param distribution one of "gaussian", "beta", "half-t", or "gamma"
#' @export
#' @examples
#' prior.elicit()
#'
prior.elicit = function (coins=100, bins=9, min.value, max.value, distribution="gaussian")
{
n = coins
cats = round(seq(min.value, max.value, length.out = bins), 3)
cat.names = as.character(cats)
cat.names = c(sapply(cat.names[1:(length(cat.names)-1)], function(x) paste(x, ":")), paste(":", cat.names[length(cat.names)]))
if (missing(n))
stop("The n argument is required.")
if (missing(cats))
stop("The cats argument is required.")
if (missing(cat.names))
stop("The cat.names argument is required.")
if (!identical(length(cats), length(cat.names)))
stop("Different lengths found for cats and cat.names.")
cat.labels <- letters[1:length(cats)]
cat(crayon::red("\n\n You will be asked two questions until all coins are invested:"))
cat(crayon::green("\n\n1. How much would you like to invest"))
cat(crayon::magenta("\n2. In which bin do you wish to invest? \n"))
cat(crayon::bold(crayon::cyan("\n Each bin represents the endpoint of a range. The first number, for example, might represent -Inf:100.
\n The last number might represent the range 100:Inf, in the context of an unbounded continuous distribution. \n")))
cat("\nbins:", cat.names)
cat("\nbin ID:", cat.labels, "\n\n")
readline("Press Enter/Return to start: ")
while (n > 0) {
cat("\n\nYou have", n, "coins remaining.\n")
N <- 0
while ((N <= 0)) N <- readline("How much would you like to invest?")
N <- as.numeric(N)
cat("\nIn which bin do you wish to invest?\n")
cat("\nbins:", cat.names)
cat("\nbin Entry:", cat.labels, "\n\n")
answer <- "Bayes"
while (all(cat.labels != answer)) answer <- readline("bin: ")
pos <- which(cat.labels == answer)
if (!exists("out"))
out <- rep(cats[pos], N)
else out <- c(out, rep(cats[pos], N))
bar.out = as.data.frame(table(out))
colnames(bar.out) <- c("Bin", "Coins")
bar = ggpubr::ggbarplot(bar.out, x="Bin", y="Coins", fill="skyblue2")
plot(bar)
n <- n - N
}
if (distribution=="gaussian") {
out = out
dist <- c(median(out), sd(out))
df <- fitdistrplus::fitdist(as.vector(scale(out)), distr="t", start = list(df=6), method="mge", gof="KS")$estimate[1]
df = round(df, 0)
t <- ggpubr::ggdensity(brms:::rstudent_t(5000, df, mu=dist[1], sigma=dist[2]), title="Student-T") + theme_get()
cat(crayon::underline(crayon::bold(crayon::green("Student T approximation:\n"))))
cat(crayon::bold(crayon::green("\nThe parameters that best describe your prior beliefs are:", "\ndf=", df,
"\nmean=", round(dist[1],2), "\nsd=", round(dist[2],1))))
cat(crayon::bold(crayon::green("\nThe approximate quantiles of your prior are:\n")))
print(round(quantile(brms:::rstudent_t(5000, df, mu=dist[1], sigma=dist[2]), c(.025, .05, .25, .5, .75, .95, .975)),2))
cat(crayon::underline(crayon::bold(crayon::blue("Normal approximation:\n"))))
dist <- MASS::fitdistr(out, "normal")$estimate
n <- ggpubr::ggdensity(rnorm(5000, dist[1], dist[2]), title="Normal") + theme_get()
cat(crayon::bold(crayon::blue("\nThe parameters that best describe your prior beliefs are:",
"\nmean=", round(dist[1],2), "\nsd=", round(dist[2],1))))
cat(crayon::bold(crayon::blue("\nThe approximate quantiles of your prior are:\n")))
print(round(quantile(rnorm(5000, dist[1], dist[2]), c(.025, .05, .25, .5, .75, .95, .975)),2))
plot(ggpubr::ggarrange(t, n, nrow=2, ncol=1))
}
if (distribution=="beta") {
dist <- fitdistrplus::fitdist(out, distr ="beta",
start = list(shape1= .01, shape2 = .01), lower=0.001, method="mme")$estimate
plot(ggpubr::ggdensity(rbeta(400000, dist[1], dist[2])) + theme_get())
cat(crayon::bold(crayon::magenta("The parameters that best describe your prior beliefs are:",
"\nshape1=", round(dist[1],3), "\nshape2=", round(dist[2],3))))
cat(crayon::bold(crayon::magenta("\nThe approximate quantiles of your prior are:\n")))
print(round(quantile(rbeta(200000, dist[1], dist[2]), c(.025, .05, .25, .5, .75, .95, .975)),2))
}
if (distribution=="half-t") {
out = out
dist <- c(median(out), sd(out))
df <- fitdistrplus::fitdist(as.vector(scale(out)), distr="t", start = list(df=df.start), method="mge", gof="AD")$estimate[1]
df = round(df, 0)
t <- ggpubr::ggdensity(abs(brms:::rstudent_t(5000, df, mu=dist[1], sigma=dist[2])), title="Student-T") + theme_get()
cat(crayon::underline(crayon::bold(crayon::cyan("Half-Student T approximation:\n"))))
cat(crayon::bold(crayon::cyan("\nThe parameters that best describe your prior beliefs are:", "\ndf=", df,
"\nmean=", round(dist[1],2), "\nsd=", round(dist[2],1))))
cat(crayon::bold(crayon::cyan("\nThe approximate quantiles of your prior are:\n")))
print(round(quantile(abs(brms:::rstudent_t(5000, df, mu=dist[1], sigma=dist[2])), c(.025, .05, .25, .5, .75, .95, .975),2)))
plot(t)
}
if (distribution=="gamma") {
out = out
dist <- fitdistrplus::fitdist(out, "gamma", method="mme")$estimate
g <- ggpubr::ggdensity(abs(rgamma(5000, dist[1], dist[2])), title="Gamma")+ theme_get()
gamma <- crayon::make_style(rgb(.09 , .89, .53, 1))
cat(crayon::underline(crayon::bold(gamma("Gamma approximation:\n"))))
cat(crayon::bold(gamma("\nThe parameters that best describe your prior beliefs are:",
"\nshape", round(dist[1],2), "\nrate=", round(dist[2],1))))
cat(crayon::bold(gamma("\nThe approximate quantiles of your prior are:\n")))
print(round(quantile(abs(rgamma(5000, dist[1], dist[2])), c(.025, .05, .25, .5, .75, .95, .975),2)))
plot(g)
}
out
}
#' Calibrate a personal probability with the deFinetti Game!
#'
#' Prior elicitation
#'
#' @param coins State a personal probability, ie, "I think there's a p=.80 chance it will rain today"
#' @export
#' @examples
#' deFinettiGame()
#'
deFinettiGame = function (probability)
{
if (missing(probability))
stop("The probability argument is required.")
if ((probability <= 0) | (probability > 1))
stop("The probability must be between 0 and 1")
ques <- paste("\nDescribe a hypothesis (such as ``There is an owl in that tree''): ")
event <- readline(crayon::red(ques))
region <- c(0, 1)
while ((region[2] - region[1]) > probability) {
x <- round(mean(region) * 100)
y <- 100 - x
cat(crayon::green(crayon::bold("\nYou have two options:")))
cat(crayon::cyan("\n\n1.Wait and win a payout of $1 if the event happens."))
cat(crayon::magenta("\n2.Draw a marble from a bucket with", x, "purple marbles and",
y, "orange marbles. Drawing a purple", "marble results in a payout of $1."))
ans <- readline(cat(crayon::red(crayon::bold("\n\nChoose Option 1 or Option 2: "))))
if (ans == 1)
region[1] <- x/100
else if (ans == 2)
region[2] <- x/100
else region <- c(0, 0)
}
if (sum(region) == 0)
cat("\nTry again. Valid answers are 1 or 2.")
else {
cat("\n\nYour subjective probability is in the interval [",
region[1], ",", region[2], "] of ", event,
".\n\n", sep = "")
}
return(region)
}
#' Bayesian Binomial & Negative Binomial Test
#'
#' Estimate a probability that has a bernoulli likelihood
#'
#' @param z number of successes
#' @param N number of trials
#' @param a.prior Prior for first shape parameter
#' @param b.prior Prior for second shape parameter
#' @param cred.level Credibility level. Defaults to .975
#' @param console Should it print to console? Defaults to TRUE
#' @param return.samples Should it return samples of the prior and posterior? Defaults to FALSE
#' @export
#' @examples
#' bern.test()
#'
bern.test = function(z, N, a.prior=1, b.prior=1, cred.level=.975, return.samples=FALSE, console=TRUE){
scientific_10x <- function(values, digits = 1) {
if(!is.numeric(values)){
stop("values must be numbers")
}
if(grepl("^\\d{2}$", digits)){
stop("digits must a one or two digit whole number")
}
x <- sprintf(paste0("%.", digits, "e"), values)
x <- gsub("^(.*)e", "\\1e", x)
longestExponent <- max(sapply(gregexpr("\\d{1,}$", x), attr, 'match.length'))
zeroTrimmed <- ifelse(longestExponent > 2,
paste0("\\1", paste(rep("~", times = longestExponent-1), collapse = "")),
"\\1")
x <- gsub("(e[+|-])[0]", zeroTrimmed, x)
x <- gsub("e", "*10^", x)
if(any(grepl("\\^\\-", x))){
x <- gsub("\\^\\+", "\\^~~", x)
} else {
x <- gsub("\\^\\+", "\\^", x)
}
# return this as an expression
parse(text=x)
}
estimate = (z + a.prior) / (N + a.prior + b.prior)
alpha1 = z + a.prior
beta1 = N - z + b.prior
set.seed(123)
posteriorA = rbeta(42500/2,alpha1, beta1)
set.seed(1423)
posteriorB = rbeta(42500/2,alpha1, beta1)
posterior = c(posteriorA, posteriorB)
set.seed(123)
priorA = rbeta(42500/2, a.prior, b.prior)
set.seed(1423)
priorB = rbeta(42500/2, a.prior, b.prior)
prior = c(priorA, priorB)
if(length(qbeta(seq(0,1, by=.00001)[which(dbeta(seq(0,1, by=.00001), a.prior, b.prior)==max(dbeta(seq(0,1, by=.00001), a.prior, b.prior)))], a.prior, b.prior)) > 1) {
prior.mean = qbeta(.5, a.prior, b.prior)
} else {
prior.mean = qbeta(seq(0,1, by=.00001)[which(dbeta(seq(0,1, by=.00001), a.prior, b.prior)==max(dbeta(seq(0,1, by=.00001), a.prior, b.prior)))], a.prior, b.prior)
}
prior.dens = dbeta(prior.mean, a.prior, b.prior)
posterior.dens = dbeta(prior.mean, alpha1, beta1)
if (console==TRUE) {
cat(crayon::red(crayon::underline("\nEstimated Prob. of Success\n")))
cat(crayon::red(round(estimate,3)))
cat(crayon::blue(crayon::underline("\n",paste(100*cred.level, "%"), "Lowest Posterior Loss Interval\n")))
cat(crayon::blue(round(abdisttools:::lpl_interval(prior, posterior, cred.level=cred.level)[-3],3)))
cat(crayon::magenta(crayon::underline("\n",paste(100*cred.level, "%"), "Exact Equal Tailed Interval\n")))
lower = qbeta((1 - cred.level )/ 2, alpha1, beta1)
upper = qbeta(cred.level+((1 - cred.level )/ 2), alpha1, beta1)
ETI = c(lower, upper)
cat(crayon::magenta(round(ETI, 3)))
BF10 = prior.dens/posterior.dens
if (5000 <= BF10 || BF10 <=.0001) {
BF10 = scientific_10x(BF10, digits=2)
}
BF01 = 1 / (prior.dens/posterior.dens )
if (5000 <= BF01 || BF01 <=.0001) {
BF01 = scientific_10x(BF01, digits=2)
}
BF = c(BF01, BF10)
cat(crayon::green(crayon::underline("\nHypothesis Tested: Prob. of Success=", round(prior.mean,3), "\n")))
cat(crayon::green(crayon::underline("\nBayes Factor:\n")))
cat(crayon::green(c("BF01", as.character(BF01) , "\n")))
cat(crayon::green(c("BF10", as.character(BF10) , "\n")))
}
if (return.samples == TRUE)
{
samples = cbind.data.frame(posterior = posterior, prior = prior)
return(samples)
}
}
#' Variance Inflation Factor Plot
#'
#' Calculate the VIF for a least squares or generalized linear model.
#'
#' @param formula the formula
#' @param data the data
#' @param family the glm family. Defaults to "gaussian"
#' @export
#' @examples
#' VIF()
#'
VIF = function(formula, data, family="gaussian"){
model = glm(formula=formula,data=data,family=family)
vifs = car::vif(model)
if (isTRUE(is.vector(vifs))) {
vifs = as.data.frame(vifs)
colnames(vifs) = "GVIF"
}
vifs %>% as.data.frame() %>% rownames_to_column() %>% mutate(Variable=as.factor(rowname)) %>%
ggplot(aes(y=Variable, x=GVIF, fill=GVIF)) +
geom_colh(fill="steelblue") +
geom_vline(xintercept=5, linetype="dashed") +
guides(fill=FALSE)
}
#' Posterior Predictive Checks for abdiststan models
#'
#' Calculate the VIF for a least squares or generalized linear model.
#'
#' @param out the model fit
#' @param y.obs the observed outcome variable.
#' @param reps Defaults to 100.
#' @param pred.name the name of the simulations
#' @param y.obs.color the color of the density curve for the observed data (suggest black or white
#' to contrast with the assortment of colors used for the simulations)
#' @export
#' @examples
#' post_check()
#'
post_check = function(out, y.obs, reps = 100, pred.name = "ySim", y.obs.color = "white") {
pp = extract_post_pred(out, predname = pred.name)
N = nrow(pp)
wch = sample(1:N, size = reps)
y.obs = data.frame(Y.obs = y.obs)
y.sims = as.data.frame(pp)
y.sims = t(y.sims[wch, ])
y.sims = as.data.frame(y.sims)
y.sims = y.sims %>% rownames_to_column()
y.sims = gather(y.sims, key = "rowname")
g = ggplot(y.sims, aes(x = value, color = rowname), linetype = 4, alpha = 0) +
geom_density(trim = FALSE) +
geom_density(data = y.obs, aes(x = Y.obs), color = y.obs.color, size = 1.2, linetype = 1, trim = FALSE, inherit.aes = FALSE) +
guides(color = FALSE) +
ggtitle(label = "\nPosterior Predictive Check\n")
plot(g)
}
#' Binomial Confusion matrix for classification problems
#'
#' An alternative to the caret package's confusion matrix function.
#'
#' @param predictions the predicted classes.
#' @param actual the real outcome variable.
#' @export
#' @examples
#' confusion.matrix()
#'
confusion.matrix = function(predictions, actual) {
table = table(predictions, actual)
table = as.data.frame.matrix(table)
A = table[1,1]
B = table[1,2]
C = table[2,1]
D = table[2,2]
MCC = ((A*D) - (C* B)) / sqrt((A+C)*(A+B)*(D+C)*(D+B))
Accuracy = (A+D)/(A+B+C+D)
NIR = max((A+C)/(A+B+C+D) , 1 - (A+C)/(A+B+C+D))
Prevalence = (A+C)/(A+B+C+D)
a.prior = 0.5
b.prior = 0.5
z = (A+D)
N = (A+B+C+D)
estimate.accuracy = (z + a.prior) / (N + a.prior + b.prior)
alpha1 = z + a.prior
beta1 = N - z + b.prior
lower = qbeta((1 - .90 )/ 2, alpha1, beta1)
upper = qbeta(.90 +((1 - .90 )/ 2), alpha1, beta1)
p = pbeta(NIR, alpha1, beta1, lower.tail = FALSE)
prior.dens = dbeta(NIR, .5, .5)
posterior.dens = dbeta(NIR, alpha1, beta1)
BF = prior.dens/posterior.dens
Sensitivity = A/(A+C)
Specificity = D/(B+D)
PPV = (Sensitivity * Prevalence)/((Sensitivity*Prevalence) + ((1-Specificity)*(1-Prevalence)))
NPV = (Specificity * (1-Prevalence))/(((1-Sensitivity)*Prevalence) + ((Specificity)*(1-Prevalence)))
FDR = 1-PPV
FNDR = 1-NPV
matrix0 = matrix(0, nrow=13, ncol=3)
matrix0[1,] <- c("Classification", paste0("True", levels(actual)[1]), paste0("True", levels(actual)[2]))
matrix0[c(2,3),1] <- c(levels(actual)[1], levels(actual)[2])
matrix0[2,2] = A
matrix0[2,3] = B
matrix0[3,2] = C
matrix0[3,3] = D
matrix0[4,] = c("", "", "")
matrix0[5,] = c("Accuracy", round(Accuracy, 4), "")
matrix0[6,] = c("90% Credible Interval", round(lower,4), round(upper,4))
matrix0[7,] = c("No Information Rate", round(NIR,3), " ")
matrix0[8,] = c("Prob(Acc > NIR | Data)", round(p,4), paste0("(BF=", round(BF,4), ")"))
matrix0[9,] = c("Sensitivity", round(Sensitivity,4), " ")
matrix0[10,] = c("Specificity", round(Specificity,4), " ")
matrix0[11,] = c("False Discovery Rate", round(FDR,4), " ")
matrix0[12,] = c("False Nondiscovery Rate", round(FNDR,4), " ")
matrix0[13,] = c(paste0("Matthew's Correlation"," ", "(" ,noquote("\u3d5"), ")"), round(MCC, 4), paste0(noquote(paste0("\u3d5", "\u00B2")), "=", round(MCC^2, 4)))
knitr::kable(as.tibble(matrix0), col.names = c("","",""))
}
#' Input desired mean and sd to obtain corresponding gamma parameters
#'
#' This function makes it easy to get the parameters of a gamma distribution
#' such that they have a desired mode and standard deviation.
#'
#' @param mean the desired mean
#' @param sd the desired sd
#' @export
#' @examples
#' gamma_params()
#'
gamma_params = function (mean, sd, plot = FALSE)
{
ra = (mean)/(sd^2)
sh = (mean^2)/(sd^2)
params = c(sh, ra)
names(params) = c("shape", "rate")
if (plot == TRUE) {
gamma = data.frame(gamma = rgamma(25000, sh, ra))
gg = ggplot(data = gamma, aes(x = gamma)) +
stat_density(alpha = 0.25, size = 0.7, adjust = 10, bw = "SJ", kernel = "cosine",
n = 8192, trim = FALSE, fill = "#00ed7e", color = "#00ed7e")
plot(gg)
}
return(params)
}
#' Input desired mode and sd to obtain corresponding inverse gamma parameters
#'
#' This function makes it easy to get the parameters of an inverse gamma distribution
#' such that they have a desired mode and standard deviation.
#'
#' @param mode the mode
#' @param sd the sd
#' @export
#' @examples
#' inv_gamma_params()
#'
#'
inv_gamma_params = function(mode, sigma, plot = FALSE) {
rinvgamma =
function (n, shape, scale = 1)
{
return(1/rgamma(n = n, shape = shape, rate = scale))
}
sigma2 = sigma^2
mu2 = mode^2
if(sigma^2 < Inf) {
shape = sqrt(2*(2+mode^2/sigma^2))
shape2 = 2*shape
shape1 = 2
err = 2*mode^2*gamma(shape/2)^2-(sigma^2+mode^2)*(shape-2)*gamma((shape-1)/2)^2
while(abs(shape2-shape1) > 1e-12) {
if(err > 0) {
shape1 = shape
if(shape < shape2) {
shape = shape2
} else {
shape = 2*shape
shape2 = shape
}
} else {
shape2 = shape
}
shape = (shape1+shape2)/2
err = 2*mode^2*gamma(shape/2)^2-(sigma^2+mode^2)*(shape-2)*gamma((shape-1)/2)^2
}
rate = (sigma^2+mode^2)*(shape-2)
} else {
shape = 2
rate = 2*mode^2/pi
}
params = c(shape=shape, rate=rate)
if (plot == TRUE) {
gamma = data.frame(gamma = rinvgamma(20000, shape, rate))
gg = ggplot(data = gamma, aes(x = gamma)) + stat_density(alpha = 0.25,
size = 0.7,
adjust = 3,
bw = "SJ",
kernel = "epanechnikov",
n = 2048,
trim = FALSE,
fill = "#00e6ed",
color = "#00e6ed") +
xlab(label = "inverse gamma")
plot(gg)
}
return(params)
}
#' Input desired mode and sd to obtain corresponding beta parameters
#'
#' This function makes it easy to get the parameters of a beta distribution
#' such that they have a desired mode and standard deviation.
#'
#' @param mean the mean
#' @param sd the sd
#' @export
#' @examples
#' beta_params()
#'
beta_params = function( mean , sd , plot=FALSE) {
if ( mean <=0 | mean >= 1) stop("must have 0 < mean < 1")
if ( sd <= 0 ) stop("sd must be > 0")
kappa = mean*(1-mean)/sd^2 - 1
if ( kappa <= 0 ) stop("invalid combination of mean and sd")
a = mean * kappa
b = ( 1.0 - mean ) * kappa
params = c(a, b)
names(params) = c("alpha", "beta")
if (plot==TRUE) {
beta = data.frame(beta = rbeta(10000, a, b))
gg = ggplot(data = beta, aes(x = beta)) +
stat_density(alpha = 0.25,
size = .70,
adjust = 3,
bw = "SJ",
kernel = "epanechnikov",
n = 2048,
trim = FALSE,
fill = "#00bfff",
color = "#00bfff")
plot(gg)
}
return(params)
}
#' Extract posterior from a correlation matrix fit in stan
#'
#' @param fit the stan model
#' @param ... other arguments to pass to stan
#' @export
#' @examples
#' extract_cor_post()
#'
extract_cor_post = function(fit, ...){
post = extract_post(fit, ...)
wch = grep("cormat", x = colnames(post))
N = attr(fit, "P")
mat = matrix(0, N, N)
cors = post[,wch]
cors = cors[,which(lower.tri(mat) == TRUE)]
others = post[,-wch]
post = cbind.data.frame(cors, others)
return(post)
}
#' Extract prior from a correlation matrix fit in stan
#'
#' @param fit the stan model
#' @param ... other arguments to pass to stan
#' @export
#' @examples
#' extract_cor_prior()
#'
extract_cor_prior = function(fit, ...){
prior = extract_prior(fit, ...)
wch = grep("prior", x = colnames(prior))
N = attr(fit, "P")
mat = matrix(0, N, N)
cors = prior[,wch]
cors = cors[,which(lower.tri(mat) == TRUE)]
return(cors)
}
#' Special summary method for correlation matrices fit in Stan
#'
#' @param fit the stan model
#' @param digits number of digits to round to
#' @param bayes.factor calculate savage-dickey density ratios
#' @param estimate.method the point summary. defaults to "median"
#' @param cred.method Defaults to "ETI"
#' @param cred.level Defaults to .95
#' @param ... other arguments to pass to post summary
#' @export
#' @examples
#' cor_summary()
#'
cor_summary = function(fit, digits = 2, bayes.factor = TRUE, estimate.method = "mean", cred.method = "ETI", cred.level = .95, ...) {
post = extract_post(fit)
prior = extract_prior(fit)
wch = grep("cormat", x = colnames(post))
N = attr(fit, "P")
mat = matrix(0, N, N)
cors = post[,wch]
cors = cors[,which(lower.tri(mat) == TRUE)]
prior = prior[,which(lower.tri(mat) == TRUE)]
sum = post_summary(cors, estimate.method = estimate.method, cred.method = cred.method, cred.level = cred.level, ...)
#sum = sum %>% mutate_if(is.numeric, signif, digits)
if (bayes.factor == TRUE) {
BF = as.data.frame(t(sapply(1:ncol(prior), function(x) abdisttools:::bf_test(cors[, x], prior[, x], H0 = 0))))
BF = BF %>% mutate_if(is.numeric, signif, 4)
sum = cbind.data.frame(sum, BF)
sum = sum %>% transmute(term = term, estimate = estimate, std.error = std.error, cred.low = cred.low,
cred.high = cred.high, BF01 = BF01)
sum = as_tibble(sum)
}
return(sum)
}
#' Construct a point-estimated correlation matrix from a stan fit
#'
#' @param fit the stan model
#' @export
#' @examples
#' extract_cormat()
#'
extract_cormat = function(fit) {
P = attr(fit, "P")
post = extract_post(fit)
prior = extract_prior(fit)
wch = grep("cormat", x = colnames(post))
post = post[,wch]
estimates = colMeans(post)
matrix(estimates, P, P)
}
#' Concatenate lower and upper confidence interval endpoints
#'
#'
#' @param lower lower
#' @param uppper upper
#' @export
#' @examples
#' tidy_confint()
#'
tidy_confint = function(lower, upper) {
lu = cbind.data.frame(lower, upper)
sapply(1:nrow(lu), function(x) {paste("[", lu[x,1], "," , lu[x,2], "]")})
}
#' Compute Jeffrey's Distance between two distributions
#'
#' Returns the Jeffrey's Distance
#'
#' @param a the first distribution
#' @param b the second distribution
#' @param base exponent base, defaults to exp(1)
#' @export
#' @examples
#' jeffreys_dist()
jeffreys_dist = function (a, b, base = exp(1)) {
if (!is.vector(a))
a <- as.vector(a)
if (!is.vector(b))
b <- as.vector(b)
n1 <- length(a)
n2 <- length(b)
if (!identical(n1, n2))
stop("a and b must have the same length.")
if (any(!is.finite(a)) || any(!is.finite(b)))
stop("a and b must have finite values.")
if (any(a <= 0))
a <- exp(a)
if (any(b <= 0))
b <- exp(b)
a[which(a < .Machine$double.xmin)] <- .Machine$double.xmin
b[which(b < .Machine$double.xmin)] <- .Machine$double.xmin
a <- a/sum(a)
b <- b/sum(b)
kld.a.b <- a * (.5 * (log(a, base = base) - log(b, base = base)))
kld.b.a <- (b * (log(b, base = base) - log(a, base = base)))
sum.kld.a.b <- sum(kld.a.b)
sum.kld.b.a <- .5 * sum(kld.b.a)
sum.kld.a.b + sum.kld.b.a
}
#' Compute Jaynes' Psi between two distributions
#'
#' Returns the Jaynes' Psi divergence
#'
#' @param a the first distribution
#' @param b the second distribution
#' @export
#' @examples
#' jaynes_psi()
jaynes_psi = function (a, b) {
if (!is.vector(a))
a <- as.vector(a)
if (!is.vector(b))
b <- as.vector(b)
n1 <- length(a)
n2 <- length(b)
if (!identical(n1, n2))
stop("a and b must have the same length.")
if (any(!is.finite(a)) || any(!is.finite(b)))
stop("a and b must have finite values.")
if (any(a <= 0))
a <- exp(a)
if (any(b <= 0))
b <- exp(b)
a[which(a < .Machine$double.xmin)] <- .Machine$double.xmin
b[which(b < .Machine$double.xmin)] <- .Machine$double.xmin
a <- a/sum(a)
b <- b/sum(b)
dist = a * (log10(a) - log10(b))
dist = sum(dist)
N = 10*n1
N * dist
}
#' Get summary statistics
#'
#' Returns mean, sd, median, mad, skewness, excess kurtosis, min, max
#'
#' @param x the vector or data frame
#' @param na.rm TRUE
#' @param trim 0
#' @export
#' @examples
#' summary_stats()
summary_stats = function(x, ..., na.rm = TRUE, trim = 0) {
if (is.null(ncol(x))) {
x = data.frame(x = x)
}
XCSS.kurt =
function (x, na.rm = FALSE, ...)
{
if (na.rm)
x = x[!is.na(x)]
n = length(x)
if (is.integer(x))
x = as.numeric(x)
kurtosis = sum((x - mean(x))^4/as.numeric(var(x))^2)/length(x) - 3
kurtosis
}
ret <- tibble::tibble(
column = names(x), n = vapply(
X = x,
FUN = function(k) sum(!is.na(k)), FUN.VALUE = numeric(1)
),
mean = vapply(
X = x, FUN = mean, FUN.VALUE = numeric(1),
na.rm = na.rm
), sd = vapply(
X = x, FUN = stats::sd,
FUN.VALUE = numeric(1), na.rm = na.rm
),
median = vapply(
X = x,
FUN = stats::median, FUN.VALUE = numeric(1)
),
mad = vapply(
X = x,
FUN = stats::mad, FUN.VALUE = numeric(1), na.rm = na.rm
),
min = vapply(
X = x, FUN = min, FUN.VALUE = numeric(1),
na.rm = na.rm
),
max = vapply(
X = x, FUN = max, FUN.VALUE = numeric(1),
na.rm = na.rm
),
skew = vapply(
X = x, FUN = skewness,
FUN.VALUE = numeric(1), na.rm = na.rm
),
excess.kurtosis = vapply(
X = x,
FUN = XCSS.kurt, FUN.VALUE = numeric(1)
)
)
ret
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.