# Helper functions
# Exported functions ----
#' Check whether a list or object contains a named element
#'
#' @param x object or list
#' @param val character. Name of the object/list element to check for
#'
#' @examples
#' x <- list("a"=1, "b"=2)
#' contains(x, "a")
#'
#' @return TRUE if an element with that name is present, FALSE otherwise.
#' @export
contains <- function(x, val) {
return(val %in% names(x))
}
# Not exported -----
# Functions used to parse/check hypotheses -----
# Scale a vector of coefficients to unit measurement
# This creates scaled beta coefficients
scale_coef <- function(b, y_sd) {
b * sd(b) / y_sd
}
# Parse hypothesis
#' @importFrom stringr str_detect
#' @importFrom stringr str_split
#' @importFrom stringr str_extract
parse_hypothesis <- function(hypothesis_this) {
# Get from the global environment
operators_allowed <- getOption("blm_hypothesis_operators")$operators
# Check if operations allowed
detect_operator <- stringr::str_detect(hypothesis_this, operators_allowed)
# If no allowed operator is detected, error
if(!any(detect_operator)) {
stop(paste0("No allowed operators found in hypothesis. Allowed operators are '",
paste0(operators_allowed, collapse=", "), "'"))
}
# Which operators found?
# Split into: primary operators (>, <, =)
# Group operators (|, &)
pops <- operators_allowed[1:3][detect_operator[1:3]]
gops <- operators_allowed[4:5][detect_operator[4:5]]
# If multiple primary operators found but not & then ==> error
if((length(gops) == 0) & (length(pops) > 1)) {
stop("Multiple primary operators ('=', '<', '>') passed without a group operator ('&')")
}
# Check for '&' (multiple hypotheses)
if(any(gops == "\\&")) {
hyps <- stringr::str_split(hypothesis_this, "\\&") %>%
# Trim whitespace
lapply(trimws) %>%
# To list
unlist() %>%
as.list()
} else {
hyps <- list(hypothesis_this)
}
# For each sub-hypothesis, split into left- and right-hand sides at the primary operator
hyps_split <- hyps %>%
lapply(function(x) {
# Detect operator
pop_current <- stringr::str_extract(x, pops) %>%
na.omit(.) %>%
as.character()
# Split string
sp <- stringr::str_split(x, pop_current)[[1]] %>%
trimws()
# Check length
if(length(sp) > 2) {
stop(paste0("Too many elements in hypothesis: '", paste0(pop_current), "'"))
}
# Any numeric?
nums <- c()
for(i in sp) nums <- c(nums, suppressWarnings(!is.na(as.numeric(i))))
# Left and right parts
left <- ifelse(nums[1], as.numeric(sp[1]), sp[1])
right <- ifelse(nums[2], as.numeric(sp[2]), sp[2])
# Result
res <- list(
"operator" = pop_current,
"left" = list(
"expression" = left,
"abs_values" = stringr::str_detect(left, "\\|"),
"algebra" = stringr::str_detect(left, "\\*|\\-|\\+|\\/"),
"is_numeric" = nums[1]
) ,
"right" = list(
"expression" = right,
"abs_values" = stringr::str_detect(right, "\\|"),
"algebra" = stringr::str_detect(right, "\\*|\\-|\\+|\\/"),
"is_numeric" = nums[2]
))
# Parse parameter names
res$left$params <- parse_parameters(res$left$expression, res$left$abs_values, res$left$algebra,
res$left$is_numeric)
res$right$params <- parse_parameters(res$right$expression, res$right$abs_values, res$right$algebra,
res$right$is_numeric)
# If absolute values, replace | | for R command
if(res$left$abs_values) {
res$left$expression <- stringr::str_replace_all(res$left$expression, "\\|", "") %>%
paste0("abs(", ., ")")
}
if(res$right$abs_values) {
res$right$expression <- stringr::str_replace_all(res$right$expression, "\\|", "") %>%
paste0("abs(", ., ")")
}
# Add algebra operators
if(res$left$algebra) {
res$left$algebra_operator <- stringr::str_extract(res$left$expression, "\\*|\\-|\\+|\\/")
}
if(res$right$algebra) {
res$right$algebra_operator <- stringr::str_extract(res$right$expression, "\\*|\\-|\\+|\\/")
}
# Return
return(res)
})
# Names of hypotheses
names(hyps_split) <- paste0("group_", 1:length(hyps_split))
# Add order of evaluation by reversing groups
hyps_split <- hyps_split[length(hyps_split):1]
# Return
return(hyps_split)
}
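# Illustrative example (hypothetical input; assumes the 'blm_hypothesis_operators'
# option is set by the package with primary operators >, <, = and group operators |, &):
#   h <- parse_hypothesis("a > b & b > 0")
#   names(h)               # "group_2" "group_1"  (evaluation order is reversed)
#   h$group_1$operator     # ">"
#   h$group_1$left$params  # "a"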
# Check that all parameters used in the parsed hypotheses are present in the data
check_hypothesis <- function(hypothesis_parsed, parameters) {
# Retrieve parameters
pars_in_hyp <- lapply(hypothesis_parsed, function(x) list(x$left$params, x$right$params)) %>%
unlist(recursive = TRUE) %>%
unique()
# If pars not in parameters of data, throw error
if(any(!pars_in_hyp %in% parameters)) {
stop(paste0("User passed parameters ('",
paste0(pars_in_hyp[!pars_in_hyp %in% parameters], collapse=", "),
"') in hypotheses that are not present in data"))
}
}
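# Illustrative example (hypothetical input; continues the parse_hypothesis example above):
#   parsed <- parse_hypothesis("a > b")
#   check_hypothesis(parsed, parameters = c("a", "c"))
#   # errors: 'b' occurs in the hypothesis but is not among the data parameters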
# Parse parameters from an expression
#' @importFrom stringr str_replace_all
#' @importFrom stringr str_split
parse_parameters <- function(exp, abs_values, algebra, is_numeric) {
# Assign r
r <- exp
# Strip parentheses if in equation
r <- stringr::str_replace_all(r, "\\(|\\)", "")
# Check abs values
if(abs_values) {
# Strip
r <- stringr::str_replace_all(r, "\\|", "")
}
if(algebra) {
r <- stringr::str_split(r, "\\*|\\-|\\+|\\/")[[1]] %>%
trimws()
}
if(is_numeric) {
r <- c()
}
# Keep only the elements that are not numeric; these are the parameter names
nnums <- c()
for(i in seq_along(r)){
if(suppressWarnings(is.na(as.numeric(r[i])))) {
nnums <- c(nnums, i)
}
}
r <- r[nnums]
return(unique(r))
}
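# Illustrative example (hypothetical input): extracting parameter names from one side of a hypothesis
#   parse_parameters("|a - b|", abs_values = TRUE, algebra = TRUE, is_numeric = FALSE)
#   # returns c("a", "b")
#   parse_parameters("0", abs_values = FALSE, algebra = FALSE, is_numeric = TRUE)
#   # returns NULL: a purely numeric side contains no parameters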
# Compute the complexity of a hypothesis
# @param hypothesis_i the current hypothesis under evaluation
# @param priors object of class priors containing prior information
# @param y_sd standard deviation of the outcome variable y
# TODO: cite Hoijtink (evaluation of informative hypotheses)
compute_hypothesis_complexity <- function(hypothesis_i, priors, y_sd) {
# Get parsed version of the hypothesis
hpar <- hypothesis_i$parsed
# The order has already been reversed if multiple hypotheses!
# Get an overview of the parameters mentioned
params_in_hyp <- lapply(hpar, function(x) c(x$left$params, x$right$params)) %>%
unlist(recursive = TRUE) %>%
unique()
# Draw samples from the priors
prior_samples <- lapply(params_in_hyp, function(x) {
# Prior information
prinf <- priors[[x]]
# Draw
rnorm(50000, prinf$mu, prinf$sd) #%>%
#scale_coef(y_sd)
})
# Names
names(prior_samples) <- params_in_hyp
# Assign to environment
for(i in seq_along(prior_samples)) {
# Assign value to local scope. i.e. this has the effect of b0 <- value
assign(names(prior_samples)[i], prior_samples[[i]])
}
# Evaluate the parameters
evaluated_hyps <- rep(0, 50000)
for(i in seq_along(hpar)) {
# Current
cur <- hpar[[i]]
# Evaluate the expression
evaluated_hyps <- evaluated_hyps + eval(parse(text=paste(cur$left$expression, cur$operator, cur$right$expression)))
}
# A draw satisfies the hypothesis only if all sub-hypotheses are TRUE, i.e. the count at position i equals the number of evaluated sub-hypotheses
evaluated_hyps <- ifelse(evaluated_hyps == length(hpar), 1, 0)
# Return proportion (complexity)
return(mean(evaluated_hyps))
}
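# Minimal sketch of the evaluation step above, using hypothetical standard-normal priors
# for two coefficients (not part of the package API):
#   b1 <- rnorm(50000, 0, 1)
#   b2 <- rnorm(50000, 0, 1)
#   mean(eval(parse(text = "b1 > b2")))  # roughly 0.5: proportion of prior draws satisfying 'b1 > b2'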
# Compute the fit of a hypothesis
# @param hypothesis_i the current hypothesis under evaluation
# @param posterior posterior samples
# @param y_sd standard deviation of the outcome variable y
# TODO: cite Hoijtink (evaluation of informative hypotheses)
compute_hypothesis_fit <- function(hypothesis_i, posterior, y_sd) {
# Get parsed version of the hypothesis
hpar <- hypothesis_i$parsed
# The order has already been reversed if multiple hypotheses!
# Get an overview of the parameters mentioned
params_in_hyp <- lapply(hpar, function(x) c(x$left$params, x$right$params)) %>%
unlist(recursive = TRUE) %>%
unique()
# Extract posterior from the posterior samples
post_samples <- vector("list", length(params_in_hyp))
# Names
names(post_samples) <- params_in_hyp
# Assign to environment
for(i in seq_along(post_samples)) {
# Assign value to local scope. i.e. this has the effect of b0 <- value
assign(params_in_hyp[i], posterior[,params_in_hyp[i]] #%>%
#scale_coef(y_sd)
)
}
# Evaluate the parameters
evaluated_hyps <- rep(0, nrow(posterior))
for(i in seq_along(hpar)) {
# Current
cur <- hpar[[i]]
# Evaluate the expression
evaluated_hyps <- evaluated_hyps + eval(parse(text=paste(cur$left$expression,
cur$operator,
cur$right$expression)))
}
# A draw satisfies the hypothesis only if all sub-hypotheses are TRUE, i.e. the count at position i equals the number of evaluated sub-hypotheses
evaluated_hyps <- ifelse(evaluated_hyps == length(hpar), 1, 0)
# Return proportion (fit)
return(mean(evaluated_hyps))
}
# Helper functions for sampling ----
# Initiate initial values for a Gibbs chain
initialize_chain_values <- function(priors) {
# For each prior, draw initial values and construct a weight matrix
# NOTE: assumes 'sigma' is the last element of priors, so the remaining
# (coefficient) priors fill rows 1..(length(priors)-1) of the weight matrix
w <- matrix(0L, nrow = length(priors)-1, ncol=1)
# To grid
for(i in seq_along(priors)) {
if(names(priors)[i] == "sigma") {
sigma <- draw_value(priors[[i]])
} else {
w[i,1] <- draw_value(priors[[i]])
}
}
# Return
return(
list(
"w" = w,
"sigma" = sigma
)
)
}
# Helper function that calls the Julia MC sampler
mc_sampler <- function(X, y, initial_values, iterations, thinning, priors, samplers) {
# Unroll initial values
w <- initial_values$w
# If intercept-only model, then w is a scalar. Coerce to array
# JuliaCall would otherwise convert it incorrectly
if(is.vector(w)) {
w <- matrix(w, ncol=1, nrow=1)
}
sigma <- initial_values$sigma
# Call MCMC sampler in Julia
r <- .blm$julia$eval("MCMC_sampler")(X, as.numeric(y), w, sigma, as.integer(iterations),
as.integer(thinning), unname(priors), unname(samplers))
# Return the samples
return(r)
}
# Helper functions for evaluation -----
# Autocorrelation plot
# @param pd posterior samples
# @param chain chain to show autocorrelation plot for
autocorrelation_plot <- function(pd, chain) {
# Get data from posterior
qd <- get_value(pd, "samples")[[paste0("chain_", chain)]]
# Lag data
qd <- qd %>%
as.data.frame() %>%
lapply(., function(x) autocor(x, n=40)) %>%
do.call(rbind.data.frame, .)
# Add variable name
qd$id <- stringr::str_replace_all(row.names(qd), "\\.[0-9]{1,2}", "")
# Plot
ggplot2::ggplot(qd, ggplot2::aes(x=lag, y=correlation, group=id)) +
ggplot2::geom_bar(stat="identity") +
ggplot2::scale_x_continuous(breaks= scales::pretty_breaks()) +
ggplot2::scale_y_continuous(limits = c(-1,1)) +
theme_blm() +
ggplot2::facet_wrap(id ~ .) +
ggplot2::labs(title="Autocorrelation plot")
}
# History plot
history_plot <- function(samples) {
ggplot2::ggplot(samples, ggplot2::aes(x = iteration,
y=value,
color=as.factor(chain),
group = parameter)) +
ggplot2::geom_line(alpha=0.4) +
ggplot2::scale_color_brewer(palette = "Set1", name = "chain",
labels = paste0("chain ", 1:length(unique(samples$chain)))) +
theme_blm() +
ggplot2::theme(axis.title = ggplot2::element_blank()) +
ggplot2::facet_wrap("parameter ~ .", scales = "free_y",
ncol=2) +
ggplot2::labs(title = "Trace plot", subtitle = "each chain is indicated by its own color")
}
# Density plot
density_plot <- function(samples) {
ggplot2::ggplot(samples, ggplot2::aes(x=value,
fill = as.factor(chain))) +
ggplot2::geom_density(alpha=0.4) +
ggplot2::scale_fill_brewer(palette = "Set1", name = "chain",
labels = paste0("chain ", 1:length(unique(samples$chain)))) +
theme_blm() +
ggplot2::theme(axis.title = ggplot2::element_blank(),
legend.position="none") +
ggplot2::facet_wrap("parameter ~ .", scales = "free") +
ggplot2::labs(title = "Posterior densities", subtitle = "each chain is indicated by its own color")
}
# Helper function for autocorrelation
autocor <- function(x, n=10) {
# Results
res <- rep(0, n)
# Lag for each n and calculate correlation
for(i in 1:n) {
res[i] <- cor(x, c(rep(NA, i), x[1:(length(x)-i)]),
use="complete.obs")
}
# Return
return(
data.frame(
"lag" = 1:n,
"correlation" = res
)
)
}
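# Illustrative example: lag-1 to lag-5 autocorrelations of white noise (all close to 0)
#   autocor(rnorm(200), n = 5)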
# Calculate mode
calc_mode <- function(x) {
uniqv <- unique(x)
uniqv[which.max(tabulate(match(x, uniqv)))]
}
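# Illustrative example: calc_mode(c(1, 2, 2, 5)) returns 2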
# Posterior predictive checks in julia
ppc_julia <- function(X, y, posterior_samples) {
# Call Julia function for posterior predictive checks
return(
.blm$julia$eval("ppc_draws")(X, as.numeric(y), posterior_samples)
)
}
# R-squared calculation in julia
bayes_R2 <- function(X, y, posterior_samples) {
# Call Julia function for R2
return(
.blm$julia$eval("bayes_R2")(X, as.numeric(y), posterior_samples)
)
}
# Posterior predictive distribution (used for outlier checks) in Julia
julia_ppd <- function(X, y, posterior_samples) {
# Call Julia function to compute the posterior predictive distribution
return(
.blm$julia$eval("compute_ppd")(X, as.numeric(y), posterior_samples)
)
}
# Model DIC
# See: http://kylehardman.com/BlogPosts/View/6
DIC <- function(X, y, posterior_samples) {
## Map values
map <- MAP(posterior_samples)$MAP
## Bind data
pb <- bind(posterior_samples)
## To matrix (expected by julia)
pb <- as.matrix(pb)
## Coef separate from sigma
sigma <- unname(map["sigma"])
coefs <- matrix(map[-length(map)], ncol=1)
## DIC has two parts: (1) the deviance at the MAP estimate, based on the sum of the log-likelihood P(y|theta_MAP),
## and (2) a penalty for the effective number of parameters computed from the posterior samples
# Call Julia implementation
# NOTE: sigma here is the standard deviation. We use posterior values; the square root has been
# taken after sampling.
r <- .blm$julia$eval("DIC")(X, as.numeric(y),
coefs, sigma, pb)
## Return model fit
return(
do.call(cbind.data.frame,
r)
)
}
# Calculate the model BIC
BIC_blm <- function(n, p, LL) {
log(n)*p - 2*LL
}
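# Illustrative example: with n = 100 observations, p = 3 parameters and log-likelihood LL = -250,
# BIC_blm(100, 3, -250) = log(100) * 3 - 2 * (-250), approximately 513.8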
# Calculate the Bayes factor of the model against the null model
# See Wagenmakers (2007) for the BIC approximation to the Bayes factor
BF <- function(BIC0, BIC1) {
exp((BIC0 - BIC1)/2)
}
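# Illustrative example: BF(BIC0 = 513.8, BIC1 = 505.2) = exp((513.8 - 505.2) / 2), approximately 73.7,
# i.e. the data favour the fitted model over the null model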
## Effective sample size helper: n corrected for the lag-k autocorrelations pk
sample_size <- function(n, pk) {
n / (1 + 2 * sum(pk))
}
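# Illustrative example: with n = 10000 draws and lag autocorrelations pk = c(0.8, 0.6, 0.4, 0.2),
# sample_size(10000, c(0.8, 0.6, 0.4, 0.2)) = 10000 / (1 + 2 * 2) = 2000 effective draws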
# Helper functions for special cat() ----
# Special cat function for burn-in statistic
cat.burnin <- function(x) {
# Dims
j <- dim(x)[2]
n <- dim(x)[1]
# Fill setting for cat()
f <- (nchar(crayon::green("0.00")) + 1) * j + (j-1) + nchar("chain 1 ")
# Round
x <- round(abs(x), digits=2)
# Determine colors
is_zero <- x == 0
less_than_05 <- abs(x) < 0.05
# Turn x into a character vector
x_v <- as.character(round(unname(unlist(x)), digits=2))
# For those elements of length 1, add decimals
x_v_l1 <- nchar(x_v) == 1
x_v_l3 <- nchar(x_v) == 3
# Set
x_v[x_v_l1] <- paste0(x_v[x_v_l1], ".00")
x_v[x_v_l3] <- paste0(x_v[x_v_l3], "0")
# Paste
cat(paste0(" ", paste0("b", 0:(j-1), collapse=" "), "\n"))
cat(ifelse(is_zero, crayon::green(x_v),
ifelse(less_than_05, crayon::yellow(x_v),
crayon::red(x_v))), fill=f, labels=paste0("Chain ", 1:n), sep = " ")
}
# Special cat function for Gelman-Rubin statistic
cat.GR <- function(x) {
# Round
x <- round(x, digits=2)
# To vector
x <- unname(x[1,])
# Length
j <- length(x)
# Fill setting for cat()
f <- (nchar(crayon::green("0.00")) + 1) * j + (j-1) + nchar("chain 1 ")
# Determine colors
is_one <- x == 1
# Turn x into a character vector
x_v <- as.character(x)
# For those elements of length 1, add decimals
x_v_l1 <- nchar(x_v) == 1
x_v_l3 <- nchar(x_v) == 3
# Set
x_v[x_v_l1] <- paste0(x_v[x_v_l1], ".00")
x_v[x_v_l3] <- paste0(x_v[x_v_l3], "0")
# Paste
cat(paste0(" ", paste0("b", 0:(j-1), collapse=" "), "\n"))
cat(ifelse(is_one, crayon::green(x_v), crayon::red(x_v)), fill=f, labels="Value ", sep = " ")
}
# Misc ----
# See documentation below.
generate_dataset <- function(n = 2000, j = 5, binary = 1, seed=NULL,
heteroskedastic = FALSE, correlated_errors = FALSE, ...) {
opts <- list(...)
if("degree" %in% names(opts)) {
degree <- opts$degree
if(!is.numeric(degree)) {
stop("'degree' must be numeric")
} else if(degree <= 0) {
stop("'degree' cannot be less than or equal to 0")
} else {
# Continue
degree <- degree
}
} else{
degree <- 1
}
# Empty matrix
X <- matrix(0L, ncol=j, nrow=n)
# Determine which variables will be binary
# Populate matrix
if(!is.null(seed)) {
set.seed(seed)
}
binary_j <- order(runif(j))[seq_len(binary)]
for(i in 1:j) {
if(!i %in% binary_j) {
# Mean and sd
m <- runif(1, 5, 10) * sign(runif(1, -1, 1))
sd <- runif(1, 1, 3)
X[,i] <- rnorm(n, m, sd)
} else {
X[,i] <- rbinom(n, 1, 0.5)
}
}
# Intercept + j coefs
coef <- c(rnorm(1, 0, 20), rnorm(j, 0, 5))
sigmaSq <- runif(1, 1, 10)
# Make the example heteroskedastic
if(heteroskedastic) {
sigmaSq <- rep(1e-10, n)
for(i in setdiff(1:j, binary_j)) {
X[,i] <- X[,i] + seq(1,mean(X[,i]) + degree*sd(X[,i]), length.out = n)
# Set sigma
pw <- runif(1, 1, 1.6)
# Scale power for degree
if(degree <= 1) {
degreepw <- 1
} else {
degreepw <- degree
}
pw <- (pw + (1-(1/degreepw)))/pw
sigmaSq <- sigmaSq + (abs(X[,i])^pw)
}
# Add intercept
X <- cbind(rep(1, n), X)
# Generate y
y <- rnorm(n,
mean = X %*% matrix(coef),
sd = sqrt(sigmaSq))
} else if(correlated_errors) { # Make the errors autocorrelated
# 'degree' is used as the AR(1) coefficient here and must be < 1 for a stationary process
# Correlated values
ord <- arima.sim(list(order = c(1,0,0), ar = degree), n = n, sd=sqrt(sigmaSq))
# Add intercept
X <- cbind(rep(1, n), X)
# Generate y
y <- rnorm(n,
mean = X %*% matrix(coef) + ord,
sd = sqrt(sigmaSq))
} else {
# Add intercept
X <- cbind(rep(1, n), X)
# Generate y
y <- rnorm(n,
mean = X %*% matrix(coef),
sd = sqrt(sigmaSq))
}
# List of real values (for later)
real <- list(
coef = coef,
sigma = sigmaSq
)
# Return
return(
list(
"X" = X,
"y" = y,
"parameters" = real,
"seed" = ifelse(is.null(seed), NA, seed)
)
)
}
#' Generate a dataset with normally distributed predictors and outcome for testing purposes
#'
#' @param n number of observations
#' @param j number of predictor variables
#' @param binary number of predictors that are binary (0/1)
#' @param seed seed for the pseudo-random number generator
#' @param heteroskedastic if TRUE, the error variance increases with the predictor values
#' @param correlated_errors if TRUE, an AR(1) component is added to the errors
#' @param ... further options, e.g. 'degree', which controls the strength of the heteroskedasticity or serves as the AR(1) coefficient
#'
#' @return list containing the design matrix X (with an intercept column), the outcome y, the true parameter values and the seed.
#' @export
blmsim <- generate_dataset
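# Illustrative example (hypothetical values):
#   sim <- blmsim(n = 100, j = 3, binary = 1, seed = 42)
#   dim(sim$X)      # 100 x 4 (an intercept column is prepended to the j predictors)
#   length(sim$y)   # 100
#   sim$parameters  # true coefficients and error variance used to generate y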