#' Bayesian Density Estimation Using \code{gold} (Gaussian process On a Log Density).
#'
#' Estimates a density from Bayesian techniques using a Gaussian process on a log density.
#'
#' @usage
#' gold(y, s1 = max(2.3 - 0.003 * n, 0.05), c1 = max(round(n/50), 1), s2 = NA,
#'   c2 = c1/2, N = 20000, graves = TRUE, scale_l = 0.00001, scale_u = 0.00001,
#'   poi = NA)
#'
#' @param y A numeric vector. \code{gold} estimates the density on this data.
#' @param s1 The standard deviation of the Gaussian prior. The default is set to a linear equation that decreases s1 as the sample size increases (see details).
#' @param c1 The correlation parameter of the Gaussian prior that controls the correlation in the covariance structure and the smoothness of the density estimate. See details for the default value.
#' @param s2 The standard deviation of the Gaussian proposal distribution. If specified, \code{graves} is set to \code{FALSE}.
#' @param c2 The correlation parameter that controls the covariance structure of the Gaussian proposal distribution.
#' @param N The number of iterations to run in the algorithm. The default is 20,000.
#' @param graves An option to have the standard deviation of the proposal distribution (\code{s2}) chosen using automatic step size selection (see Graves (2011)). The default is \code{TRUE}.
#' @param scale_l A value >= 0 controlling the scaling based on the minimum data value. The default is 0.00001 (see details).
#' @param scale_u A value >= 0 controlling the scaling based on the maximum data value. The default is 0.00001 (see details).
#' @param poi Points of interest at which to estimate the density. \code{gold_pdf} will produce estimates and credible intervals of the density at any point, but the \code{mat} object returned by \code{gold_pdf} and \code{gold_cdf} only contains the posterior draws at points where the density was estimated.
#'
#' @export
#' @importFrom matrixcalc svd.inverse
#' @importFrom MASS mvrnorm
#' @importFrom dplyr %>% group_by summarise filter n
#'
#' @details
#'
#' The density being estimated takes the form below:
#'
#' \deqn{f(x) = exp(g(x)) / \int_{0}^{1} exp(g(u)) du}
#'
#' where g(x) is the unknown (unnormalized) log density.
#'
#' Given that g(x) is unknown, the integral in the normalizing constant is estimated using a weighted average, and the set of unknown parameters that receive a prior is g(x) at a finite set of points. This finite set of points includes each data point, as well as a grid to ensure good coverage.
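#'
#' Concretely, if \eqn{x_1, ..., x_m} denotes this finite set of points and \eqn{w_1, ..., w_m} the widths surrounding them, the integral in the normalizing constant is approximated by the weighted sum:
#'
#' \deqn{\int_{0}^{1} exp(g(u)) du \approx \sum_{i=1}^{m} w_i exp(g(x_i))}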
#'
#' The density is estimated on data between 0 and 1. Thus, if the input is not between 0 and 1, it is rescaled using the formula below:
#'
#' \deqn{(y - (min(y) - scale_l)) / (max(y) + scale_u - (min(y) - scale_l))}
#'
#' Values of 0 for the scale parameters indicate the density is estimated only over the range of the data in \code{y}. The default for \code{scale_l} and \code{scale_u} is 0.00001 since some densities cannot be estimated at 0 and 1.
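#'
#' As a small illustration (not part of the package), applying the scaling formula with the default scale parameters to a vector outside (0, 1) would give roughly:
#'
#' \preformatted{
#' y <- c(2, 5, 9)
#' scale_l <- 0.00001
#' scale_u <- 0.00001
#' (y - (min(y) - scale_l)) / (max(y) + scale_u - (min(y) - scale_l))
#' # approximately 0.0000014, 0.4285716, 0.9999986
#' }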
#'
#' There is a default recommendation for \code{c1} based on the size of \code{y}. The recommendation starts at 1 and then increases by 1 for each additional 50 data points:
#'
#' Default: c1 = max(round(n/50), 1)
#'
#' A default is also recommended for \code{s1} in the form of 2.3 - 0.003 * n, with a minimum value of 0.05 enforced. The user can specify their own value for \code{s1} to override this.
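#'
#' For example, with n = 200 data points the defaults work out to s1 = max(2.3 - 0.003 * 200, 0.05) = 1.7 and c1 = max(round(200/50), 1) = 4.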
#' @return
#'
#' \code{gold} returns a list containing the density estimate results.
#'
#' \item{\code{G}}{A matrix with the posterior draws for g(x) at a finite number of points. The number of rows is the number of iterations. The number of columns is the number of data points plus the grid points.}
#' \item{\code{widths}}{A vector with the widths around each x used for g(x). These are the widths used in the weighted average to estimate the integral in the normalizing constant.}
#' \item{\code{x}}{A vector with the points at which g(x) was estimated.}
#' \item{\code{y}}{A vector containing the data introduced to \code{gold} for density estimation.}
#' \item{\code{ar}}{The acceptance rate from the random walk proposals.}
#' \item{\code{prior}}{A vector of all prior and proposal parameter values.}
#' \item{\code{poi}}{The points of interest if \code{poi} is non-missing.}
#'
#' @references
#'
#' Graves, T. (2011). \emph{Automatic Step Size Selection in Random Walk Metropolis Algorithms}.
#'
#' @examples
#'
#' ## --------------------------------------------------------------------------------
#' ## Beta Distribution
#' ## --------------------------------------------------------------------------------
#'
#' # First run 'gold' on data sampled from a Beta(5, 1) distribution with 200 data points.
#' y <- rbeta(200, 5, 1)
#' # Use all defaults: this also uses Graves' method to tune the standard deviation of the proposal
#' # distribution.
#' gold_beta <- gold(y = y)
#'
#' # Check trace plots
#' gold_mcmcplots(gold_beta)
#'
#' # Examine the estimate of the PDF
#' gold_plot(gold_beta, data = TRUE)
#'
#' # Examine the estimate of the CDF with the empirical CDF overlaid
#' gold_plot(gold_beta, type = "cdf", data = TRUE)
#'
#' ## --------------------------------------------------------------------------------
#' ## Normal Distribution
#' ## --------------------------------------------------------------------------------
#'
#' # First run 'gold' on data sampled from a Normal(0, 1) distribution with 100 data points.
#' y <- rnorm(100, 0, 1)
#' # Specify all prior parameters; tune until the desired acceptance rate is achieved
#' gold_norm <- gold(y = y, s1 = 2, c1 = 1, s2 = 2, c2 = 0.5, N = 20000, graves = FALSE,
#'                   scale_l = 2*sd(y), scale_u = 2*sd(y))
#'
#' # Check the prior and proposal parameter values that were used
#' gold_norm$prior
#'
#' # Check acf plots
#' gold_mcmcplots(gold_norm, type = "acf")
#'
#' # Examine the estimate of the PDF with credible intervals
#' gold_plot(gold_norm, cri = TRUE)
#'
#' # Examine the estimate of the CDF using an interactive graph
#' gold_plot(gold_norm, type = "cdf", interact = TRUE)
#'
#' ## --------------------------------------------------------------------------------
#' ## Bimodal Distribution
#' ## --------------------------------------------------------------------------------
#'
#' # Sample 400 random uniforms
#' u <- runif(400)
#' y <- rep(NA,400)
#' # Sampling from the mixture
#' for(i in 1:400){
#'   if(u[i] < 0.3){
#'     y[i] <- rnorm(1, 0, 1)
#'   } else {
#'     y[i] <- rnorm(1, 4, 1)
#'   }
#' }
#'
#' # First run 'gold' on data sampled from a bimodal distribution with 400 data points.
#' # Specify several points of interest.
#' # Use Graves' method, but use less correlation than the default by making 'c1' large.
#' gold_bimodal <- gold(y = y, s1 = 2, c1 = 15, c2 = 7, graves = TRUE, poi = c(0, 6.5),
#'                      scale_l = .5, scale_u = .5)
#'
#' # Check the running mean plots
#' gold_mcmcplots(gold_bimodal, type = "rm")
#'
#' # Plot the PDF with the data and credible intervals
#' gold_plot(gold_bimodal, data = TRUE, cri = TRUE)
#'
#' # Plot the CDF
#' gold_plot(gold_bimodal, type = "cdf")
#'
#' # Plot histograms at estimates of the CDF at the 'poi'
#' bimodal_cdf <- gold_cdf(x = c(0, 6.5), gold_bimodal)
#' bimodal_cdf$cdf
#' # x = 0
#' hist(bimodal_cdf$mat[,1])
#' # x = 6.5
#' hist(bimodal_cdf$mat[,2])
gold <- function(y, s1 = NA, c1 = NA, s2 = NA, c2 = NA, N = 20000, graves=TRUE, scale_l = .00001, scale_u = .00001, poi = NA){
# Specify default for s1 if missing
if(is.na(s1)){
s1 = max(2.3 - 0.003 * length(y), 0.05)
}
# Specify default for c1 if missing
if(is.na(c1)){
c1 <- max(round(length(y)/50),1)
}
if(is.na(c2)){
c2 <- c1/2
}
if(!is.na(s2)){
graves = FALSE
}
# All error checking ********************************************************************
# No missing values in y
if(length(which(is.na(y))) > 0){
stop(" 'y' cannot contain missing values.")
}
# Error check for appropriate range of parameters
if(s1<=0){
stop(" 's1' should be > 0.")
}
if(c1 < 0){
stop(" 'c1' should be >= 0.")
}
if(c2 < 0){
stop(" 'c2' should be >= 0.")
}
if (scale_l < 0.0) {
stop(" 'scale_l' should be >= 0.")
}
if (scale_u < 0.0) { # scale parameter needs to be greater than or equal to 0
stop(" 'scale_u' should be >= 0.")
}
# Make sure numbers in 'poi' are unique, otherwise throws algorithm off
if(!is.na(poi[1])){
total_length <- length(y)+length(poi)
if(length(unique(poi))!=length(poi)){
stop("The values in 'poi' are not unique.")
}
# Check for missing values
if(length(which(is.na(poi))) > 0){
stop(" 'poi' cannot contain missing values.")
}
}else{
total_length <- length(y)
}
# Make sure N is greater than 0
if(N <= 0){
stop(" 'N' should be an integer greater than zero.")
}
# Make sure N is an integer
if(N != round(N)){
stop(" 'N' should be an integer value.")
}
#### end of error checking
# Create vector to contain original data
y_orig <- y
# Sort the data
y_orig <- sort(y_orig)
# Find the maximum and minimum
# If points of interest are specified outside the range of the data,
# this needs to be incorporated into the max and min, otherwise the standardized
# values for 'poi' might be greater than 1 or less than 0
if(!is.na(poi[1])){
max_y <- max(y, poi)
min_y <- min(y, poi)
}else{
max_y <- max(y)
min_y <- min(y)
}
# Mark positions that are data
names(y_orig) <- rep("data", length(y_orig))
# Add points of interest to the data if there are any entered
if(is.na(poi[1])){
y_all_orig <- y_orig
} else{
names(poi) <- rep("poi", length(poi))
y_all_orig <- sort(c(y_orig, poi))
}
# y_all becomes the scaled data between 0 and 1 to use in the algorithm
y_all <- y_all_orig
# Check if data is outside (0,1) range and scale it if it is
if((max_y>1)|(min_y<0)){
for(j in 1:length(y_all_orig)){
y_all[j] <- (y_all_orig[j]-(min_y-scale_l))/(max_y+scale_u-(min_y-scale_l))
}
}
# Sort the data again
y_all <- sort(y_all)
# Create grid:0, 0.01,..., 0.99, 1
x_full <- 0:100
x_full <- x_full/100
# Will eventually remove end points, but just need for calculating bin widths
# Mark positions that are the grid and the end points
names(x_full) <- rep("grid", length(x_full))
names(x_full)[1] <- "end"
names(x_full)[length(x_full)] <- "end"
# x_full contains the end points, x removes them
x <- x_full[-1]
x <- x[-length(x)]
# Get summary of data to count how many times each data point appeared
y_d <- data.frame(y_all_orig,y_all, names(y_all_orig))
names(y_d) <- c("y_all_orig", "y_all", "names.y_all_orig.")
y_d <- y_d %>% group_by(y_all_orig,y_all, names.y_all_orig.) %>% summarise(n=n())
# Data to add to grid
y_add <- y_d$y_all
# Maintain names of data points: "data" and "poi"
names(y_add) <- y_d$names.y_all_orig.
# if contains "poi", remove rows since this data set should contain only the 'y' input
if(length(which(y_d$names.y_all_orig.=="poi"))!=0){
y_d <- y_d %>% filter(names.y_all_orig. != "poi")
}
# Add data to grid
x <- c(x,y_add)
# Sort the data
x <- sort(x)
# Sort the full data
x_full <- sort(c(x_full,y_add))
# calculate the distances between points
pairwise_dist <- NA
for(i in 1:(length(x_full)-1)){
pairwise_dist[i] <- abs(x_full[{i+1}]-x_full[i])
}
# The grid is designed to have spacing of .01; if two data points are within .01 of each other,
# remove the grid point in between
drop <- NA
r_index <- 1
check <- which(!(names(x)%in% c("data", "poi")))
for(j in check){
if(j>1&j<length(x)){
if(names(x)[{j-1}]=="data"&names(x)[{j+1}]=="data"){
distance <- x[{j+1}]-x[{j-1}]
if(distance<.01&names(x)[j]=="grid"){
drop[r_index] <- j
r_index <- r_index+1
}
}
}
}
# Just in case none to drop
if(!is.na(drop[1])){
x <- x[-drop]
}
x_full <- c(0,x,1)
# Calculate the midpoints of the remaining points
midpoints <- NA
for (i in 1:(length(x)+1)){
midpoints[i] <- (x_full[{i+1}]-x_full[{i}])/2+x_full[{i}]
}
# Calculate the widths between each pair of points
widths <- NA
for (i in 1:(length(x))){
widths[i] <- abs(midpoints[{i+1}]-midpoints[{i}])
}
# Covariance matrix for prior distribution
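# The entries use a compactly supported piecewise-cubic correlation function of the scaled
# distance c1*d: full variance s1^2 at d = 0, decaying to exactly 0 once c1*d >= 1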
cov_prior <- matrix(0, nrow=length(x), ncol=length(x))
for (i in 1:length(x)){
for(j in 1:length(x)){
d <- abs(x[i]-x[j])
if(c1*d<.5){
cov_prior[i,j] <- s1^2*(1-6*(c1*d)^2+6*(c1*d)^3)
}else if (c1*d <1){
cov_prior[i,j] <- s1^2*(2*(1-c1*d)^3)
} else {
cov_prior[i,j] <- 0
}
}
}
# Use SVD
cov_prior_inv <- matrixcalc::svd.inverse(cov_prior)
G <- matrix(0, nrow=N, ncol=length(x))
# Starting values
G[1,] <-rep(0, ncol(G))
###GRAVES################
if(graves==TRUE){
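# Automatic step size selection (Graves 2011): run short pilot chains over a grid of candidate
# step sizes, fit a logistic regression of the acceptance counts on log(step size), and solve
# for the step size whose predicted acceptance rate equals the target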
# Vector of values to establish trend in acceptance rate
s2_graves <- c(.001,.01,.03,.06,.08,.1,.2,.3,.6,.8,1,3,5)
# Number of step sizes to try
n_step_size <- length(s2_graves)
# The target is .35, slightly lower than .4 due to the number of parameters
target <- .35
# Store acceptance rates
ar <- NA
# Find which points in x are the data
which_x <- which(names(x)=="data")
# Loop through each step size suggestion
for (i in 1:n_step_size){
ar_G <- NA
# Covariance matrix for the proposal distribution at this candidate step size
cov_x <- matrix(0, nrow=length(x), ncol=length(x))
for (l in 1:length(x)){
for(j in 1:length(x)){
cov_x[l,j] <- s2_graves[i]*exp(-c2*(x[l]-x[j])^2)
}
}
for (l in 1:ncol(cov_x)){
cov_x[l,l] <- cov_x[l,l]+.0000000000001
}
# Set up for multivariate simulations from the normal distribution
n <- 1
mu <- rep(0, length(x))
tol = 1e-06
p <- length(mu)
eS <- svd(cov_x)
ev <- eS$d
if (!all(ev >= -tol * abs(ev[1L])))
stop("'Sigma' is not positive definite")
mvrnorm_calc <- eS$u %*% diag(sqrt(pmax(ev, 0)), p)
# Store iterations for tuning purposes
tune<- matrix(0, nrow=201, ncol=ncol(cov_x))
tune[1,] <- G[1,]
for(j in 2:101){
# Simulate length(x) values from standard normal
rw <- matrix(rnorm(p * n), n)
# Get proposal
rw <- drop(mu) + mvrnorm_calc %*% t(rw)
g_prop <- tune[{j-1},]+rw
# Numerator and denominator for acceptance ratio on the log scale
ap_num_log <- -length(y)*log(sum(widths*exp(g_prop)))+sum(g_prop[which_x])-.5*t(g_prop)%*%cov_prior_inv%*%g_prop
ap_den_log <- -length(y)*log(sum(widths*exp(tune[{j-1},])))+sum(tune[{j-1},which_x])-.5*t(tune[{j-1},])%*%cov_prior_inv%*%tune[{j-1},]
# Calculate acceptance ratio on the log scale
ap_log <- ap_num_log-ap_den_log
if (log(runif(1))<ap_log){
tune[j,] <- g_prop
ar_G[j] <- 1
} else {
tune[j,] <- tune[j-1,]
ar_G[j] <- 0
}
}
ar_G <- ar_G[-1]
# Calculate the number that accepted
ar[i] <- sum(ar_G)
}
# The logit of the acceptance rate is approximately linear in log(s2), so fit a logistic regression
test <- glm(cbind(ar,100-ar)~log(s2_graves), family="binomial")
# Back calculate what s2 leads to .35 acceptance rate
s2_start <- exp((log(target/(1-target))-test$coefficients[1])/test$coefficients[2])
names(s2_start) <- NULL
# s2 should not be too large or too small; if it is, the algorithm is not converging, so
# pick a more reasonable value by choosing the suggested step size closest to the target acceptance rate
if(s2_start>15 | s2_start<.0001){
s2_start <- s2_graves[which.min(abs(ar - target*100)) ]
}
##########
s2 <- s2_start
}
# Calculate covariance matrix for proposals
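# Squared-exponential (Gaussian) kernel: s2 scales the proposal variance and c2 controls how
# quickly correlation between proposals at nearby points decays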
cov_x <- matrix(0, nrow=length(x), ncol=length(x))
for (i in 1:length(x)){
for(j in 1:length(x)){
cov_x[i,j] <- s2*exp(-c2*(x[i]-x[j])^2)
}
}
# Ensure the matrix is invertible
for (i in 1:ncol(cov_x)){
cov_x[i,i] <- cov_x[i,i]+.0000000000001
}
####SVD for multivariate proposals#####################
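# Draw multivariate normal proposals by factorizing cov_x with an SVD: an increment is
# mu + U %*% diag(sqrt(d)) %*% z, where z is a vector of independent standard normals,
# so the increments have mean mu and covariance cov_x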
n <- 1
mu <- rep(0, length(x))
tol = 1e-06
p <- length(mu)
eS <- svd(cov_x)
ev <- eS$d
if (!all(ev >= -tol * abs(ev[1L])))
stop("'Sigma' is not positive definite")
mvrnorm_calc <- eS$u %*% diag(sqrt(pmax(ev, 0)), p)
# Collect acceptance rate
ar_G <- NA
# Get indices for which parameters are actual data versus part of the grid
which_x <- which(names(x)=="data")
# Metropolis-Hastings algorithm
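# Each iteration proposes g_prop = previous g + a correlated Gaussian increment and accepts with
# probability min(1, exp(ap_log)), where the unnormalized log posterior of a draw g is
# -length(y)*log(sum(widths*exp(g))) + sum(y_d$n*g[which_x]) - 0.5*t(g)%*%cov_prior_inv%*%g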
for (i in 2:N){
# Simulate length(x) values from standard normal
rw <- matrix(rnorm(p * n), n)
rw <- drop(mu) + mvrnorm_calc %*% t(rw)
# Add to previous iteration
g_prop <- G[{i-1},]+rw
# Calculate log of numerator and denominator of acceptance probability
ap_num_log <- -length(y)*log(sum(widths*exp(g_prop)))+sum(y_d$n*g_prop[which_x])-.5*t(g_prop)%*%cov_prior_inv%*%g_prop
ap_den_log <- -length(y)*log(sum(widths*exp(G[{i-1},])))+sum(y_d$n*G[{i-1},which_x])-.5*t(G[{i-1},])%*%cov_prior_inv%*%G[{i-1},]
# Calculate log of acceptance probability
ap_log <- ap_num_log-ap_den_log
if (log(runif(1))<ap_log){
G[i,] <- g_prop #Acceptance proposal
ar_G[i] <- 1
} else {
G[i,] <- G[i-1,] #Reject proposal
ar_G[i] <- 0
}
if(i %% 1000 == 0){
print(paste("Iteration:", i))
}
}
# Acceptance Rates
ar_G <- ar_G[-1]
print("Acceptance Rate:")
print(sum(ar_G)/length(ar_G))
if(is.na(poi[1])){
return(list(G=G,widths=widths,x=x,y=y_orig,ar=sum(ar_G)/length(ar_G), prior=c(s1 = s1, c1 = c1, s2 = s2, c2 = c2), scale_l = scale_l, scale_u = scale_u))
}else{
return(list(G=G,widths=widths,x=x,y=y_orig,ar=sum(ar_G)/length(ar_G), prior=c(s1 = s1, c1 = c1, s2 = s2, c2 = c2), scale_l = scale_l, scale_u = scale_u, poi = poi))
}
}