```{r setup}
knitr::opts_chunk$set(echo = TRUE)
```
We are partly concerned with how well our test performs when the data come from a distribution with bounded support (i.e., an interval (a, b)).
We examine how well it distinguishes a uniform distribution from a mixture of a uniform and a Beta(4, 4), which has the same mean but a different shape.
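For reference, the alternative being simulated is a two-component mixture on [0, 1]. The small sketch below just evaluates that density; the helper name `mixture_density` is illustrative and not part of the sourced files, and the Beta(4, 4) component matches the simulation code in the next chunk.

```{r}
# Mixture density of the alternative: with probability p draw from Beta(4, 4),
# otherwise from Uniform(0, 1), so on [0, 1] the density is
#   f_p(x) = p * dbeta(x, 4, 4) + (1 - p)
mixture_density <- function(x, p) {
  p * dbeta(x, shape1 = 4, shape2 = 4) + (1 - p) * dunif(x)
}
# Example: the mixture density at x = 0.5 for a few mixing proportions
mixture_density(0.5, p = c(0, 0.5, 1))
```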
```{r}
library(parallel)
library(ggplot2)
source("MarginalLikIntfunctions.R")
source("Laplacefunction.R")

set.seed(1000)
p = rbeta(500, .5, .5)
dlength = 400

# Null sample and 500 mixture samples: with probability p[i] draw from
# Beta(4, 4), otherwise from Uniform(0, 1)
dataset1 <- runif(dlength)
dataset2 <- matrix(data = NA, nrow = length(p), ncol = dlength)
for (i in 1:length(p)) {
  unifdraw = runif(dlength)
  for (j in 1:dlength) {
    if (unifdraw[j] > p[i]) {
      dataset2[i, j] = runif(1)
    } else {
      dataset2[i, j] = rbeta(1, shape1 = 4, shape2 = 4)
    }
  }
}

# Initial 30%/70% training/validation split of the null sample
XT1 <- dataset1[1:(length(dataset1) * .3)]
XV1 <- dataset1[-(1:(length(dataset1) * .3))]
likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT1, x = XV1))) }
bwlik = optimize(f = function(h) { likvec(h) }, lower = 0, upper = 10, maximum = TRUE)
ExpectedKernML1 = laplace.kernH(y = XT1, x = XV1, hhat = bwlik$maximum)

CL1 <- makeCluster(16)
clusterExport(cl = CL1,
              list("XT1", "XV1", "dataset2", "logmarg.specialkernMCimport",
                   "ExpectedKernML1", "dlength", "logSumExp", "HallKernel",
                   "laplace.kernH2", "logintegrand.Hall", "loglike.KHall",
                   "KHall", "hessian", "dataset1"),
              envir = environment())

# For each mixture sample, average the log CVBF over 30 random 30%/70% splits
result <- parApply(cl = CL1, dataset2[1:500, ], 1, FUN = function(x) {
  BFi = 0
  for (j in 1:30) {
    trainingset = sample(1:dlength, size = dlength * .3)
    XT1 = dataset1[trainingset]
    XV1 = dataset1[-trainingset]
    XT2 = x[trainingset]
    XV2 = x[-trainingset]
    likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT1, x = XV1))) }
    bwlik = optimize(f = function(h) { likvec(h) }, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML1 = laplace.kernH2(y = XT1, x = XV1, hhat = bwlik$maximum)
    likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT2, x = XV2))) }
    bwlik = optimize(f = function(h) { likvec(h) }, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML2 = laplace.kernH2(y = XT2, x = XV2, hhat = bwlik$maximum)
    likvec2 = function(h) { sum(log(HallKernel(h, datagen2 = c(XT1, XT2), x = c(XV1, XV2)))) }
    bwlik2 = optimize(f = function(h) { likvec2(h) }, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernMLcomb = laplace.kernH2(y = c(XT1, XT2), x = c(XV1, XV2), hhat = bwlik2$maximum)
    BFi = BFi + ExpectedKernML1[1] + ExpectedKernML2[1] - ExpectedKernMLcomb[1]
  }
  BFi = BFi / 30
  return(BFi)
})
stopCluster(CL1)

# lm4 <- lm(result ~ p[1:500])
# lw1 = loess(result ~ p)
# j <- order(p)
# plot(p, result, xlab = "p", ylab = "Log Bayes Factor", pch = '*')
# lines(p[j], lw1$fitted[j], col = "red", lwd = 3)
# abline(a = 0, b = 0)
# abline(lm4, col = "blue")

dfCVBF = data.frame(logBF = result, p = p)
plot1 = ggplot(dfCVBF, aes(x = p, y = logBF)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  geom_smooth(colour = "red", se = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/CVBFCompactnaive.pdf", plot = plot1, device = "pdf")

# Difference-based variance estimate of the log BFs after sorting by p
sorteddfCVBF = dfCVBF[order(dfCVBF$p), ]
nonparvarest = 0
for (j in 1:(nrow(sorteddfCVBF) - 1)) {
  nonparvarest = nonparvarest + (sorteddfCVBF[j + 1, 1] - sorteddfCVBF[j, 1])^2
}
nonparvarest = nonparvarest / (2 * nrow(sorteddfCVBF))
# Variance estimate
nonparvarest
# Standard deviation estimate
sqrt(nonparvarest)
```
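The `nonparvarest` quantity at the end of the chunk above is a difference-based variance estimate: after sorting the log Bayes factors by p, successive differences of a smooth-in-p signal plus noise are dominated by the noise, so half the average squared successive difference estimates the noise variance. A minimal standalone sketch of the same calculation (the function name `diff_var_est` is just for illustration):

```{r}
# Difference-based variance estimate: sort the responses by the covariate,
# then take half the mean of the squared successive differences
diff_var_est <- function(x, y) {
  ord <- order(x)
  ys <- y[ord]
  sum(diff(ys)^2) / (2 * length(ys))
}

# Example with noise of known variance around a smooth trend
xs <- sort(runif(200))
ys <- sin(2 * pi * xs) + rnorm(200, sd = 0.3)
diff_var_est(xs, ys)  # should be close to 0.3^2 = 0.09
```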
We are examining the log Bayes factor of the test that checks whether a standard uniform sample is distributed the same as a sample from a mixture of a standard uniform and a Beta(4, 4), where the mixing proportion p varies over (0, 1).
This is the raw data: both distributions have support on [0, 1] and are left untransformed.
The next plot repeats the analysis, but on the negative logs of the data, with the training data reflected about zero (the boundary of the transformed support).
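The reflection device is the usual boundary correction for kernel estimates: a plain kernel density estimate built from a sample supported on [0, Inf) leaks mass below zero, so it underestimates the density near the boundary, and augmenting the sample with its mirror image -y removes much of that bias. A minimal illustration using base R's `density()` (not the Hall kernel used in the chunks here):

```{r}
# Boundary bias of a plain KDE on [0, Inf), and the reflection fix
y <- rexp(1000)                           # support is [0, Inf)
naive <- density(y, from = 0, to = 3)     # plain KDE underestimates near 0
refl <- density(c(y, -y), from = 0, to = 3)
refl$y <- 2 * refl$y                      # reflected estimate, renormalized to [0, Inf)

# Compare both estimates to the true density at the boundary
c(naive = naive$y[1], reflected = refl$y[1], truth = dexp(0))
```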
```{r}
# Keep copies of the original data, then work with the negative-log transform
dataset1st = dataset1
dataset2st = dataset2
dataset1 = -log(dataset1)
dataset2 = -log(dataset2)

# Initial split, with the training data reflected about zero
XT1 <- c(dataset1[1:(length(dataset1) * .3)], -dataset1[1:(length(dataset1) * .3)])
XV1 <- dataset1[-(1:(length(dataset1) * .3))]
likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT1, x = XV1))) }
bwlik = optimize(f = function(h) { likvec(h) }, lower = 0, upper = 10, maximum = TRUE)
ExpectedKernML1 = laplace.kernH(y = XT1, x = XV1, hhat = bwlik$maximum)

CL1 <- makeCluster(16)
clusterExport(cl = CL1,
              list("XT1", "XV1", "dataset2", "logmarg.specialkernMCimport",
                   "ExpectedKernML1", "dlength", "logSumExp", "HallKernel",
                   "laplace.kernH2", "logintegrand.Hall", "loglike.KHall",
                   "KHall", "hessian", "dataset1"),
              envir = environment())
result <- parApply(cl = CL1, dataset2[1:500, ], 1, FUN = function(x) {
  BFi = 0
  for (j in 1:30) {
    trainingset = sample(1:dlength, size = dlength * .3)
    XT1 = c(dataset1[trainingset], -dataset1[trainingset])
    XV1 = dataset1[-trainingset]
    XT2 = c(x[trainingset], -x[trainingset])
    XV2 = x[-trainingset]
    likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT2, x = XV2))) }
    bwlik = optimize(f = function(h) { likvec(h) }, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML2 = laplace.kernH2(y = XT2, x = XV2, hhat = bwlik$maximum)
    likvec2 = function(h) { sum(log(HallKernel(h, datagen2 = c(XT1, XT2), x = c(XV1, XV2)))) }
    bwlik2 = optimize(f = function(h) { likvec2(h) }, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernMLcomb = laplace.kernH2(y = c(XT1, XT2), x = c(XV1, XV2), hhat = bwlik2$maximum)
    # Note: ExpectedKernML1, computed once above on the fixed initial split,
    # is reused in every replicate here
    BFi = BFi + ExpectedKernML1[1] + ExpectedKernML2[1] - ExpectedKernMLcomb[1]
  }
  BFi = BFi / 30
  return(BFi)
})
stopCluster(CL1)

# lw1 = loess(result ~ p)
# j <- order(p)
# plot(p, result, xlab = "p", ylab = "Log Bayes Factor", pch = '*')
# lines(p[j], lw1$fitted[j], col = "red", lwd = 3)

dfCVBF2 = data.frame(logBF = result, p = p)
plot1 = ggplot(dfCVBF2, aes(x = p, y = logBF)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  geom_smooth(colour = "red", se = FALSE) +
  ylim(c(-50, 100)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/CVBFCompactimproved.pdf", plot = plot1, device = "pdf")

# Difference-based variance estimate of the log BFs after sorting by p
sorteddfCVBF2 = dfCVBF2[order(dfCVBF2$p), ]
nonparvarest = 0
for (j in 1:(nrow(sorteddfCVBF2) - 1)) {
  nonparvarest = nonparvarest + (sorteddfCVBF2[j + 1, 1] - sorteddfCVBF2[j, 1])^2
}
nonparvarest = nonparvarest / (2 * nrow(sorteddfCVBF2))
# Variance estimate
nonparvarest
# Standard deviation estimate
sqrt(nonparvarest)

# Restore the untransformed data
dataset1 = dataset1st
dataset2 = dataset2st
```
There is a change in the shape of the log Bayes factor curve after the transformation and reflection.
We now run the same comparison using the Kolmogorov-Smirnov (KS) test.
```{r}
# KS test p-values for each mixture sample against the uniform sample
kslist = c()
for (j in 1:500) {
  kslist[j] = ks.test(dataset1, dataset2[j, ])$p.value
}

# plot(p[1:500], log(kslist), xlab = "p", ylab = "p-values from KS test", pch = '*')
# abline(a = log(0.05), b = 0, col = "blue")

dfKS = data.frame(logks = log(kslist), p = p)
plot1 = ggplot(dfKS, aes(x = p, y = logks)) +
  geom_point() +
  geom_hline(yintercept = log(.05), color = "blue") +
  ylim(c(-50, 100)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/KSCompact.pdf", plot = plot1, device = "pdf")

# Convert each KS p-value to a Sellke-type "B-value"
Blist = c()
e = exp(1)
for (j in 1:500) {
  if (kslist[j] < 1 / e) {
    Blist[j] = 1 / (-e * kslist[j] * log(kslist[j]))
  } else {
    Blist[j] = 1
  }
  if (kslist[j] == 0) {
    Blist[j] = Inf
  }
}
Blist = log(Blist)

dfKS = data.frame(logks = log(kslist), p = p, logB = Blist)
ggplot(dfKS, aes(x = p, y = logks)) +
  geom_point() +
  geom_hline(yintercept = log(.05), color = "blue") +
  ylim(c(-50, 100)) +
  geom_smooth(se = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))
ggplot(dfKS, aes(x = p, y = logB)) +
  geom_point() +
  geom_hline(yintercept = 1, color = "blue") +
  ylim(c(-50, 100)) +
  geom_smooth(se = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))

sum(Blist == 0)  # This is 99

plot1 = ggplot(dfKS, aes(x = p, y = logB)) +
  geom_point() +
  geom_hline(yintercept = 1, color = "blue") +
  ylim(c(-70, 50)) +
  geom_smooth(se = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))
ggsave("BayesSimPlots/KSBCompact.pdf", plot = plot1, device = "pdf")
```
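The `Blist` computation above applies the Sellke, Bayarri and Berger calibration to each KS p-value, giving the "B-value" plotted on the log scale:

$$
B(p) \;=\;
\begin{cases}
\dfrac{1}{-\,e\,p\,\log p}, & p < 1/e,\\[1.5ex]
1, & p \ge 1/e.
\end{cases}
$$

Under that calibration, $B(p)$ is interpreted as an upper bound on the Bayes factor against the null that a p-value of $p$ can support; the code sets $B = \infty$ when the reported p-value is exactly zero.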
We compare this to the Polya tree test with a normal base distribution.
```{r}
PTlist = c()
for (j in 1:500) {
  meancombineddataset = mean(c(dataset1, dataset2[j, ]))
  sdcombineddataset = sd(c(dataset1, dataset2[j, ]))
  dataset1adj = (dataset1 - meancombineddataset) / sdcombineddataset
  dataset2adj = (dataset2[j, ] - meancombineddataset) / sdcombineddataset
  PTlist[j] = PolyaTreetest(dataset1adj, dataset2adj, Ginv = qnorm, c = 1, leveltot = 10)$logBF
}

plot(p[1:500], PTlist, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*')
abline(a = 0, b = 0, col = "blue")

dfPT = data.frame(logBF = PTlist, p = p)
plot1 = ggplot(dfPT, aes(x = p, y = logBF)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  geom_smooth(colour = "red", se = FALSE) +
  ylim(c(-50, 100)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/PTNCompact.pdf", plot = plot1, device = "pdf")

sorteddfPT = dfPT[order(dfPT$p), ]
nonparvarest = 0
for (j in 1:(nrow(sorteddfPT) - 1)) {
  nonparvarest = nonparvarest + (sorteddfPT[j + 1, 1] - sorteddfPT[j, 1])^2
}
nonparvarest = nonparvarest / (2 * nrow(sorteddfPT))
# Variance estimate
nonparvarest
# Standard deviation estimate
sqrt(nonparvarest)
```
We compare this to the Polya tree test with a Cauchy base distribution.
```{r}
PTlist2 = c()
for (j in 1:500) {
  mediancombineddataset = median(c(dataset1, dataset2[j, ]))
  IQRcombineddataset = IQR(c(dataset1, dataset2[j, ]))
  dataset1adj = (dataset1 - mediancombineddataset) / (IQRcombineddataset / 1.36)
  dataset2adj = (dataset2[j, ] - mediancombineddataset) / (IQRcombineddataset / 1.36)
  PTlist2[j] = PolyaTreetest(dataset1adj, dataset2adj, Ginv = qcauchy, c = 1, leveltot = 10)$logBF
}

# plot(p[1:500], PTlist2, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*')
# abline(a = 0, b = 0, col = "blue")

dfPT2 = data.frame(logBF = PTlist2, p = p)
plot1 = ggplot(dfPT2, aes(x = p, y = logBF)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  geom_smooth(colour = "red", se = FALSE) +
  ylim(c(-50, 100)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/PTCCompact.pdf", plot = plot1, device = "pdf")

sorteddfPT2 = dfPT2[order(dfPT2$p), ]
nonparvarest = 0
for (j in 1:(nrow(sorteddfPT2) - 1)) {
  nonparvarest = nonparvarest + (sorteddfPT2[j + 1, 1] - sorteddfPT2[j, 1])^2
}
nonparvarest = nonparvarest / (2 * nrow(sorteddfPT2))
# Variance estimate
nonparvarest
# Standard deviation estimate
sqrt(nonparvarest)
```
It looks like the CVBF with data reflection performs best.
```{r}
dfall = data.frame(PTC = dfPT2$logBF, PTN = dfPT$logBF, CVBFAvg = dfCVBF$logBF,
                   logKSb = dfKS$logB, p = dfCVBF$p, CVBFAvgAdj = dfCVBF2$logBF)
write.csv(dfall, "logBFandKSvaluesforcompact.csv")

plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  # geom_point() +
  # geom_point(aes(p, PTC), color = "purple") +
  # geom_point(aes(p, PTN), color = "green") +
  # geom_point(aes(p, logKSb), color = "red") +
  # geom_point(aes(p, CVBFAvgAdj), color = "cyan") +
  geom_smooth(aes(p, PTC), color = "pink", se = FALSE) +
  geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) +
  geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) +
  geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) +
  geom_smooth(aes(p, CVBFAvgAdj), color = "brown", se = FALSE) +
  labs(x = "p", y = "log(BF)") +
  ylim(c(-50, 80)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/jointlogBFcompact.pdf", plot = plot1, device = "pdf")

plot2 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  geom_point() +
  geom_point(aes(p, PTC), color = "purple") +
  geom_point(aes(p, PTN), color = "green") +
  geom_point(aes(p, logKSb), color = "red") +
  geom_point(aes(p, CVBFAvgAdj), color = "cyan") +
  geom_smooth(aes(p, PTC), color = "pink", se = FALSE) +
  geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) +
  geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) +
  geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) +
  geom_smooth(aes(p, CVBFAvgAdj), color = "brown", se = FALSE) +
  labs(x = "p", y = "log(BF)") +
  ylim(c(-50, 100)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))
ggsave("BayesSimPlots/jointlogBFcompactfull.pdf", plot = plot2, device = "pdf")
```
```{r}
dfall = read.csv("logBFandKSvaluesforcompact.csv")

colorBlindGrey8 <- c("#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#009E73")
colors <- c("Polya Tree Cauchy Base" = colorBlindGrey8[1],
            "Polya Tree Normal Base" = colorBlindGrey8[2],
            "Average CVBF of 30 splits" = colorBlindGrey8[3],
            "Sellke's KS test 'B-value'" = colorBlindGrey8[4],
            "Data Reflection Adjusted CVBF" = colorBlindGrey8[5])

plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  # geom_point() +
  # geom_point(aes(p, PTC), color = "purple") +
  # geom_point(aes(p, PTN), color = "green") +
  # geom_point(aes(p, logKSb), color = "red") +
  geom_smooth(aes(p, PTC, color = "Polya Tree Cauchy Base"), linetype = 1, se = FALSE) +
  geom_smooth(aes(p, PTN, color = "Polya Tree Normal Base"), linetype = 2, se = FALSE) +
  geom_smooth(aes(p, CVBFAvg, color = "Average CVBF of 30 splits"), linetype = 6, se = FALSE) +
  geom_smooth(aes(p, logKSb, color = "Sellke's KS test 'B-value'"), linetype = 4, se = FALSE) +
  geom_smooth(aes(p, CVBFAvgAdj, color = "Data Reflection Adjusted CVBF"), se = FALSE) +
  labs(x = "p", y = "log(BF)", color = "Legend") +
  ylim(c(-50, 80)) +
  scale_color_manual(values = colors) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/jointlogBFcompactLegend.pdf", plot = plot1, device = "pdf")
```