knitr::opts_chunk$set(echo = TRUE)
It has been noticed that the Gaussian Kernel is unstable if the data comes from a very long tail distribution.
If the data comes from a Cauchy distribution, the bayes factor, in general, is unstable.
We verify that switching Kernels, in general, seems to fix problems.
We try to show this in plots and by using "BayesSim"
We now show the "Cauchy v Gaussian problem"
library(parallel) library(ggplot2) source("MarginalLikIntfunctions.R") source("Laplacefunction.R") set.seed(1000) p = rbeta(500,.5,.5) dlength = 400 dataset1 <- rcauchy(dlength) dataset2 <- matrix(data = NA, nrow = length(p),ncol = dlength) for(i in 1:length(p)) { unifdraw = runif(dlength) for(j in 1:dlength) { if(unifdraw[j] > p[i]) { dataset2[i,j] = rcauchy(1) } else { dataset2[i,j] = rnorm(1, mean = 0, sd = 1.4826) } } } XT1 <- dataset1[1:(length(dataset1)*.3)] XV1 <- dataset1[-(1:(length(dataset1)*.3))] likvec = function(h) {sum(log(HallKernel(h,datagen2 = XT1, x = XV1)))} bwlik = optimize(f = function(h){ likvec(h)}, lower = 0, upper = 10, maximum = TRUE) ExpectedKernML1 = laplace.kernH2(y = XT1, x = XV1, hhat = bwlik$maximum) CL1 <- makeCluster(16) clusterExport(cl = CL1, list("XT1","XV1","dataset2","logmarg.specialkernMCimport", "ExpectedKernML1", "dlength", "logSumExp", "HallKernel", "laplace.kernH2", "logintegrand.Hall", "loglike.KHall", "KHall", "hessian", "dataset1"), envir = environment()) result <- parApply(cl = CL1, dataset2[1:500,], 1, FUN = function(x){ BFi = 0 for(j in 1:30) { trainingset = sample(1:dlength, size = dlength*.3) XT1 = dataset1[trainingset] XV1 = dataset1[-trainingset] XT2 = x[trainingset] XV2 = x[-trainingset] likvec = function(h) {sum(log(HallKernel(h,datagen2 = XT1, x = XV1)))} bwlik = optimize(f = function(h){ likvec(h)}, lower = 0, upper = 10, maximum = TRUE) ExpectedKernML1 = laplace.kernH2(y = XT1, x = XV1, hhat = bwlik$maximum) likvec = function(h) {sum(log(HallKernel(h,datagen2 = XT2, x = XV2)))} bwlik = optimize(f = function(h){ likvec(h)}, lower = 0, upper = 10, maximum = TRUE) ExpectedKernML2 = laplace.kernH2(y = XT2, x = XV2, hhat = bwlik$maximum) likvec2 = function(h) {sum(log(HallKernel(h,datagen2 = c(XT1,XT2), x = c(XV1,XV2))))} bwlik2 = optimize(f = function(h){ likvec2(h)}, lower = 0, upper = 10, maximum = TRUE) ExpectedKernMLcomb = laplace.kernH2(y = c(XT1,XT2), x = c(XV1,XV2), hhat = bwlik2$maximum) BFi = BFi + ExpectedKernML1[1] + ExpectedKernML2[1] - ExpectedKernMLcomb[1] } BFi = BFi / 30 return(BFi) }) stopCluster(CL1)# lm4 <- lm(result~p) # lw1 = loess(result~p) # j <- order(p) # plot(p,result, xlab = "p", ylab = "Log Bayes Factor", pch = '*') # lines(p[j],lw1$fitted[j],col="red",lwd=3) # # # abline(a = 0, b = 0) dfCVBF = data.frame(logBF = result, p = p) plot1 = ggplot(dfCVBF, aes(x = p, y = logBF)) + geom_point() + geom_hline(yintercept = 0, color = "blue") + geom_smooth(colour = "red", se = FALSE) + ylim(c(-70, 30)) + theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12)) plot1 ggsave("BayesSimPlots/CVBFShortvLong.pdf",plot = plot1, device = "pdf") #abline(lm4, col = "blue") sorteddfCVBF = dfCVBF[order(dfCVBF$p),] nonparvarest = 0 for(j in 1:(nrow(sorteddfCVBF) - 1) ) { nonparvarest = nonparvarest + (sorteddfCVBF[j + 1, 1] - sorteddfCVBF[j, 1])^2 } nonparvarest = nonparvarest / (2* nrow(sorteddfCVBF)) #A variance estimate nonparvarest #A standard deviation estimate sqrt(nonparvarest)
We are examining the log BF, of the test that checks whether a standard Cauchy is the same as a mixture of a standard Cauchy and a normal whose IQR is roughly the same as the Cauchy.
kslist = c() for(j in 1:500) { kslist[j] = ks.test(dataset1,dataset2[j,])$p } Blist = c() e = exp(1) for(j in 1:500) { if(kslist[j] < 1/e) { Blist[j] = 1 / (-e * kslist[j] * log(kslist[j])) } else{ Blist[j] = 1 } if(kslist[j] == 0) { Blist[j] = Inf } } Blist = log(Blist) sum(Blist == 0) ###297 # plot(p[1:500],log(kslist), xlab = "p", ylab = "log p-values from KS test", pch = '*') # abline(a=log(0.05),b=0, col = "blue") dfKS = data.frame(logks = log(kslist), p = p, logB = Blist) plot1 = ggplot(dfKS, aes(x = p, y = logks)) + geom_point() + geom_hline(yintercept = log(.05), color = "blue") + ylim(c(-70, 30)) + theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12)) plot1 ggsave("BayesSimPlots/KSShortvLong.pdf",plot = plot1, device = "pdf") plot2 = ggplot(dfKS, aes(x = p, y = logB)) + geom_point() + geom_hline(yintercept = 1, color = "blue") + ylim(c(-70, 30)) + geom_smooth(se = FALSE) + theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12)) plot2 ggsave("BayesSimPlots/KSBShortvLong.pdf",plot = plot2, device = "pdf")
We also compare this to the BFs produced by the Polya tree test.
We center and scale before proceeding. The below file code deals with the normal distribution being used as a base distribution.
PTlist = c() for(j in 1:500) { mediancombineddataset = median(c(dataset1, dataset2[j,])) IQRcombineddataset = IQR(c(dataset1, dataset2[j,])) dataset1adj = (dataset1 - mediancombineddataset) / (IQRcombineddataset / 1.36) dataset2adj = (dataset2[j,] - mediancombineddataset) / (IQRcombineddataset / 1.36) PTlist[j] = PolyaTreetest(dataset1adj,dataset2adj, Ginv = qnorm, c = 1, leveltot = 10)$logBF } # plot(p[1:500],PTlist, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*') # abline(a=0,b=0, col = "blue") dfPT = data.frame(logBF = PTlist, p = p) plot1 = ggplot(dfPT, aes(x = p, y = logBF)) + geom_point() + geom_hline(yintercept = 0, color = "blue") + geom_smooth(colour = "red", se = FALSE) + ylim(c(-70, 30)) + theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12)) plot1 ggsave("BayesSimPlots/PTNShortvLong.pdf",plot = plot1, device = "pdf") sorteddfPT = dfPT[order(dfPT$p),] nonparvarest = 0 for(j in 1:(nrow(sorteddfPT) - 1) ) { nonparvarest = nonparvarest + (sorteddfPT[j + 1, 1] - sorteddfPT[j, 1])^2 } nonparvarest = nonparvarest / (2* nrow(sorteddfPT)) #Variance estimate nonparvarest #Standard deviation estimate sqrt(nonparvarest)
How does this compare to preforming the same test but using a Cauchy quantile instead?
PTlist2 = rep(0, times = 500) for(j in 1:500) { mediancombineddataset = median(c(dataset1, dataset2[j,])) IQRcombineddataset = IQR(c(dataset1, dataset2[j,])) dataset1adj = (dataset1 - mediancombineddataset) / (IQRcombineddataset / 1.36) dataset2adj = (dataset2[j,] - mediancombineddataset) / (IQRcombineddataset / 1.36) PTlist2[j] = PolyaTreetest(dataset1adj,dataset2adj, Ginv = qcauchy, c = 1, leveltot = 10)$logBF } # plot(p[1:500],PTlist2, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*') # abline(a=0,b=0, col = "blue") # abline(a=-2.99,b=0, col = "green") dfPT2 = data.frame(logBF = PTlist2, p = p) plot1 = ggplot(dfPT2, aes(x = p, y = logBF)) + geom_point() + geom_hline(yintercept = 0, color = "blue") + geom_smooth(colour = "red", se = FALSE) + ylim(c(-70, 30)) + theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12)) plot1 ggsave("BayesSimPlots/PTCShortvLong.pdf",plot = plot1, device = "pdf") sorteddfPT = dfPT2[order(dfPT$p),] nonparvarest = 0 for(j in 1:(nrow(sorteddfPT) - 1) ) { nonparvarest = nonparvarest + (sorteddfPT[j + 1, 1] - sorteddfPT[j, 1])^2 } nonparvarest = nonparvarest / (2* nrow(sorteddfPT)) #Variance estimate nonparvarest #Standard deviation estimate sqrt(nonparvarest)
We can now compare all plots. The Polya tree that uses a Cauchy base distribution is variable but ascends above 0 corecctly. The Polya tree that uses a normal base distribution is also variable but crosses 0 late.
dfall = data.frame(PTC = dfPT2$logBF, PTN = dfPT$logBF, CVBFAvg = dfCVBF$logBF, logKSb = dfKS$logB, p = dfCVBF$p) write.csv(dfall, file = "logBFsandKSvaluesforshortvlong.csv") plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) + # geom_point() + # geom_point(aes(p, PTC), color = "purple") + # geom_point(aes(p, PTN), color = "green") + # geom_point(aes(p, logKSb), color = "red") + geom_smooth(aes(p, PTC), color = "pink", se = FALSE) + geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) + geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) + geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) + labs(x = "p", y = "log(BF)") + ylim(c(-70, 30)) + theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12)) plot1 ggsave("BayesSimPlots/jointlogBFShortvLong.pdf",plot = plot1, device = "pdf") + theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12)) plot2 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) + geom_point() + geom_point(aes(p, PTC), color = "purple") + geom_point(aes(p, PTN), color = "green") + geom_point(aes(p, logKSb), color = "red") + geom_smooth(aes(p, PTC), color = "pink", se = FALSE) + geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) + geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) + geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) + labs(x = "p", y = "log(BF)") + ylim(c(-70, 50)) + theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12)) plot2 ggsave("BayesSimPlots/jointlogBFLongvShortfull.pdf",plot = plot2, device = "pdf") + theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))
dfall = read.csv("logBFsandKSvaluesforshortvlong.csv") colorBlindGrey8 <- c("#F0E442", "#0072B2", "#D55E00", "#CC79A7") colors <- c("Polya Tree Cauchy Base"=colorBlindGrey8[1], "Polya Tree Normal Base"=colorBlindGrey8[2], "Average CVBF of 30 splits"=colorBlindGrey8[3], "Selke's KS test 'B-value'" = colorBlindGrey8[4]) plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) + # geom_point() + # geom_point(aes(p, PTC), color = "purple") + # geom_point(aes(p, PTN), color = "green") + # geom_point(aes(p, logKSb), color = "red") + geom_smooth(aes(p, PTC, color = "Polya Tree Cauchy Base"), linetype = 1, se = FALSE) + geom_smooth(aes(p, PTN, color = "Polya Tree Normal Base"), linetype = 2, se = FALSE) + geom_smooth(aes(p, CVBFAvg, color = "Average CVBF of 30 splits"), linetype = 6, se = FALSE) + geom_smooth(aes(p, logKSb, color = "Selke's KS test 'B-value'"), linetype = 4, se = FALSE) + labs(x = "p", y = "log(BF)", color = "Legend") + ylim(c(-70, 30)) + scale_color_manual(values=colors) + theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12)) plot1 ggsave("BayesSimPlots/jointlogBFShortvLongLegend.pdf",plot = plot1, device = "pdf")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.