```{r setup}
knitr::opts_chunk$set(echo = TRUE)
```
It has been observed that the Gaussian kernel is unstable when the data come from a very heavy-tailed distribution: if the data come from a Cauchy distribution, the Bayes factor is, in general, unstable. Switching kernels, in general, seems to fix these problems. Does the new kernel still work well with short-tailed data? We investigate this with plots and by using "BayesSim".
We now consider the "Gaussian vs. Gaussian" problem.
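Here `dataset1` is a standard Gaussian sample, and each row of `dataset2` is drawn from the mixture $(1 - p)\,N(0, 1) + p\,N(0, 4)$, so $p$ is the weight on the wider component. As a point of reference, a minimal vectorized sketch of this draw, equivalent to the loop in the chunk below (`rmix` is a name introduced here for illustration):

```{r}
# Draw n points from the mixture (1 - p) * N(0, 1) + p * N(0, 4);
# a point comes from the wider component with probability p.
rmix <- function(n, p) {
  wide <- runif(n) <= p
  rnorm(n, mean = 0, sd = ifelse(wide, 2, 1))
}
```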
```{r}
library(parallel)
library(ggplot2)
source("MarginalLikIntfunctions.R")
source("Laplacefunction.R")

set.seed(1000)

# Mixture weights and data: dataset1 is N(0, 1); row i of dataset2 mixes
# N(0, 1) and N(0, 4) with weight p[i] on the wider component.
p = rbeta(500, .5, .5)
dlength = 400
dataset1 <- rnorm(dlength)
dataset2 <- matrix(data = NA, nrow = length(p), ncol = dlength)
for (i in 1:length(p)) {
  unifdraw = runif(dlength)
  for (j in 1:dlength) {
    if (unifdraw[j] > p[i]) {
      dataset2[i, j] = rnorm(1)
    } else {
      dataset2[i, j] = rnorm(1, mean = 0, sd = sqrt(4))
    }
  }
}

# Initial 30/70 training/validation split; the bandwidth is chosen by
# maximizing the cross-validated log likelihood under the Hall kernel.
XT1 <- dataset1[1:(length(dataset1) * .3)]
XV1 <- dataset1[-(1:(length(dataset1) * .3))]
likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT1, x = XV1))) }
bwlik = optimize(f = function(h) { likvec(h) }, lower = 0, upper = 10, maximum = TRUE)
ExpectedKernML1 = laplace.kernH2(y = XT1, x = XV1, hhat = bwlik$maximum)

CL1 <- makeCluster(16)
clusterExport(cl = CL1,
              list("XT1", "XV1", "dataset2", "logmarg.specialkernMCimport",
                   "ExpectedKernML1", "dlength", "logSumExp", "HallKernel",
                   "laplace.kernH2", "logintegrand.Hall", "loglike.KHall",
                   "KHall", "hessian", "dataset1"),
              envir = environment())

# For each simulated dataset, average the log CVBF over 30 random splits.
result <- parApply(cl = CL1, dataset2[1:500, ], 1, FUN = function(x) {
  BFi = 0
  for (j in 1:30) {
    trainingset = sample(1:dlength, size = dlength * .3)
    XT1 = dataset1[trainingset]
    XV1 = dataset1[-trainingset]
    XT2 = x[trainingset]
    XV2 = x[-trainingset]
    likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT1, x = XV1))) }
    bwlik = optimize(f = function(h) { likvec(h) }, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML1 = laplace.kernH2(y = XT1, x = XV1, hhat = bwlik$maximum)
    likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT2, x = XV2))) }
    bwlik = optimize(f = function(h) { likvec(h) }, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML2 = laplace.kernH2(y = XT2, x = XV2, hhat = bwlik$maximum)
    likvec2 = function(h) { sum(log(HallKernel(h, datagen2 = c(XT1, XT2), x = c(XV1, XV2)))) }
    bwlik2 = optimize(f = function(h) { likvec2(h) }, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernMLcomb = laplace.kernH2(y = c(XT1, XT2), x = c(XV1, XV2), hhat = bwlik2$maximum)
    BFi = BFi + ExpectedKernML1[1] + ExpectedKernML2[1] - ExpectedKernMLcomb[1]
  }
  BFi = BFi / 30
  return(BFi)
})
stopCluster(CL1)

lm4 <- lm(result ~ p[1:500])

dfCVBF = data.frame(logBF = result, p = p)
plot1 = ggplot(dfCVBF, aes(x = p, y = logBF)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  geom_smooth(colour = "red", se = FALSE) +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/CVBFShortvShort.pdf", plot = plot1, device = "pdf")

# Nonparametric variance estimate from successive differences of the
# logBF values sorted by p
sorteddfCVBF = dfCVBF[order(dfCVBF$p), ]
nonparvarest = 0
for (j in 1:(nrow(sorteddfCVBF) - 1)) {
  nonparvarest = nonparvarest + (sorteddfCVBF[j + 1, 1] - sorteddfCVBF[j, 1])^2
}
nonparvarest = nonparvarest / (2 * nrow(sorteddfCVBF))
# Variance estimate
nonparvarest
# Standard deviation estimate
sqrt(nonparvarest)
```
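The "nonparametric variance estimate" at the end of the chunk is a successive-difference estimator: with the log Bayes factors $Y_{(1)}, \dots, Y_{(n)}$ sorted by $p$, it computes

$$\widehat{\sigma}^2 = \frac{1}{2n} \sum_{j=1}^{n-1} \left( Y_{(j+1)} - Y_{(j)} \right)^2,$$

which estimates the noise variance around the smooth trend in $p$ without having to fit that trend.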
We are examining the log CVBF for the test of whether a standard Gaussian sample and a sample from the mixture of a standard Gaussian with a higher-variance normal come from a common distribution.
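Concretely, each random split above contributes

$$\log \mathrm{BF} = \log m\!\left(X_1^V \mid X_1^T\right) + \log m\!\left(X_2^V \mid X_2^T\right) - \log m\!\left(X_1^V, X_2^V \mid X_1^T, X_2^T\right),$$

where $m(\cdot \mid \cdot)$ is the kernel marginal likelihood of the validation data given the training data (computed here by `laplace.kernH2`), and the reported value is the average over the 30 splits.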
Let's see how the Kolmogorov-Smirnov (KS) test does on this problem.
```{r}
kslist = c()
for (j in 1:500) {
  kslist[j] = ks.test(dataset1, dataset2[j, ])$p
}

# Calibrate the KS p-values to Bayes-factor-like "B-values"
Blist = c()
e = exp(1)
for (j in 1:500) {
  if (kslist[j] < 1 / e) {
    Blist[j] = 1 / (-e * kslist[j] * log(kslist[j]))
  } else {
    Blist[j] = 1
  }
  if (kslist[j] == 0) {
    Blist[j] = Inf
  }
}
Blist = log(Blist)
sum(Blist == 0) #### 137

dfKS = data.frame(logks = log(kslist), p = p, logB = Blist)
plot1 = ggplot(dfKS, aes(x = p, y = logks)) +
  geom_point() +
  geom_hline(yintercept = log(.05), color = "blue") +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/KSShortvShort.pdf", plot = plot1, device = "pdf")

plot2 = ggplot(dfKS, aes(x = p, y = logB)) +
  geom_point() +
  geom_hline(yintercept = 1, color = "blue") +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot2
ggsave("BayesSimPlots/KSBShortvShort.pdf", plot = plot2, device = "pdf")
```
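The "B-value" used above is the Sellke-Bayarri-Berger calibration of a p-value,

$$B(p) = \begin{cases} \dfrac{1}{-e\,p\,\log p}, & p < 1/e, \\[4pt] 1, & p \ge 1/e, \end{cases}$$

which can be read as a rough upper bound on the Bayes factor in favor of the alternative, so $\log B(p) = 0$ indicates no evidence against the null.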
What about the Polya tree test with a normal base distribution?
```{r}
PTlist = c()
for (j in 1:500) {
  # Standardize both samples by the combined median and IQR
  # (IQR / 1.36 approximates the standard deviation for normal data)
  mediancombineddataset = median(c(dataset1, dataset2[j, ]))
  IQRcombineddataset = IQR(c(dataset1, dataset2[j, ]))
  dataset1adj = (dataset1 - mediancombineddataset) / (IQRcombineddataset / 1.36)
  dataset2adj = (dataset2[j, ] - mediancombineddataset) / (IQRcombineddataset / 1.36)
  PTlist[j] = PolyaTreetest(dataset1adj, dataset2adj, Ginv = qnorm,
                            c = 1, leveltot = 9)$logBF
}

plot(p[1:500], PTlist, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*')
abline(a = 0, b = 0, col = "blue")

dfPT = data.frame(logBF = PTlist, p = p)
plot1 = ggplot(dfPT, aes(x = p, y = logBF)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  geom_smooth(colour = "red", se = FALSE) +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/PTNShortvShort.pdf", plot = plot1, device = "pdf")

# Successive-difference variance estimate, as for the CVBF above
sorteddfPT = dfPT[order(dfPT$p), ]
nonparvarest = 0
for (j in 1:(nrow(sorteddfPT) - 1)) {
  nonparvarest = nonparvarest + (sorteddfPT[j + 1, 1] - sorteddfPT[j, 1])^2
}
nonparvarest = nonparvarest / (2 * nrow(sorteddfPT))
# Variance estimate
nonparvarest
# Standard deviation estimate
sqrt(nonparvarest)
```
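For intuition about what `PolyaTreetest` computes, here is a minimal sketch of one standard construction of the two-sample Polya tree log Bayes factor, assuming Beta$(c\,m^2, c\,m^2)$ branch priors at depth $m$ and a partition cut at quantiles of the base distribution. This is an illustration under those assumptions, not the implementation sourced above, whose details (e.g. truncation and sign conventions) may differ:

```{r}
# Illustrative two-sample Polya tree log BF: positive values favor H1
# (different distributions). Assumes Beta(c * m^2, c * m^2) branch priors
# and partitions built from quantiles of the base distribution Ginv.
polyatree_logBF_sketch <- function(x, y, Ginv = qnorm, c = 1, leveltot = 9) {
  logBF = 0
  for (m in 1:leveltot) {
    a = c * m^2
    # 2^m cells at depth m, cut at base-distribution quantiles
    breaks = Ginv(seq(0, 1, length.out = 2^m + 1))
    nx = tabulate(cut(x, breaks = breaks, labels = FALSE), nbins = 2^m)
    ny = tabulate(cut(y, breaks = breaks, labels = FALSE), nbins = 2^m)
    # Each depth-m split sends a depth-(m - 1) cell into children (k, k + 1)
    for (k in seq(1, 2^m - 1, by = 2)) {
      # H1: separate branch probabilities for the two samples
      logH1 = lbeta(nx[k] + a, nx[k + 1] + a) +
              lbeta(ny[k] + a, ny[k + 1] + a) - 2 * lbeta(a, a)
      # H0: one shared branch probability for the pooled sample
      logH0 = lbeta(nx[k] + ny[k] + a, nx[k + 1] + ny[k + 1] + a) - lbeta(a, a)
      logBF = logBF + logH1 - logH0
    }
  }
  logBF
}
```

Under this construction it would be called on the standardized samples, e.g. `polyatree_logBF_sketch(dataset1adj, dataset2adj, Ginv = qnorm, c = 1, leveltot = 9)`, and positive values favor the two samples having different distributions, matching how the log BFs are plotted against the zero line above.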
What about the Polya tree test with a Cauchy base distribution?
```{r}
PTlist2 = c()
for (j in 1:500) {
  mediancombineddataset = median(c(dataset1, dataset2[j, ]))
  IQRcombineddataset = IQR(c(dataset1, dataset2[j, ]))
  dataset1adj = (dataset1 - mediancombineddataset) / (IQRcombineddataset / 1.36)
  dataset2adj = (dataset2[j, ] - mediancombineddataset) / (IQRcombineddataset / 1.36)
  PTlist2[j] = PolyaTreetest(dataset1adj, dataset2adj, Ginv = qcauchy,
                             c = 1, leveltot = 10)$logBF
}

plot(p[1:500], PTlist2, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*')
abline(a = 0, b = 0, col = "blue")

dfPT2 = data.frame(logBF = PTlist2, p = p)
plot1 = ggplot(dfPT2, aes(x = p, y = logBF)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  geom_smooth(colour = "red", se = FALSE) +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/PTCShortvShort.pdf", plot = plot1, device = "pdf")

sorteddfPT = dfPT2[order(dfPT2$p), ]
nonparvarest = 0
for (j in 1:(nrow(sorteddfPT) - 1)) {
  nonparvarest = nonparvarest + (sorteddfPT[j + 1, 1] - sorteddfPT[j, 1])^2
}
nonparvarest = nonparvarest / (2 * nrow(sorteddfPT))
# Variance estimate
nonparvarest
# Standard deviation estimate
sqrt(nonparvarest)
```
The Polya tree with a Cauchy base distribution crosses the zero line at a larger value of p than the other methods.
```{r}
dfall = data.frame(PTC = dfPT2$logBF, PTN = dfPT$logBF, CVBFAvg = dfCVBF$logBF,
                   logKSb = dfKS$logB, p = dfCVBF$p)
write.csv(dfall, file = "logBFsandKSvaluesforshortvshort.csv")

# Smooths only
plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  geom_smooth(aes(p, PTC), color = "pink", se = FALSE) +
  geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) +
  geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) +
  geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) +
  labs(x = "p", y = "log(BF)") +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/jointlogBFShortvShort.pdf", plot = plot1, device = "pdf")

# Points plus smooths
plot2 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  geom_point() +
  geom_point(aes(p, PTC), color = "purple") +
  geom_point(aes(p, PTN), color = "green") +
  geom_point(aes(p, logKSb), color = "red") +
  geom_smooth(aes(p, PTC), color = "pink", se = FALSE) +
  geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) +
  geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) +
  geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) +
  labs(x = "p", y = "log(BF)") +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot2
ggsave("BayesSimPlots/jointlogBFshortvshortfull.pdf", plot = plot2, device = "pdf")
```
```{r}
dfall = read.csv("logBFsandKSvaluesforshortvshort.csv")

# Colorblind-friendly palette with one color and linetype per method
colorBlindGrey8 <- c("#F0E442", "#0072B2", "#D55E00", "#CC79A7")
colors <- c("Polya Tree Cauchy Base" = colorBlindGrey8[1],
            "Polya Tree Normal Base" = colorBlindGrey8[2],
            "Average CVBF of 30 splits" = colorBlindGrey8[3],
            "Sellke's KS test 'B-value'" = colorBlindGrey8[4])

plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  geom_smooth(aes(p, PTC, color = "Polya Tree Cauchy Base"), linetype = 1, se = FALSE) +
  geom_smooth(aes(p, PTN, color = "Polya Tree Normal Base"), linetype = 2, se = FALSE) +
  geom_smooth(aes(p, CVBFAvg, color = "Average CVBF of 30 splits"), linetype = 6, se = FALSE) +
  geom_smooth(aes(p, logKSb, color = "Sellke's KS test 'B-value'"), linetype = 4, se = FALSE) +
  labs(x = "p", y = "log(BF)", color = "Legend") +
  ylim(c(-40, 60)) +
  scale_color_manual(values = colors) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/jointlogBFShortvShortLegend.pdf", plot = plot1, device = "pdf")
```