knitr::opts_chunk$set(echo = TRUE)

Long Tail Experiments

It has been noticed that the Gaussian Kernel is unstable if the data comes from a very long tail distribution.

If the data comes from a Cauchy distribution, the bayes factor, in general, is unstable.

We verify that switching Kernels, in general, seems to fix problems.

We try to show this in plots and by using "BayesSim"

We now show the "Cauchy v Cauchy problem". The next plot shown will be CVBFs for the Cauchy vs Cauchy problem.

library(parallel)
library(ggplot2)
source("MarginalLikIntfunctions.R")
source("Laplacefunction.R")
set.seed(1000)

p = rbeta(500,.5,.5)
dlength = 400

dataset1 <- rcauchy(dlength)
dataset2 <- matrix(data = NA, nrow = length(p),ncol = dlength)
for(i in 1:length(p))
{
  unifdraw = runif(dlength)
  for(j in 1:dlength)
  {
    if(unifdraw[j] > p[i])
    {
      dataset2[i,j] = rcauchy(1)
    }
    else
    {
      dataset2[i,j] = rcauchy(1,location = 1, scale = 1)
    }
  }
}
XT1 <- dataset1[1:(length(dataset1)*.3)]
XV1 <- dataset1[-(1:(length(dataset1)*.3))]

likvec = function(h) {sum(log(HallKernel(h,datagen2 = XT1, x = XV1)))}
bwlik = optimize(f = function(h){  likvec(h)}, lower = 0, upper = 10, maximum = TRUE)
ExpectedKernML1 = laplace.kernH2(y = XT1, x = XV1, hhat = bwlik$maximum)

CL1 <- makeCluster(16)
clusterExport(cl = CL1, list("XT1","XV1","dataset2","logmarg.specialkernMCimport", "ExpectedKernML1", "dlength", "logSumExp", "HallKernel", "laplace.kernH2", "logintegrand.Hall", "loglike.KHall", "KHall", "hessian", "dataset1"), envir = environment())
result <- parApply(cl = CL1, dataset2[1:500,], 1, FUN = function(x){
  BFi = 0
  for(j in 1:30)
  {
    trainingset = sample(1:dlength, size = dlength*.3)
    XT1 = dataset1[trainingset]
    XV1 = dataset1[-trainingset]
    XT2 = x[trainingset]
    XV2 = x[-trainingset] 
    likvec = function(h) {sum(log(HallKernel(h,datagen2 = XT1, x = XV1)))}
    bwlik = optimize(f = function(h){  likvec(h)}, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML1 = laplace.kernH2(y = XT1, x = XV1, hhat = bwlik$maximum)

    likvec = function(h) {sum(log(HallKernel(h,datagen2 = XT2, x = XV2)))}
    bwlik = optimize(f = function(h){  likvec(h)}, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML2 = laplace.kernH2(y = XT2, x = XV2, hhat = bwlik$maximum)

    likvec2 = function(h) {sum(log(HallKernel(h,datagen2 = c(XT1,XT2), x = c(XV1,XV2))))}
    bwlik2 = optimize(f = function(h){  likvec2(h)}, lower = 0, upper = 10, maximum = TRUE)

    ExpectedKernMLcomb = laplace.kernH2(y = c(XT1,XT2), x = c(XV1,XV2), hhat = bwlik2$maximum)

    BFi = BFi + ExpectedKernML1[1] + ExpectedKernML2[1] - ExpectedKernMLcomb[1]
  }
  BFi = BFi / 30
  return(BFi)
})
stopCluster(CL1)
#lm4 <- lm(result~p[1:100])
# lw1 = loess(result~p)
# j <- order(p)
# plot(p,result, xlab = "p", ylab = "Log Bayes Factor", pch = '*')
# lines(p[j],lw1$fitted[j],col="red",lwd=3)
# 
# abline(a = 0, b = 0)

dfCVBF = data.frame(logBF = result, p = p)
plot1 = ggplot(dfCVBF, aes(x = p, y = logBF)) + geom_point()  + geom_hline(yintercept = 0, color = "blue", se = FALSE) + geom_smooth(colour = "red", se = FALSE) + ylim(c(-70, 50)) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))
plot1
ggsave("BayesSimPlots/CVBFLongvLong.pdf",plot = plot1, device = "pdf")

sorteddfCVBF = dfCVBF[order(dfCVBF$p),]
nonparvarest = 0
for(j in 1:(nrow(sorteddfCVBF) - 1) )
{
  nonparvarest = nonparvarest + (sorteddfCVBF[j + 1, 1] - sorteddfCVBF[j, 1])^2
}
nonparvarest = nonparvarest / (2* nrow(sorteddfCVBF))

#A variance estimate

nonparvarest

#A standard deviation estimate

sqrt(nonparvarest)

#abline(lm4, col = "blue")

We are examining the log BF, of the test that checks whether a standard Cauchy is the same as a mixture of a standard Cauchy and a Cauchy with higher location parameter.

kslist = c()
for(j in 1:500)
{
  kslist[j] =  ks.test(dataset1,dataset2[j,])$p
}

dfKS = data.frame(logks = log(kslist), p = p)

plot1 = ggplot(dfKS, aes(x = p, y = logks)) + geom_point()  + geom_hline(yintercept = log(.05), color = "blue") + geom_smooth(se = FALSE) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))
plot1
ggsave("BayesSimPlots/KSLongvLong.pdf",plot = plot1, device = "pdf")

Blist = c()
e = exp(1)
for(j in 1:500)
{
  if(kslist[j] < 1/e)
  {
    Blist[j] = 1 / (-e * kslist[j] * log(kslist[j]))
  }
  else{
    Blist[j] = 1
  }
  if(kslist[j] == 0)
  {
    Blist[j] = Inf
  }
}
Blist = log(Blist)

dfKS = data.frame(logks = log(kslist), p = p, logB = Blist)

ggplot(dfKS, aes(x = p, y = logks)) + geom_point()  + geom_hline(yintercept = log(.05), color = "blue") + ylim(c(-50,100)) + geom_smooth(se = FALSE) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

ggplot(dfKS, aes(x = p, y = logB)) + geom_point()  + geom_hline(yintercept = 1, color = "blue") + ylim(c(-50,100)) + geom_smooth(se = FALSE) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

sum(Blist == 0)

#This is 99

plot1 = ggplot(dfKS, aes(x = p, y = logB)) + geom_point()  + geom_hline(yintercept = 1, color = "blue") + ylim(c(-70, 50)) + geom_smooth(se = FALSE) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

ggsave("BayesSimPlots/KSBlongvlong.pdf", plot = plot1, device = "pdf")


#plot(p[1:500],log(kslist), xlab = "p", ylab = "log(p-values) from KS test", pch = '*')
#abline(a=log(0.05),b=0, col = "blue")

We also compare this to the BFs produced by the Polya tree test if the base distribution is normal.

PTlist = c()
for(j in 1:500)
{
  mediancombineddataset = median(c(dataset1, dataset2[j,]))
  IQRcombineddataset = IQR(c(dataset1, dataset2[j,]))
  dataset1adj = (dataset1 - mediancombineddataset) / (IQRcombineddataset / 1.36)
  dataset2adj = (dataset2[j,] - mediancombineddataset) / (IQRcombineddataset / 1.36)
  PTlist[j] = PolyaTreetest(dataset1adj,dataset2adj, Ginv = qnorm, c = 1, leveltot = 10)$logBF
}
# plot(p[1:500],PTlist, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*')
# abline(a=0,b=0, col = "blue")

dfPT = data.frame(logBF = PTlist, p = p)
plot1 = ggplot(dfPT, aes(x = p, y = logBF)) + geom_point()  + geom_hline(yintercept = 0, color = "blue") + geom_smooth(colour = "red", se = FALSE) + ylim(c(-70, 50)) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))
plot1
ggsave("BayesSimPlots/PTNLongvLong.pdf",plot = plot1, device = "pdf")

sorteddfPT = dfPT[order(dfPT$p),]
nonparvarest = 0
for(j in 1:(nrow(sorteddfPT) - 1) )
{
  nonparvarest = nonparvarest + (sorteddfPT[j + 1, 1] - sorteddfPT[j, 1])^2
}
nonparvarest = nonparvarest / (2* nrow(sorteddfPT))

#A variance estimate

nonparvarest

#A standard deviation estimate

sqrt(nonparvarest)

We also compare this to the BFs produced by the Polya tree test if the base distribution is Cauchy.

#set.seed(100)
PTlist2 = c()
for(j in 1:500)
{
  mediancombineddataset = median(c(dataset1, dataset2[j,]))
  IQRcombineddataset = IQR(c(dataset1, dataset2[j,]))
  dataset1adj = (dataset1 - mediancombineddataset) / (IQRcombineddataset / 1.36)
  dataset2adj = (dataset2[j,] - mediancombineddataset) / (IQRcombineddataset / 1.36)
  PTlist2[j] = PolyaTreetest(dataset1adj,dataset2adj, Ginv = qcauchy, c = 1, leveltot = 10)$logBF
}
plot(p[1:500],PTlist2, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*')
abline(a=0,b=0, col = "blue")
abline(a=-2.99,b=0, col = "green")

dfPT2 = data.frame(logBF = PTlist2, p = p)
plot1 = ggplot(dfPT2, aes(x = p, y = logBF)) + geom_point()  + geom_hline(yintercept = 0, color = "blue") + geom_smooth(colour = "red", se = FALSE) + ylim(c(-70, 50)) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))
plot1
ggsave("BayesSimPlots/PTCLongvLong2.pdf",plot = plot1, device = "pdf",  width = 7, height = 5)
sorteddfPT = dfPT2[order(dfPT$p),]
nonparvarest = 0
for(j in 1:(nrow(sorteddfPT) - 1) )
{
  nonparvarest = nonparvarest + (sorteddfPT[j + 1, 1] - sorteddfPT[j, 1])^2
}
nonparvarest = nonparvarest / (2* nrow(sorteddfPT))

#A variance estimate

nonparvarest

#A standard deviation estimate

sqrt(nonparvarest)

We can now compare all plots. The Polya tree that uses a Cauchy base distribution is more variable but seems to ascent more sharply than CVBF.

dfall = data.frame(PTC = dfPT2$logBF, PTN = dfPT$logBF, CVBFAvg = dfCVBF$logBF, logKSb = dfKS$logB, p = dfCVBF$p)
write.csv(dfall, file = "logBFsandKSvaluesforlongvlong")

plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  geom_smooth(aes(p, PTC), color = "pink", se = FALSE) +
  geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) +
  geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) +
  geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) +
  labs(x = "p", y = "log(BF)") + ylim(c(-70, 50)) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

plot1
ggsave("BayesSimPlots/jointlogBFLongvLong.pdf",plot = plot1, device = "pdf")

plot2 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  geom_point() +
  geom_point(aes(p, PTC), color = "purple") +
  geom_point(aes(p, PTN), color = "green") +
  geom_point(aes(p, logKSb), color = "red") +
  geom_smooth(aes(p, PTC), color = "pink", se = FALSE) +
  geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) +
  geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) +
  geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) +
  labs(x = "p", y = "log(BF)") + ylim(c(-70, 50)) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

plot2
ggsave("BayesSimPlots/jointlogBFLongvLongfull.pdf",plot = plot2, device = "pdf")
dfall = read.csv("logBFsandKSvaluesforlongvlong.csv")
colorBlindGrey8   <- c("#F0E442", "#0072B2", "#D55E00", "#CC79A7")
colors <- c("Polya Tree Cauchy Base"=colorBlindGrey8[1], "Polya Tree Normal Base"=colorBlindGrey8[2], "Average CVBF of 30 splits"=colorBlindGrey8[3], "Selke's KS test 'B-value'" = colorBlindGrey8[4])
plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
#  geom_point() +
  # geom_point(aes(p, PTC), color = "purple") +
  # geom_point(aes(p, PTN), color = "green") +
  # geom_point(aes(p, logKSb), color = "red") +
  geom_smooth(aes(p, PTC, color = "Polya Tree Cauchy Base"), linetype = 1, se = FALSE) +
  geom_smooth(aes(p, PTN, color = "Polya Tree Normal Base"), linetype = 2, se = FALSE) +
  geom_smooth(aes(p, CVBFAvg, color = "Average CVBF of 30 splits"), linetype = 6, se = FALSE) +
  geom_smooth(aes(p, logKSb, color = "Selke's KS test 'B-value'"), linetype = 4, se = FALSE) +
  labs(x = "p", y = "log(BF)", color = "Legend") + ylim(c(-70, 50)) + scale_color_manual(values=colors) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

plot1

ggsave("BayesSimPlots/jointlogBFLongvLongLegend.pdf",plot = plot1, device = "pdf")


naveedmerchant/BayesScreening documentation built on June 13, 2024, 7:56 a.m.