knitr::opts_chunk$set(echo = TRUE)

Compact Support Experiments

We are partly concerned with how well our test performs when the data come from a distribution with bounded support, i.e., an interval (a, b).

We examine how well it distinguishes the standard uniform from a mixture of a standard uniform and a symmetric beta distribution (Beta(4, 4) in the code below), which has the same mean but a different shape.
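To see why this is a hard alternative to detect, the following sketch (an illustration only, not part of the original analysis) overlays the mixture density (1 - p) Unif(0,1) + p Beta(4,4) on the uniform density for a few mixture weights.

# Illustration only: compare the mixture density used below with the Unif(0,1)
# density for a few values of the mixture weight.
curve(dunif(x), from = 0, to = 1, ylim = c(0, 2.3), lty = 2,
      xlab = "x", ylab = "density")
for (pmix in c(0.25, 0.5, 1)) {
  curve((1 - pmix) * dunif(x) + pmix * dbeta(x, 4, 4), add = TRUE)
}
legend("topright", legend = c("Unif(0,1)", "mixtures (p = 0.25, 0.5, 1)"),
       lty = c(2, 1), bty = "n")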

library(parallel)
library(ggplot2)
source("MarginalLikIntfunctions.R")
source("Laplacefunction.R")
set.seed(1000)

# Mixture weights for the 500 replicate datasets, drawn from a Beta(0.5, 0.5),
# and the sample size of each dataset.
p = rbeta(500,.5,.5)
dlength = 400

dataset1 <- runif(dlength)
dataset2 <- matrix(data = NA, nrow = length(p),ncol = dlength)
for(i in 1:length(p))
{
  unifdraw = runif(dlength)
  for(j in 1:dlength)
  {
    if(unifdraw[j] > p[i])
    {
      dataset2[i,j] = runif(1)
    }
    else
    {
      dataset2[i,j] = rbeta(1,shape1 = 4, shape2 = 4)
    }
  }
}
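For reference, the nested loop can be written more compactly; the helper below is a sketch of a vectorized equivalent (defining it draws no random numbers, so the simulated datasets used in the rest of the document are unaffected).

# Vectorized sketch of one row of the mixture simulation above: each of the n
# observations is a Beta(4, 4) draw with probability pmix and a Unif(0, 1)
# draw otherwise.
draw_mixture_row <- function(pmix, n) {
  frombeta <- runif(n) <= pmix
  ifelse(frombeta, rbeta(n, shape1 = 4, shape2 = 4), runif(n))
}
# Example use (not run here): dataset2[i, ] <- draw_mixture_row(p[i], dlength)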
XT1 <- dataset1[1:(length(dataset1)*.3)]
XV1 <- dataset1[-(1:(length(dataset1)*.3))]

# Choose the bandwidth by maximizing the cross-validated log-likelihood of the
# Hall kernel on a 30/70 training/validation split, then Laplace-approximate
# the kernel marginal likelihood at that bandwidth.
likvec = function(h) {sum(log(HallKernel(h,datagen2 = XT1, x = XV1)))}
bwlik = optimize(f = function(h){  likvec(h)}, lower = 0, upper = 10, maximum = TRUE)
ExpectedKernML1 = laplace.kernH(y = XT1, x = XV1, hhat = bwlik$maximum)

CL1 <- makeCluster(16)
clusterExport(cl = CL1, list("XT1","XV1","dataset2","logmarg.specialkernMCimport", "ExpectedKernML1", "dlength", "logSumExp", "HallKernel", "laplace.kernH2", "logintegrand.Hall", "loglike.KHall", "KHall", "hessian", "dataset1"), envir = environment())
result <- parApply(cl = CL1, dataset2[1:500,], 1, FUN = function(x){
  BFi = 0
  for(j in 1:30)
  {
    trainingset = sample(1:dlength, size = dlength*.3)
    XT1 = dataset1[trainingset]
    XV1 = dataset1[-trainingset]
    XT2 = x[trainingset]
    XV2 = x[-trainingset] 
    likvec = function(h) {sum(log(HallKernel(h,datagen2 = XT1, x = XV1)))}
    bwlik = optimize(f = function(h){  likvec(h)}, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML1 = laplace.kernH2(y = XT1, x = XV1, hhat = bwlik$maximum)

    likvec = function(h) {sum(log(HallKernel(h,datagen2 = XT2, x = XV2)))}
    bwlik = optimize(f = function(h){  likvec(h)}, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML2 = laplace.kernH2(y = XT2, x = XV2, hhat = bwlik$maximum)

    likvec2 = function(h) {sum(log(HallKernel(h,datagen2 = c(XT1,XT2), x = c(XV1,XV2))))}
    bwlik2 = optimize(f = function(h){  likvec2(h)}, lower = 0, upper = 10, maximum = TRUE)

    ExpectedKernMLcomb = laplace.kernH2(y = c(XT1,XT2), x = c(XV1,XV2), hhat = bwlik2$maximum)

    BFi = BFi + ExpectedKernML1[1] + ExpectedKernML2[1] - ExpectedKernMLcomb[1]
  }
  BFi = BFi / 30
  return(BFi)
})
stopCluster(CL1)
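For reference, each entry of `result` is the average over 30 random 30/70 splits of the log cross-validated Bayes factor that the loop accumulates,

$$
\overline{\log \mathrm{BF}}_i = \frac{1}{30}\sum_{s=1}^{30}\left\{\log \hat m\!\left(X^{V}_{1,s}\mid X^{T}_{1,s}\right) + \log \hat m\!\left(X^{V}_{2,s}\mid X^{T}_{2,s}\right) - \log \hat m\!\left(X^{V}_{1,s},X^{V}_{2,s}\mid X^{T}_{1,s},X^{T}_{2,s}\right)\right\},
$$

where $\hat m(\cdot \mid \cdot)$ is the Laplace-approximated kernel marginal likelihood of the validation data given the training data (the first element returned by `laplace.kernH2`). Positive values favor the two samples coming from different distributions.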


# lm4 <- lm(result~p[1:500])
# lw1 = loess(result~p)
# j <- order(p)
# plot(p,result, xlab = "p", ylab = "Log Bayes Factor", pch = '*')
# lines(p[j],lw1$fitted[j],col="red",lwd=3)
# 
# abline(a = 0, b = 0)

#abline(lm4, col = "blue")



dfCVBF = data.frame(logBF = result, p = p)
plot1 = ggplot(dfCVBF, aes(x = p, y = logBF)) + geom_point()  + geom_hline(yintercept = 0, color = "blue") + geom_smooth(colour = "red", se = FALSE) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))
plot1
ggsave("BayesSimPlots/CVBFCompactnaive.pdf",plot = plot1, device = "pdf")

sorteddfCVBF = dfCVBF[order(dfCVBF$p),]
nonparvarest = 0
for(j in 1:(nrow(sorteddfCVBF) - 1) )
{
  nonparvarest = nonparvarest + (sorteddfCVBF[j + 1, 1] - sorteddfCVBF[j, 1])^2
}
nonparvarest = nonparvarest / (2* nrow(sorteddfCVBF))

#Variance estimate

nonparvarest

#Standard deviation estimate

sqrt(nonparvarest)
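The quantity above is a first-order difference-based estimate of the variance of the log Bayes factors about their trend in p: with the $n = 500$ replicates sorted by p,

$$
\hat\sigma^2 = \frac{1}{2n}\sum_{j=1}^{n-1}\left(y_{(j+1)} - y_{(j)}\right)^2,
$$

where $y_{(j)}$ denotes the log Bayes factor of the replicate with the $j$-th smallest value of p. (Difference-based estimators of this type usually normalize by $2(n-1)$ rather than $2n$; the distinction is negligible at $n = 500$.)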

We are examining the log Bayes factor for the test of whether a standard uniform sample and a sample from the uniform/beta mixture come from the same distribution, as the mixture weight p varies over (0, 1).

This first plot uses the raw data; that is, both samples keep their original [0, 1] support and are not transformed.

The next plot repeats the analysis after transforming the data by taking negative logs and reflecting the training data about zero, to see whether this changes the behavior of the test.
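The reflection step is the usual boundary correction for kernel density estimates: after the $-\log$ transform the data live on $(0,\infty)$, and a kernel estimate built only from the training points $y_i$ leaks mass across the boundary at 0. Building it from the augmented sample $\{y_i\} \cup \{-y_i\}$ instead gives, up to the normalization used by `HallKernel`,

$$
\hat f(x) \propto \sum_{i=1}^{n}\left\{K_h(x - y_i) + K_h(x + y_i)\right\}, \qquad x > 0,
$$

which, restricted to $x > 0$, no longer loses mass across the boundary.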

dataset1st = dataset1
dataset2st = dataset2

dataset1 = -log(dataset1)
dataset2 = -log(dataset2)
XT1 <- c(dataset1[1:(length(dataset1)*.3)],-dataset1[1:(length(dataset1)*.3)])
XV1 <- dataset1[-(1:(length(dataset1)*.3))]

# Bandwidth selection and Laplace-approximated marginal likelihood for the
# reflected uniform training data; this value of ExpectedKernML1 is exported
# to the workers and reused for every split in the loop below.
likvec = function(h) {sum(log(HallKernel(h,datagen2 = XT1, x = XV1)))}
bwlik = optimize(f = function(h){  likvec(h)}, lower = 0, upper = 10, maximum = TRUE)
ExpectedKernML1 = laplace.kernH(y = XT1, x = XV1, hhat = bwlik$maximum)

CL1 <- makeCluster(16)
clusterExport(cl = CL1, list("XT1","XV1","dataset2","logmarg.specialkernMCimport", "ExpectedKernML1", "dlength", "logSumExp", "HallKernel", "laplace.kernH2", "logintegrand.Hall", "loglike.KHall", "KHall", "hessian", "dataset1"), envir = environment())
result <- parApply(cl = CL1, dataset2[1:500,], 1, FUN = function(x){
  BFi = 0
  for(j in 1:30)
  {
    trainingset = sample(1:dlength, size = dlength*.3)
    XT1 = c(dataset1[trainingset], -dataset1[trainingset])
    XV1 = dataset1[-trainingset]
    XT2 = c(x[trainingset], -x[trainingset])
    XV2 = x[-trainingset] 
    likvec = function(h) {sum(log(HallKernel(h,datagen2 = XT2, x = XV2)))}
    bwlik = optimize(f = function(h){  likvec(h)}, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML2 = laplace.kernH2(y = XT2, x = XV2, hhat = bwlik$maximum)

    likvec2 = function(h) {sum(log(HallKernel(h,datagen2 = c(XT1,XT2), x = c(XV1,XV2))))}
    bwlik2 = optimize(f = function(h){  likvec2(h)}, lower = 0, upper = 10, maximum = TRUE)

    ExpectedKernMLcomb = laplace.kernH2(y = c(XT1,XT2), x = c(XV1,XV2), hhat = bwlik2$maximum)

    BFi = BFi + ExpectedKernML1[1] + ExpectedKernML2[1] - ExpectedKernMLcomb[1]
  }
  BFi = BFi / 30
  return(BFi)
})
stopCluster(CL1)
# lw1 = loess(result~p)
# j <- order(p)
# plot(p,result, xlab = "p", ylab = "Log Bayes Factor", pch = '*')
# lines(p[j],lw1$fitted[j],col="red",lwd=3)

dfCVBF2 = data.frame(logBF = result, p = p)
plot1 = ggplot(dfCVBF2, aes(x = p, y = logBF)) + geom_point()  + geom_hline(yintercept = 0, color = "blue") + geom_smooth(colour = "red", se = FALSE) + ylim(c(-50,100)) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

plot1
ggsave("BayesSimPlots/CVBFCompactimproved.pdf",plot = plot1, device = "pdf")

sorteddfCVBF2 = dfCVBF2[order(dfCVBF2$p),]
nonparvarest = 0
for(j in 1:(nrow(sorteddfCVBF2) - 1) )
{
  nonparvarest = nonparvarest + (sorteddfCVBF2[j + 1, 1] - sorteddfCVBF2[j, 1])^2
}
nonparvarest = nonparvarest / (2* nrow(sorteddfCVBF2))

#Variance estimate

nonparvarest

#Standard deviation estimate

sqrt(nonparvarest)

#abline(lm4, col = "blue")
dataset1 = dataset1st
dataset2 = dataset2st

There is a clear change in the shape of the log Bayes factor curve after the transformation and reflection.

We now run the same comparison using the Kolmogorov-Smirnov (KS) test instead.

kslist = c()
for(j in 1:500)
{
  kslist[j] = ks.test(dataset1, dataset2[j,])$p.value
}
# plot(p[1:500],log(kslist), xlab = "p", ylab = "p-values from KS test", pch = '*')
# abline(a=log(0.05),b=0, col = "blue")

dfKS = data.frame(logks = log(kslist), p = p)
plot1 = ggplot(dfKS, aes(x = p, y = logks)) + geom_point()  + geom_hline(yintercept = log(.05), color = "blue") + ylim(c(-50,100)) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))
plot1
ggsave("BayesSimPlots/KSCompact.pdf",plot = plot1, device = "pdf")


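The next block converts each KS p-value into the calibration of Sellke, Bayarri, and Berger, which bounds the Bayes factor against the null that a p-value of a given size can support:

$$
\bar B(q) =
\begin{cases}
\dfrac{1}{-e\,q\log q}, & q < 1/e,\\[6pt]
1, & q \ge 1/e,
\end{cases}
$$

where $q$ is the KS p-value (written as $q$ to avoid confusion with the mixture weight $p$). Working with $\log \bar B$ puts it on the same scale as the log Bayes factors above.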
Blist = c()
e = exp(1)
for(j in 1:500)
{
  if(kslist[j] < 1/e)
  {
    Blist[j] = 1 / (-e * kslist[j] * log(kslist[j]))
  }
  else{
    Blist[j] = 1
  }
  if(kslist[j] == 0)
  {
    Blist[j] = Inf
  }
}
Blist = log(Blist)

dfKS = data.frame(logks = log(kslist), p = p, logB = Blist)

ggplot(dfKS, aes(x = p, y = logks)) + geom_point()  + geom_hline(yintercept = log(.05), color = "blue") + ylim(c(-50,100)) + geom_smooth(se = FALSE) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

ggplot(dfKS, aes(x = p, y = logB)) + geom_point()  + geom_hline(yintercept = 1, color = "blue") + ylim(c(-50,100)) + geom_smooth(se = FALSE) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

sum(Blist == 0)

#Number of replicates where the bound equals 1 (log bound of 0), i.e. the KS p-value is at least 1/e; this is 99 in this run

plot1 = ggplot(dfKS, aes(x = p, y = logB)) + geom_point()  + geom_hline(yintercept = 1, color = "blue") + ylim(c(-70, 50)) + geom_smooth(se = FALSE) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

ggsave("BayesSimPlots/KSBCompact.pdf", plot = plot1, device = "pdf")

We now compare with a Polya tree test using a normal base distribution; the data are standardized by the mean and standard deviation of the combined sample so that the base distribution is appropriately centered and scaled.

PTlist = c()
for(j in 1:500)
{
  meancombineddataset = mean(c(dataset1, dataset2[j,]))
  sdcombineddataset = sd(c(dataset1, dataset2[j,]))
  dataset1adj = (dataset1 - meancombineddataset) / sdcombineddataset
  dataset2adj = (dataset2[j,] - meancombineddataset) / sdcombineddataset
  PTlist[j] = PolyaTreetest(dataset1adj,dataset2adj, Ginv = qnorm, c = 1, leveltot = 10)$logBF
}
plot(p[1:500],PTlist, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*')
abline(a=0,b=0, col = "blue")



dfPT = data.frame(logBF = PTlist, p = p)
plot1 = ggplot(dfPT, aes(x = p, y = logBF)) + geom_point()  + geom_hline(yintercept = 0, color = "blue") + geom_smooth(colour = "red", se = FALSE) + ylim(c(-50,100)) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))
plot1
ggsave("BayesSimPlots/PTNCompact.pdf",plot = plot1, device = "pdf")

sorteddfPT = dfPT[order(dfPT$p),]
nonparvarest = 0
for(j in 1:(nrow(sorteddfPT) - 1) )
{
  nonparvarest = nonparvarest + (sorteddfPT[j + 1, 1] - sorteddfPT[j, 1])^2
}
nonparvarest = nonparvarest / (2* nrow(sorteddfPT))

#Variance estimate

nonparvarest

#Standard deviation estimate

sqrt(nonparvarest)

We also compare with a Polya tree test using a Cauchy base distribution, standardizing robustly with the median and the IQR of the combined sample.

PTlist2 = c()
for(j in 1:500)
{
  mediancombineddataset = median(c(dataset1, dataset2[j,]))
  IQRcombineddataset = IQR(c(dataset1, dataset2[j,]))
  dataset1adj = (dataset1 - mediancombineddataset) / (IQRcombineddataset / 1.36)
  dataset2adj = (dataset2[j,] - mediancombineddataset) / (IQRcombineddataset / 1.36)
  PTlist2[j] = PolyaTreetest(dataset1adj,dataset2adj, Ginv = qcauchy, c = 1, leveltot = 10)$logBF
}
# plot(p[1:500],PTlist2, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*')
# abline(a=0,b=0, col = "blue")

dfPT2 = data.frame(logBF = PTlist2, p = p)
plot1 = ggplot(dfPT2, aes(x = p, y = logBF)) + geom_point()  + geom_hline(yintercept = 0, color = "blue") + geom_smooth(colour = "red", se = FALSE) + ylim(c(-50,100)) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

plot1
ggsave("BayesSimPlots/PTCCompact.pdf",plot = plot1, device = "pdf")

sorteddfPT = dfPT2[order(dfPT2$p),]
nonparvarest = 0
for(j in 1:(nrow(sorteddfPT) - 1) )
{
  nonparvarest = nonparvarest + (sorteddfPT[j + 1, 1] - sorteddfPT[j, 1])^2
}
nonparvarest = nonparvarest / (2* nrow(sorteddfPT))

#Variance estimate

nonparvarest

#Standard deviation estimate

sqrt(nonparvarest)

Based on these plots, the CVBF with data reflection appears to perform best.

dfall = data.frame(PTC = dfPT2$logBF, PTN = dfPT$logBF, CVBFAvg = dfCVBF$logBF, logKSb = dfKS$logB, p = dfCVBF$p, CVBFAvgAdj = dfCVBF2$logBF)

write.csv(dfall, "logBFandKSvaluesforcompact.csv")

plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  # geom_point() +
  # geom_point(aes(p, PTC), color = "purple") +
  # geom_point(aes(p, PTN), color = "green") +
  # geom_point(aes(p, logKSb), color = "red") +
  # geom_point(aes(p, CVBFAvgAdj), color = "cyan") +
  geom_smooth(aes(p, PTC), color = "pink", se = FALSE) +
  geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) +
  geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) +
  geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) +
  geom_smooth(aes(p, CVBFAvgAdj), color = "brown", se = FALSE) +
  labs(x = "p", y = "log(BF)") + ylim(c(-50,80)) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

plot1
ggsave("BayesSimPlots/jointlogBFcompact.pdf",plot = plot1, device = "pdf")

plot2 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  geom_point() +
  geom_point(aes(p, PTC), color = "purple") +
  geom_point(aes(p, PTN), color = "green") +
  geom_point(aes(p, logKSb), color = "red") +
  geom_point(aes(p, CVBFAvgAdj), color = "cyan") +
  geom_smooth(aes(p, PTC), color = "pink", se = FALSE) +
  geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) +
  geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) +
  geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) +
  geom_smooth(aes(p, CVBFAvgAdj), color = "brown", se = FALSE) +
  labs(x = "p", y = "log(BF)") + ylim(c(-50,100)) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

ggsave("BayesSimPlots/jointlogBFcompactfull.pdf",plot = plot2, device = "pdf")
dfall = read.csv("logBFandKSvaluesforcompact.csv")
colorBlindGrey8   <- c("#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#009E73")
colors <- c("Polya Tree Cauchy Base"=colorBlindGrey8[1], "Polya Tree Normal Base"=colorBlindGrey8[2], "Average CVBF of 30 splits"=colorBlindGrey8[3], "Sellke's KS test 'B-value'" = colorBlindGrey8[4], "Data Reflection Adjusted CVBF" = colorBlindGrey8[5] )


plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
#  geom_point() +
  # geom_point(aes(p, PTC), color = "purple") +
  # geom_point(aes(p, PTN), color = "green") +
  # geom_point(aes(p, logKSb), color = "red") +
  geom_smooth(aes(p, PTC, color = "Polya Tree Cauchy Base"), linetype = 1, se = FALSE) +
  geom_smooth(aes(p, PTN, color = "Polya Tree Normal Base"), linetype = 2, se = FALSE) +
  geom_smooth(aes(p, CVBFAvg, color = "Average CVBF of 30 splits"), linetype = 6, se = FALSE) +
  geom_smooth(aes(p, logKSb, color = "Sellke's KS test 'B-value'"), linetype = 4, se = FALSE) +
  geom_smooth(aes(p, CVBFAvgAdj, color = "Data Reflection Adjusted CVBF"), se = FALSE) +
  labs(x = "p", y = "log(BF)", color = "Legend") + ylim(c(-50, 80)) + scale_color_manual(values=colors) +
  theme_minimal() + theme(axis.text.x=element_text(size=12), axis.text.y = element_text(size=12))

plot1


ggsave("BayesSimPlots/jointlogBFcompactLegend.pdf",plot = plot1, device = "pdf")

