```{r setup}
knitr::opts_chunk$set(echo = TRUE)
```
It has been observed that the Gaussian kernel is unstable when the data come from a very heavy-tailed distribution: if the data come from a Cauchy distribution, the Bayes factor is, in general, unstable. Switching kernels, in general, seems to fix these problems. Does the new kernel still work well with short-tailed data? We investigate this with plots and by using "BayesSim".
We now consider the "Gaussian vs. Gaussian" problem.
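Here `dataset1` is a standard Gaussian sample, and each row of `dataset2` is drawn from the mixture $(1 - p)\,N(0, 1) + p\,N(0, 4)$, so $p$ is the weight on the wider component. As a point of reference, a minimal vectorized sketch of this draw, equivalent to the loop in the chunk below (`rmix` is a name introduced here for illustration):

```{r}
# Draw n points from the mixture (1 - p) * N(0, 1) + p * N(0, 4);
# a point comes from the wider component with probability p.
rmix <- function(n, p) {
  wide <- runif(n) <= p
  rnorm(n, mean = 0, sd = ifelse(wide, 2, 1))
}
```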
```{r}
library(parallel)
library(ggplot2)
source("MarginalLikIntfunctions.R")
source("Laplacefunction.R")

set.seed(1000)

# Mixture weights and data: dataset1 is N(0, 1); row i of dataset2 mixes
# N(0, 1) and N(0, 4) with weight p[i] on the wider component.
p = rbeta(500, .5, .5)
dlength = 400
dataset1 <- rnorm(dlength)
dataset2 <- matrix(data = NA, nrow = length(p), ncol = dlength)
for (i in 1:length(p)) {
  unifdraw = runif(dlength)
  for (j in 1:dlength) {
    if (unifdraw[j] > p[i]) {
      dataset2[i, j] = rnorm(1)
    } else {
      dataset2[i, j] = rnorm(1, mean = 0, sd = sqrt(4))
    }
  }
}

# Initial 30/70 training/validation split; the bandwidth is chosen by
# maximizing the cross-validated log likelihood under the Hall kernel.
XT1 <- dataset1[1:(length(dataset1) * .3)]
XV1 <- dataset1[-(1:(length(dataset1) * .3))]
likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT1, x = XV1))) }
bwlik = optimize(f = function(h) { likvec(h) }, lower = 0, upper = 10, maximum = TRUE)
ExpectedKernML1 = laplace.kernH2(y = XT1, x = XV1, hhat = bwlik$maximum)

CL1 <- makeCluster(16)
clusterExport(cl = CL1,
              list("XT1", "XV1", "dataset2", "logmarg.specialkernMCimport",
                   "ExpectedKernML1", "dlength", "logSumExp", "HallKernel",
                   "laplace.kernH2", "logintegrand.Hall", "loglike.KHall",
                   "KHall", "hessian", "dataset1"),
              envir = environment())

# For each simulated dataset, average the log CVBF over 30 random splits.
result <- parApply(cl = CL1, dataset2[1:500, ], 1, FUN = function(x) {
  BFi = 0
  for (j in 1:30) {
    trainingset = sample(1:dlength, size = dlength * .3)
    XT1 = dataset1[trainingset]
    XV1 = dataset1[-trainingset]
    XT2 = x[trainingset]
    XV2 = x[-trainingset]
    likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT1, x = XV1))) }
    bwlik = optimize(f = function(h) { likvec(h) }, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML1 = laplace.kernH2(y = XT1, x = XV1, hhat = bwlik$maximum)
    likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT2, x = XV2))) }
    bwlik = optimize(f = function(h) { likvec(h) }, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernML2 = laplace.kernH2(y = XT2, x = XV2, hhat = bwlik$maximum)
    likvec2 = function(h) { sum(log(HallKernel(h, datagen2 = c(XT1, XT2), x = c(XV1, XV2)))) }
    bwlik2 = optimize(f = function(h) { likvec2(h) }, lower = 0, upper = 10, maximum = TRUE)
    ExpectedKernMLcomb = laplace.kernH2(y = c(XT1, XT2), x = c(XV1, XV2), hhat = bwlik2$maximum)
    BFi = BFi + ExpectedKernML1[1] + ExpectedKernML2[1] - ExpectedKernMLcomb[1]
  }
  BFi = BFi / 30
  return(BFi)
})
stopCluster(CL1)

lm4 <- lm(result ~ p[1:500])

dfCVBF = data.frame(logBF = result, p = p)
plot1 = ggplot(dfCVBF, aes(x = p, y = logBF)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  geom_smooth(colour = "red", se = FALSE) +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/CVBFShortvShort.pdf", plot = plot1, device = "pdf")

# Nonparametric variance estimate from successive differences of the
# logBF values sorted by p
sorteddfCVBF = dfCVBF[order(dfCVBF$p), ]
nonparvarest = 0
for (j in 1:(nrow(sorteddfCVBF) - 1)) {
  nonparvarest = nonparvarest + (sorteddfCVBF[j + 1, 1] - sorteddfCVBF[j, 1])^2
}
nonparvarest = nonparvarest / (2 * nrow(sorteddfCVBF))
# Variance estimate
nonparvarest
# Standard deviation estimate
sqrt(nonparvarest)
```
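The "nonparametric variance estimate" at the end of the chunk is a successive-difference estimator: with the log Bayes factors $Y_{(1)}, \dots, Y_{(n)}$ sorted by $p$, it computes

$$\widehat{\sigma}^2 = \frac{1}{2n} \sum_{j=1}^{n-1} \left( Y_{(j+1)} - Y_{(j)} \right)^2,$$

which estimates the noise variance around the smooth trend in $p$ without having to fit that trend.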
We are examining the log CVBF for the test of whether a standard Gaussian sample and a sample from the mixture of a standard Gaussian with a higher-variance normal come from a common distribution.
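Concretely, each random split above contributes

$$\log \mathrm{BF} = \log m\!\left(X_1^V \mid X_1^T\right) + \log m\!\left(X_2^V \mid X_2^T\right) - \log m\!\left(X_1^V, X_2^V \mid X_1^T, X_2^T\right),$$

where $m(\cdot \mid \cdot)$ is the kernel marginal likelihood of the validation data given the training data (computed here by `laplace.kernH2`), and the reported value is the average over the 30 splits.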
Let's see how the Kolmogorov-Smirnov (KS) test does on this problem.
```{r}
kslist = c()
for (j in 1:500) {
  kslist[j] = ks.test(dataset1, dataset2[j, ])$p
}

# Calibrate the KS p-values to Bayes-factor-like "B-values"
Blist = c()
e = exp(1)
for (j in 1:500) {
  if (kslist[j] < 1 / e) {
    Blist[j] = 1 / (-e * kslist[j] * log(kslist[j]))
  } else {
    Blist[j] = 1
  }
  if (kslist[j] == 0) {
    Blist[j] = Inf
  }
}
Blist = log(Blist)
sum(Blist == 0) #### 137

dfKS = data.frame(logks = log(kslist), p = p, logB = Blist)
plot1 = ggplot(dfKS, aes(x = p, y = logks)) +
  geom_point() +
  geom_hline(yintercept = log(.05), color = "blue") +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/KSShortvShort.pdf", plot = plot1, device = "pdf")

plot2 = ggplot(dfKS, aes(x = p, y = logB)) +
  geom_point() +
  geom_hline(yintercept = 1, color = "blue") +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot2
ggsave("BayesSimPlots/KSBShortvShort.pdf", plot = plot2, device = "pdf")
```
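The "B-value" used above is the Sellke-Bayarri-Berger calibration of a p-value,

$$B(p) = \begin{cases} \dfrac{1}{-e\,p\,\log p}, & p < 1/e, \\[4pt] 1, & p \ge 1/e, \end{cases}$$

which can be read as a rough upper bound on the Bayes factor in favor of the alternative, so $\log B(p) = 0$ indicates no evidence against the null.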
What about the Polya tree test with a normal base distribution?
```{r}
PTlist = c()
for (j in 1:500) {
  # Standardize both samples by the combined median and IQR
  # (IQR / 1.36 approximates the standard deviation for normal data)
  mediancombineddataset = median(c(dataset1, dataset2[j, ]))
  IQRcombineddataset = IQR(c(dataset1, dataset2[j, ]))
  dataset1adj = (dataset1 - mediancombineddataset) / (IQRcombineddataset / 1.36)
  dataset2adj = (dataset2[j, ] - mediancombineddataset) / (IQRcombineddataset / 1.36)
  PTlist[j] = PolyaTreetest(dataset1adj, dataset2adj, Ginv = qnorm,
                            c = 1, leveltot = 9)$logBF
}

plot(p[1:500], PTlist, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*')
abline(a = 0, b = 0, col = "blue")

dfPT = data.frame(logBF = PTlist, p = p)
plot1 = ggplot(dfPT, aes(x = p, y = logBF)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  geom_smooth(colour = "red", se = FALSE) +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/PTNShortvShort.pdf", plot = plot1, device = "pdf")

# Successive-difference variance estimate, as for the CVBF above
sorteddfPT = dfPT[order(dfPT$p), ]
nonparvarest = 0
for (j in 1:(nrow(sorteddfPT) - 1)) {
  nonparvarest = nonparvarest + (sorteddfPT[j + 1, 1] - sorteddfPT[j, 1])^2
}
nonparvarest = nonparvarest / (2 * nrow(sorteddfPT))
# Variance estimate
nonparvarest
# Standard deviation estimate
sqrt(nonparvarest)
```
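For intuition about what `PolyaTreetest` computes, here is a minimal sketch of one standard construction of the two-sample Polya tree log Bayes factor, assuming Beta$(c\,m^2, c\,m^2)$ branch priors at depth $m$ and a partition cut at quantiles of the base distribution. This is an illustration under those assumptions, not the implementation sourced above, whose details (e.g. truncation and sign conventions) may differ:

```{r}
# Illustrative two-sample Polya tree log BF: positive values favor H1
# (different distributions). Assumes Beta(c * m^2, c * m^2) branch priors
# and partitions built from quantiles of the base distribution Ginv.
polyatree_logBF_sketch <- function(x, y, Ginv = qnorm, c = 1, leveltot = 9) {
  logBF = 0
  for (m in 1:leveltot) {
    a = c * m^2
    # 2^m cells at depth m, cut at base-distribution quantiles
    breaks = Ginv(seq(0, 1, length.out = 2^m + 1))
    nx = tabulate(cut(x, breaks = breaks, labels = FALSE), nbins = 2^m)
    ny = tabulate(cut(y, breaks = breaks, labels = FALSE), nbins = 2^m)
    # Each depth-m split sends a depth-(m - 1) cell into children (k, k + 1)
    for (k in seq(1, 2^m - 1, by = 2)) {
      # H1: separate branch probabilities for the two samples
      logH1 = lbeta(nx[k] + a, nx[k + 1] + a) +
              lbeta(ny[k] + a, ny[k + 1] + a) - 2 * lbeta(a, a)
      # H0: one shared branch probability for the pooled sample
      logH0 = lbeta(nx[k] + ny[k] + a, nx[k + 1] + ny[k + 1] + a) - lbeta(a, a)
      logBF = logBF + logH1 - logH0
    }
  }
  logBF
}
```

Under this construction it would be called on the standardized samples, e.g. `polyatree_logBF_sketch(dataset1adj, dataset2adj, Ginv = qnorm, c = 1, leveltot = 9)`, and positive values favor the two samples having different distributions, matching how the log BFs are plotted against the zero line above.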
What about the Polya tree test with a Cauchy base distribution?
```{r}
PTlist2 = c()
for (j in 1:500) {
  mediancombineddataset = median(c(dataset1, dataset2[j, ]))
  IQRcombineddataset = IQR(c(dataset1, dataset2[j, ]))
  dataset1adj = (dataset1 - mediancombineddataset) / (IQRcombineddataset / 1.36)
  dataset2adj = (dataset2[j, ] - mediancombineddataset) / (IQRcombineddataset / 1.36)
  PTlist2[j] = PolyaTreetest(dataset1adj, dataset2adj, Ginv = qcauchy,
                             c = 1, leveltot = 10)$logBF
}

plot(p[1:500], PTlist2, xlab = "p", ylab = "log BF values from Polya Tree test", pch = '*')
abline(a = 0, b = 0, col = "blue")

dfPT2 = data.frame(logBF = PTlist2, p = p)
plot1 = ggplot(dfPT2, aes(x = p, y = logBF)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  geom_smooth(colour = "red", se = FALSE) +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/PTCShortvShort.pdf", plot = plot1, device = "pdf")

sorteddfPT = dfPT2[order(dfPT2$p), ]
nonparvarest = 0
for (j in 1:(nrow(sorteddfPT) - 1)) {
  nonparvarest = nonparvarest + (sorteddfPT[j + 1, 1] - sorteddfPT[j, 1])^2
}
nonparvarest = nonparvarest / (2 * nrow(sorteddfPT))
# Variance estimate
nonparvarest
# Standard deviation estimate
sqrt(nonparvarest)
```
The Polya tree with a Cauchy base distribution crosses the zero line at a larger value of p than the other methods.
```{r}
dfall = data.frame(PTC = dfPT2$logBF, PTN = dfPT$logBF, CVBFAvg = dfCVBF$logBF,
                   logKSb = dfKS$logB, p = dfCVBF$p)
write.csv(dfall, file = "logBFsandKSvaluesforshortvshort.csv")

# Smooths only
plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  geom_smooth(aes(p, PTC), color = "pink", se = FALSE) +
  geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) +
  geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) +
  geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) +
  labs(x = "p", y = "log(BF)") +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/jointlogBFShortvShort.pdf", plot = plot1, device = "pdf")

# Points plus smooths
plot2 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  geom_point() +
  geom_point(aes(p, PTC), color = "purple") +
  geom_point(aes(p, PTN), color = "green") +
  geom_point(aes(p, logKSb), color = "red") +
  geom_smooth(aes(p, PTC), color = "pink", se = FALSE) +
  geom_smooth(aes(p, PTN), color = "yellow", se = FALSE) +
  geom_smooth(aes(p, CVBFAvg), color = "blue", se = FALSE) +
  geom_smooth(aes(p, logKSb), color = "orange", se = FALSE) +
  labs(x = "p", y = "log(BF)") +
  ylim(c(-40, 60)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot2
ggsave("BayesSimPlots/jointlogBFshortvshortfull.pdf", plot = plot2, device = "pdf")
```
```{r}
dfall = read.csv("logBFsandKSvaluesforshortvshort.csv")

# Colorblind-friendly palette with one color and linetype per method
colorBlindGrey8 <- c("#F0E442", "#0072B2", "#D55E00", "#CC79A7")
colors <- c("Polya Tree Cauchy Base" = colorBlindGrey8[1],
            "Polya Tree Normal Base" = colorBlindGrey8[2],
            "Average CVBF of 30 splits" = colorBlindGrey8[3],
            "Sellke's KS test 'B-value'" = colorBlindGrey8[4])

plot1 = ggplot(data = dfall, mapping = aes(p, CVBFAvg)) +
  geom_smooth(aes(p, PTC, color = "Polya Tree Cauchy Base"), linetype = 1, se = FALSE) +
  geom_smooth(aes(p, PTN, color = "Polya Tree Normal Base"), linetype = 2, se = FALSE) +
  geom_smooth(aes(p, CVBFAvg, color = "Average CVBF of 30 splits"), linetype = 6, se = FALSE) +
  geom_smooth(aes(p, logKSb, color = "Sellke's KS test 'B-value'"), linetype = 4, se = FALSE) +
  labs(x = "p", y = "log(BF)", color = "Legend") +
  ylim(c(-40, 60)) +
  scale_color_manual(values = colors) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12))
plot1
ggsave("BayesSimPlots/jointlogBFShortvShortLegend.pdf", plot = plot1, device = "pdf")
```