knitr::opts_chunk$set(echo = TRUE)
HiggsCSV = "C:\\Users\\Naveed\\Documents\\HIGGS.csv.gz"

Code for KDEs

The code below plots kernel density estimates (KDEs) of selected columns of the Higgs boson data set, overlaying the signal and background-noise classes.

fulltrans = read.csv(file = HiggsCSV, colClasses = c(NA,rep("NULL",times = 21),rep(NA,times = 7)), header = FALSE)
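# colClasses keeps column 1 (the class label) and columns 23-29 (the 7 high-level
# features); the 21 "NULL" entries drop the low-level features in columns 2-22.
# The full file has 11,000,000 rows, so fulltrans should be 11000000 x 8.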

#fulltrans = read.csv(file = "C:/Users/Naveed/Documents/HIGGS.csv.gz", colClasses = c(rep("NULL",times = 22),rep(NA,times = 7)), header = FALSE)
dim(fulltrans)

noisedataind = fulltrans[,1] == 0

X = fulltrans[noisedataind,]   # class label 0: "background noise"
Y = fulltrans[!noisedataind,]  # class label 1: signal


source("MarginalLikIntfunctions.R")
source("Laplacefunction.R")

fulltrans2 = fulltrans[1:1000000,]   # restrict to the first 1,000,000 rows

usedsamptrans = fulltrans2[1:20000,] # subsample used for the Bayesian methods below
noisedataind3 = usedsamptrans[,1] == 0
X2 = usedsamptrans[noisedataind3,]   # background noise subsample
Y2 = usedsamptrans[!noisedataind3,]  # signal subsample


# KDEs of the 8th retained column on [0, 4]: signal (black) vs background noise (blue)
plot(density(Y[,8], n = 511, from = 0, to = 4), xlab = "x", main = "")
lines(density(X[,8], n = 511, from = 0, to = 4), col = "blue")



# KDEs of the 2nd retained column (HIGGS column 23) on [0, 3]: signal (black) vs noise (blue)
plot(density(Y[,2], n = 511, from = 0, to = 3), xlab = "x", main = "")
lines(density(X[,2], n = 511, from = 0, to = 3), col = "blue")

Drawing from predictive posteriors

The code below draws from the predictive posteriors of CVBF (the cross-validation Bayes factor method) and of the Polya tree, for the second retained column of the noise data.
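Reading off the log-posterior expressions in the sampler below, the Metropolis chain appears to target (up to a normalizing constant)

$$
\pi(h \mid X_T, X_V) \;\propto\; \prod_{x \in X_V} \hat f_h(x \mid X_T) \;\times\; \frac{2\beta}{\sqrt{\pi}}\, h^{-2}\, e^{-\beta^2 / h^2},
$$

where $\hat f_h(\cdot \mid X_T)$ is the kernel estimate built from the training half $X_T$ with bandwidth $h$, and the second factor is a proper prior density on $h$ whose scale $\beta$ is set, empirical-Bayes style, to the bandwidth that maximizes the validation likelihood.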

library(tictoc)

trainsizelist = seq(from = 1000, by = 1000, to = 5000)  # candidate training-set sizes

set.seed(1000)
bwvec2 = c()  # Metropolis chain of bandwidths

trainindX = sample(1:nrow(X2), size = trainsizelist[5])  # 5000 training indices
trainindY = sample(1:nrow(Y2), size = trainsizelist[5])
XT1 <- X2[trainindX,2]     # training half (column 2 of the noise subsample)
XV1 <- X2[-(trainindX),2]  # validation half

likvec = function(h) {sum(log(HallKernel(h, datagen2 = XT1, x = XV1)))}
bwlik = optimize(f = likvec, lower = 0, upper = 10, maximum = TRUE)

beta = bwlik$maximum     # empirical-Bayes scale for the bandwidth prior
bwvec2[1] = bwlik$maximum
postcurr = sum(log(HallKernel(bwvec2[1], datagen2 = XT1, x = XV1))) + log(2*beta) - .5*log(pi) - 2*log(bwvec2[1]) - (beta^2 / bwvec2[1]^2)
#Random-walk Metropolis for the bandwidth, with a normal proposal of sd = 0.0025
acceptances = 0
#tic()
iter = 3000
for(j in 1:iter)
{
  bwprop = rnorm(1, mean = bwvec2[j], sd = .0025)
  if(bwprop <= 0)
  {
    bwvec2[j+1] = bwvec2[j]
  }
  else
  {
    postprop = sum(log(HallKernel(bwprop, datagen2 = XT1, x = XV1))) + log(2*beta) - .5*log(pi) - 2*log(bwprop) - (beta^2 / bwprop^2)
    logpostdif = postprop - postcurr
    if(exp(logpostdif) > runif(1))
    {
      bwvec2[j+1] = bwprop
      postcurr = postprop   # already evaluated at the accepted proposal
      acceptances = acceptances + 1
    }
    else
    {
      bwvec2[j+1] = bwvec2[j]
    }
  }
  #print(j)
}
#toc()
plot(bwvec2)
#This will take much longer than the Polya tree approach. The random-walk sampler can't easily
#be sped up, but an independence Metropolis sampler could be parallelized, since its proposals
#don't depend on the current state.
#One could also approximate the KDE with fewer points, though that turns the method into an
#approximation.
#An independence sampler probably also has a higher acceptance rate, since the predictive
#posterior of the bandwidth is close to a normal distribution centered near the bandwidth that
#maximizes the likelihood.
#Mixing seems good enough.

tic()
sum(log(HallKernel(bwvec2[j+1], datagen2 = XT1, x = XV1)))
toc()

#Roughly 2.75 seconds to evaluate


library(coda)

autocorr.plot(bwvec2, auto.layout = FALSE)
#Maybe take every 10th sample?

thinnedbwvec2 = bwvec2[seq(1,length(bwvec2),10)]

predpostavg2 = 0
# Monte Carlo estimate of the predictive posterior density on a grid:
# average the kernel estimate over the thinned posterior bandwidth draws.
for(j in 1:length(thinnedbwvec2))
{
  predpostavg2 = predpostavg2 + (1/length(thinnedbwvec2))*HallKernel(thinnedbwvec2[j], datagen2 = XT1, x = seq(from = 0, to = 4, length.out = 10000))
}
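The same average can be written without the explicit loop. A minimal equivalent sketch, assuming (as the loop above does) that HallKernel returns the density evaluated at each grid point:

predgrid = seq(from = 0, to = 4, length.out = 10000)
predmat = sapply(thinnedbwvec2, function(h) HallKernel(h, datagen2 = XT1, x = predgrid))
predpostavg2b = rowMeans(predmat)   # one column per thinned draw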

################################# Code for Figure 3



set.seed(500)
datalist = X2[,2]

alphalist = list()
leveltot = 9
c = 1
for(m in 1:leveltot)
{
  alphalist[[m]] = rep(c*m^2,2^m)
}
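# alpha_m = c * m^2 grows quadratically with the level; this standard choice
# (the sum of 1/alpha_m converges) makes the Polya tree concentrate on
# absolutely continuous distributions.
# (Assigning c = 1 shadows base::c as a variable, but calls to c() still
# resolve to the function, so the code below is unaffected.)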

# Tally, for each observation, the partition set it falls in at every level.
# The level-m partition consists of the 2^m equiprobable intervals of the
# standard normal base measure, delimited by qnorm quantiles.
epsilonlist = c()
epsilonlist2 = list()
splitlist = list()
for(j in 1:leveltot)
{
  splitlist[[j]] = rep(0,2^j)
}

for(j in 1:length(datalist))
{
  thresh = .5
  for(m in 1:leveltot)
  {
    # epsilon_m records whether the observation goes right (1) or left (0) at level m
    epsilonlist[m] = datalist[j] > qnorm(thresh)
    # Convert the binary path (epsilon_1, ..., epsilon_m) to a bin index in 1:2^m
    multivec = 2^seq(m-1, 0, -1)
    ind = sum(epsilonlist[1:m]*multivec) + 1
    splitlist[[m]][ind] = splitlist[[m]][ind] + 1
    # Move the quantile threshold to the midpoint of the chosen half
    thresh = thresh + (2*epsilonlist[m] - 1) / 2^(m+1)
  }
  epsilonlist2[[j]] <- epsilonlist
  epsilonlist = c()
}
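As a quick sanity check of the path-to-bin indexing (illustrative only, not part of the analysis):

# The path (right, left), i.e. epsilon = (1, 0), lands in bin
# sum(c(1, 0) * 2^c(1, 0)) + 1 = 3: the third of the four level-2 sets,
# the interval (qnorm(.5), qnorm(.75)).
sum(c(1, 0) * 2^seq(1, 0, -1)) + 1   # returns 3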
#splitlist

# (The same tally could be repeated on a second sample, e.g. datalist2, to build a
# corresponding splitlist2.)

# Fine grid of N(0,1) quantiles at equal probability spacing (1e-7). Sampling
# uniformly among the grid points inside a bin approximates a draw from the
# normal base measure conditioned on that bin.
largeqnormlist = seq(from = 0, to = 1, by = .0000001)
largeqnormlist = qnorm(largeqnormlist)
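A lighter-weight alternative to the 10-million-point grid is to invert the normal CDF directly; drawFromBin below is a hypothetical helper, shown for comparison and not used in the code that follows:

# Draws from the N(0,1) base measure conditioned on the probability bin (plo, phi)
drawFromBin = function(plo, phi) qnorm(runif(1, min = plo, max = phi))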


#Want a sample from the posterior Polya tree distribution.
#Conjugacy: the posterior Beta parameters are the prior alphas plus the
#observed counts in each partition set.

postbetalist = list()
for(j in 1:leveltot)
{
  postbetalist[[j]] = splitlist[[j]] + alphalist[[j]]
}
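In symbols, if $n_{\varepsilon_1 \cdots \varepsilon_m}$ counts the observations that landed in partition set $B_{\varepsilon_1 \cdots \varepsilon_m}$, conjugacy of the Polya tree gives posterior parameters

$$
\alpha^{\ast}_{\varepsilon_1 \cdots \varepsilon_m} = \alpha_{\varepsilon_1 \cdots \varepsilon_m} + n_{\varepsilon_1 \cdots \varepsilon_m},
$$

and a new observation goes left at a split with predictive probability $\alpha^{\ast}_{\cdots 0} / (\alpha^{\ast}_{\cdots 0} + \alpha^{\ast}_{\cdots 1})$, which is exactly the Binomial success probability used in the sampling loop below.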

#posteriorPTdrawlist = list()

posteriorPTdraw = c()
leveltot = 9
ndraw = 1000
k = 1
indlist = c()
for(i in 1:ndraw)
{
  for(m in 1:leveltot)
  {
    # Go left (epsilon = 0) with the predictive probability: the posterior mean
    # of the Beta branch probability, so the path is a draw from the predictive
    # (marginal) Polya tree posterior.
    postdraw = rbinom(1,size = 1, prob = (postbetalist[[m]][k]) / (postbetalist[[m]][k+1] + postbetalist[[m]][k]))
    if(postdraw == 1)
    {
      epsilonlist[m] = 0
    }
    else
    {
      epsilonlist[m] = 1
    }
    # k becomes the index of the left child (at level m+1) of the chosen bin
    multivec = 2^seq(m, 1, -1)
    ind = sum(epsilonlist[1:m]*multivec) + 1
    k = ind
  }
  # After the final level, ind = 2*i - 1 where i is the level-9 bin index, so the
  # bin covers probabilities ((ind-1)/2^10, (ind+1)/2^10). Draw a point from the
  # normal base measure restricted to that bin.
  posteriorPTdraw[i] = sample(largeqnormlist[(largeqnormlist < qnorm((ind+1) / 2^(leveltot+1))) & (largeqnormlist > qnorm((ind-1) / 2^(leveltot+1)))], size = 1)
  epsilonlist = c()
  indlist[i] = ind
  k=1
  #print(i)
}

plot(density(fulltrans[noisedataind,2], from = 0, to = 4, n = 10000), lwd = 2, main = "Predictive Posteriors vs True Density Estimate", xlab = "23rd column values of Noise", ylab = "Density")
lines(seq(from = 0, to = 4, length.out = 10000), predpostavg2, col = "purple", lwd = 2)
lines(density(posteriorPTdraw, from = 0, to = 4, n = 10000), col = "red", lwd = 2)
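A legend makes the three curves self-describing (colors as in the calls above):

legend("topright", legend = c("KDE of full noise data", "CVBF predictive posterior", "Polya tree predictive posterior"), col = c("black", "purple", "red"), lwd = 2)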

The plots above compare the predictive posteriors with a high-quality density estimate. The black line is the KDE computed from all background-noise observations in the full 11,000,000-row data set, the purple line is the predictive posterior from CVBF, and the red line is the predictive posterior from the Polya tree.


