```{r setup}
knitr::opts_chunk$set(echo = TRUE)

# Path to the gzipped HIGGS data set (adjust to your local copy)
HiggsCSV = "C:\\Users\\Naveed\\Documents\\HIGGS.csv.gz"
```
The code below reads the data and plots KDEs of two columns of the Higgs boson data set, with the signal density in black and the background-noise density in blue.
```{r}
# Keep the class label (column 1) and the 7 high-level features (columns 23-29)
fulltrans = read.csv(file = HiggsCSV,
                     colClasses = c(NA, rep("NULL", times = 21), rep(NA, times = 7)),
                     header = FALSE)
dim(fulltrans)

noisedataind = fulltrans[, 1] == 0
X = fulltrans[noisedataind, ]   # This is "background noise"
Y = fulltrans[!noisedataind, ]  # Signal events

source("MarginalLikIntfunctions.R")
source("Laplacefunction.R")

fulltrans2 = fulltrans[1:1000000, ]
usedsamptrans = fulltrans2[1:20000, ]
noisedataind3 = usedsamptrans[, 1] == 0
X2 = usedsamptrans[noisedataind3, ]   # This is "background noise"
Y2 = usedsamptrans[!noisedataind3, ]

plot(density(Y[, 8], n = 511, from = 0, to = 4), xlab = "x", main = "")
lines(density(X[, 8], n = 511, from = 0, to = 4), col = "blue")

plot(density(Y[, 2], n = 511, from = 0, to = 3), xlab = "x", main = "")
lines(density(X[, 2], n = 511, from = 0, to = 3), col = "blue")
```
The code below draws from the predictive posteriors of CVBF and of the Polya tree.
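Concretely, for CVBF the bandwidth $h$ is sampled from its posterior by a Metropolis algorithm, and the predictive density is the average of the kernel estimate over the thinned draws $h_1,\dots,h_M$:

$$\hat{f}(x) \;\approx\; \frac{1}{M}\sum_{j=1}^{M} \hat{f}_{h_j}(x),$$

where $\hat{f}_h$ denotes the Hall-kernel estimate built from the training split with bandwidth $h$. The Polya tree predictive is instead approximated by a KDE of draws from its posterior predictive distribution.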
```{r}
library(tictoc)

trainsizelist = seq(from = 1000, by = 1000, to = 5000)
set.seed(1000)
bwvec2 = c()
trainindX = sample(1:nrow(X2), size = trainsizelist[5])
trainindY = sample(1:nrow(Y2), size = trainsizelist[5])  # (not used in the code shown below)
XT1 <- X2[trainindX, 2]     # training split
XV1 <- X2[-(trainindX), 2]  # validation split

# Cross-validation log-likelihood of the bandwidth h
likvec = function(h) { sum(log(HallKernel(h, datagen2 = XT1, x = XV1))) }
bwlik = optimize(f = likvec, lower = 0, upper = 10, maximum = TRUE)
beta = bwlik$maximum
bwvec2[1] = bwlik$maximum

# Log posterior: CV log-likelihood plus the log prior
# (2*beta/sqrt(pi)) * h^-2 * exp(-beta^2/h^2)
postcurr = sum(log(HallKernel(bwvec2[1], datagen2 = XT1, x = XV1))) +
  log(2*beta) - .5*log(pi) - 2*log(bwvec2[1]) - (beta^2 / bwvec2[1]^2)

# Random-walk Metropolis for the bandwidth with a N(current, .0025^2) proposal
acceptances = 0
#tic()
iter = 3000
for (j in 1:iter) {
  bwprop = rnorm(1, mean = bwvec2[j], sd = .0025)
  if (bwprop <= 0) {
    bwvec2[j+1] = bwvec2[j]
  } else {
    postprop = sum(log(HallKernel(bwprop, datagen2 = XT1, x = XV1))) +
      log(2*beta) - .5*log(pi) - 2*log(bwprop) - (beta^2 / bwprop^2)
    logpostdif = postprop - postcurr
    if (exp(logpostdif) > runif(1)) {
      bwvec2[j+1] = bwprop
      postcurr = postprop
      acceptances = acceptances + 1
    } else {
      bwvec2[j+1] = bwvec2[j]
    }
  }
}
#toc()

plot(bwvec2)
# This takes far more time than PT and can't easily be sped up, but it can be
# parallelized with an independence Metropolis sampler (a sketch appears after the
# Polya tree code below). Another option is to approximate the KDE with fewer
# points, though that is itself an approximate technique. The independence sampler
# probably also has higher acceptance rates, since the predictive posterior of the
# bandwidth is close to a normal distribution centered near the bandwidth that
# maximizes the likelihood. Mixing seems good enough here.

tic()
sum(log(HallKernel(bwvec2[j+1], datagen2 = XT1, x = XV1)))
toc()  # Roughly 2.75 seconds per likelihood evaluation

library(coda)
autocorr.plot(bwvec2, auto.layout = FALSE)

# Take every 10th sample to reduce autocorrelation
thinnedbwvec2 = bwvec2[seq(1, length(bwvec2), 10)]

# Average the kernel estimate over the thinned bandwidth draws
predpostavg2 = 0
for (j in 1:length(thinnedbwvec2)) {
  predpostavg2 = (1/length(thinnedbwvec2)) *
    HallKernel(thinnedbwvec2[j], datagen2 = XT1,
               x = seq(from = 0, to = 4, length.out = 10000)) +
    predpostavg2
}

################################# Code for fig 3
set.seed(500)
datalist = X2[, 2]
alphalist = list()
leveltot = 9
c = 1  # note: shadows base::c(), but calls to c() still find the function
for (m in 1:leveltot) {
  alphalist[[m]] = rep(c*m^2, 2^m)  # alpha = c * m^2 for each of the 2^m sets at level m
}
# An alternative way to write this?
```
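One equivalent, more compact way to write the `alphalist` construction is with `lapply` (here `c` is the scalar set above, not `base::c`):

```{r}
# Level-m vector of 2^m entries, each equal to c * m^2
alphalist <- lapply(1:leveltot, function(m) rep(c * m^2, 2^m))
```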
```{r}
epsilonlist = c()
epsilonlist2 = list()
splitlist = list()
for (j in 1:leveltot) {
  splitlist[[j]] = rep(0, 2^j)
}

# Count how many observations fall in each set of the dyadic partition at every
# level, using the standard normal as the base measure.
for (j in 1:length(datalist)) {
  thresh = .5
  for (m in 1:leveltot) {
    epsilonlist[m] = datalist[j] > qnorm(thresh)
    multivec = 2^seq(m-1, 0, -1)
    ind = sum(epsilonlist[1:m]*multivec) + 1
    splitlist[[m]][ind] = splitlist[[m]][ind] + 1
    thresh = thresh + (2*epsilonlist[m] - 1) / 2^(m+1)
  }
  epsilonlist2[[j]] <- epsilonlist
  epsilonlist = c()
}
# (The same counting loop can be run on a second sample, datalist2, to produce splitlist2.)

# Fine grid, uniform on the probability scale and mapped through qnorm, used below
# to draw from the base measure restricted to a bin.
largeqnormlist = seq(from = 0, to = 1, by = .0000001)
largeqnormlist = qnorm(largeqnormlist)

# We want a sample from the posterior of a distribution with a Polya tree prior.
# Conjugate update: posterior Beta parameters are the prior alphas plus bin counts.
postbetalist = list()
for (j in 1:leveltot) {
  postbetalist[[j]] = splitlist[[j]] + alphalist[[j]]
}

# Draw from the posterior predictive of the Polya tree.
posteriorPTdraw = c()
leveltot = 9
ndraw = 1000
k = 1
indlist = c()
for (i in 1:ndraw) {
  for (m in 1:leveltot) {
    # Go left (epsilon = 0) with probability alpha_left / (alpha_left + alpha_right)
    postdraw = rbinom(1, size = 1,
                      prob = postbetalist[[m]][k] / (postbetalist[[m]][k+1] + postbetalist[[m]][k]))
    if (postdraw == 1) {
      epsilonlist[m] = 0
    } else {
      epsilonlist[m] = 1
    }
    multivec = 2^seq(m, 1, -1)
    ind = sum(epsilonlist[1:m]*multivec) + 1  # left-child index at level m+1
    k = ind
  }
  # Since ind is the left-child index at level leveltot+1, the chosen level-9 bin
  # is (qnorm((ind-1)/2^10), qnorm((ind+1)/2^10)). Sampling grid points that are
  # uniform on the probability scale draws from the base measure within the bin.
  posteriorPTdraw[i] = sample(largeqnormlist[(largeqnormlist > qnorm((ind-1) / 2^(leveltot+1))) &
                                               (largeqnormlist < qnorm((ind+1) / 2^(leveltot+1)))],
                              size = 1)
  epsilonlist = c()
  indlist[i] = ind
  k = 1
}

plot(density(fulltrans[noisedataind, 2], from = 0, to = 4, n = 10000), lwd = 2,
     main = "Predictive Posteriors vs True Density Estimate",
     xlab = "23rd column values of Noise", ylab = "Density")
lines(seq(from = 0, to = 4, length.out = 10000), predpostavg2, col = "purple", lwd = 2)
lines(density(posteriorPTdraw, from = 0, to = 4, n = 10000), col = "red", lwd = 2)
```
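The comments in the CVBF chunk above suggest an independence Metropolis sampler, whose fixed proposal distribution would let all proposal evaluations be computed in parallel. Below is a minimal sketch under those assumptions: it reuses `HallKernel`, `XT1`, `XV1`, and `beta` from above, centers a normal proposal at the likelihood-maximizing bandwidth as the comment suggests, and uses a proposal standard deviation of .05 as a placeholder to be tuned.

```{r}
# Independence Metropolis: proposals come from a fixed N(bwhat, propsd^2),
# independent of the current state.
logpost = function(h) {
  sum(log(HallKernel(h, datagen2 = XT1, x = XV1))) +
    log(2*beta) - .5*log(pi) - 2*log(h) - (beta^2 / h^2)
}
bwhat  = beta  # proposal centered at the likelihood-maximizing bandwidth
propsd = .05   # placeholder; tune for a reasonable acceptance rate

bwind = c(bwhat)
postcurr2 = logpost(bwind[1])
for (j in 1:3000) {
  bwprop = rnorm(1, mean = bwhat, sd = propsd)
  if (bwprop <= 0) { bwind[j+1] = bwind[j]; next }
  postprop = logpost(bwprop)
  # Independence-sampler ratio: posterior ratio times reversed proposal-density ratio
  logratio = postprop - postcurr2 +
    dnorm(bwind[j], mean = bwhat, sd = propsd, log = TRUE) -
    dnorm(bwprop,  mean = bwhat, sd = propsd, log = TRUE)
  if (log(runif(1)) < logratio) {
    bwind[j+1] = bwprop
    postcurr2 = postprop
  } else {
    bwind[j+1] = bwind[j]
  }
}
```

Because the proposals do not depend on the current state, all 3000 values of `postprop` could be computed up front in parallel before a single sequential accept/reject pass.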
The plots shown are predictive posteriors. The black line is the KDE of the background-noise observations from the full 11,000,000-observation data set, the purple line is the predictive posterior from CVBF, and the red line is the predictive posterior from the Polya tree.
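If the figure needs its colors labeled directly, a `legend()` call (not part of the original code) can be appended to the plotting chunk:

```{r}
legend("topright",
       legend = c("Full-data KDE", "CVBF predictive", "Polya tree predictive"),
       col = c("black", "purple", "red"), lwd = 2)
```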