# Echo the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

# Location of the gzipped HIGGS CSV. This is a machine-specific absolute path;
# adjust it before running elsewhere.
HiggsCSV <- "C:\\Users\\Naveed\\Documents\\HIGGS.csv.gz"

R Markdown

We want to compute many cross-validation Bayes factors (CVBFs) for the Higgs boson data set, and we want to do it in parallel.

# Load only the columns that are needed from the headerless file: column 1
# plus columns 23-29. In `colClasses`, "NULL" makes read.csv skip a column and
# NA lets it infer the column type.
fulltrans <- read.csv(
  file = HiggsCSV,
  header = FALSE,
  colClasses = c(NA, rep("NULL", times = 21), rep(NA, times = 7))
)
# Show how many rows and columns were loaded.
dim(fulltrans)

# Rows whose first column equals 0 are the "background noise" class.
noisedataind <- fulltrans[, 1] == 0

# X holds the "background noise" rows; Y holds every remaining row.
X <- fulltrans[noisedataind, ]
Y <- fulltrans[!noisedataind, ]

# Keep the first million rows, then work with the first 20000 of those.
fulltrans2 <- fulltrans[seq_len(1000000), ]
usedsamptrans <- fulltrans2[seq_len(20000), ]

# Same split as above, restricted to the 20000-row working sample:
# X2 is "background noise", Y2 is the rest.
noisedataind3 <- usedsamptrans[, 1] == 0
X2 <- usedsamptrans[noisedataind3, ]
Y2 <- usedsamptrans[!noisedataind3, ]

library(BSCRN)
library(matrixStats)
library(foreach)
library(rootSolve)
library(doRNG)
# Training-set sizes to evaluate and the number of splits drawn per size.
trainsizes <- seq(from = 1000, to = 5000, by = 1000)
numberofsplits <- 20
# Result matrix: one row per training size, one column per split.
logBFmat3 <- matrix(nrow = length(trainsizes), ncol = numberofsplits)
# Fix the RNG state so the splits are reproducible.
set.seed(1000)
# NOTE(review): not read by the sequential loop below; kept in case other
# code relies on it.
ncores <- 20

# For every training size, run `numberofsplits` calls of CVBFtestrsplit on the
# second retained column and store the `logBF` element of each result.
# The inner loop is driven by `numberofsplits` (not a literal 20) so that it
# always matches the number of columns allocated in `logBFmat3`.
for (i in seq_along(trainsizes)) {
  for (j in seq_len(numberofsplits)) {
    logBFmat3[i, j] <- BSCRN::CVBFtestrsplit(
      dataset1 = X2[, 2],
      dataset2 = Y2[, 2],
      trainsize1 = trainsizes[i],
      trainsize2 = trainsizes[i]
    )$logBF
  }
}

# Summarise the splits for each training size: row-wise mean and range
# (column 1 of rowRanges is used as the minimum, column 2 as the maximum).
logBFmeans3 <- rowMeans(logBFmat3)
logBFranges3 <- rowRanges(logBFmat3)

# One row per training size, ready for plotting.
dfmat3 <- data.frame(
  trainsize = trainsizes,
  min = logBFranges3[, 1],
  max = logBFranges3[, 2],
  mean = logBFmeans3
)
library(ggplot2)
# Range of log BF per training size (dashed blue), its end points (red), the
# mean (green) and a dashed purple line joining the means.
# Every layer, including theme_bw(), must be joined with `+`; a theme call on
# its own line is evaluated separately and never reaches the plot.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept so older ggplot2 versions render the same line.
ggplot(dfmat3, aes(x = trainsize)) +
  geom_linerange(aes(ymin = min, ymax = max), linetype = 2, color = "blue") +
  geom_point(aes(y = min), size = 3, color = "red") +
  geom_point(aes(y = max), size = 3, color = "red") +
  geom_point(aes(y = mean), size = 4, color = "green") +
  geom_line(
    aes(x = trainsize, y = mean),
    size = 3, color = "purple", linetype = 2
  ) +
  theme_bw()

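Plot of the log BF for the column analysed above (`X2[,2]`). Note: given the `colClasses` used when reading the file, this is column 23 of the original data, although this note originally labelled it col 24.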

The code below repeats the same analysis for column 29.

# Fresh result matrix: one row per training size, one column per split.
logBFmat3 <- matrix(nrow = length(trainsizes), ncol = numberofsplits)

# NOTE(review): not read by the sequential loop below; kept in case other
# code relies on it.
ncores <- 20

# For every training size, run `numberofsplits` calls of CVBFtestrsplit on the
# eighth retained column and store the `logBF` element of each result.
# The inner loop is driven by `numberofsplits` (not a literal 20) so that it
# always matches the number of columns allocated in `logBFmat3`.
for (i in seq_along(trainsizes)) {
  for (j in seq_len(numberofsplits)) {
    logBFmat3[i, j] <- BSCRN::CVBFtestrsplit(
      dataset1 = X2[, 8],
      dataset2 = Y2[, 8],
      trainsize1 = trainsizes[i],
      trainsize2 = trainsizes[i]
    )$logBF
  }
}

# Summarise the splits for each training size: row-wise mean and range
# (column 1 of rowRanges is used as the minimum, column 2 as the maximum).
logBFmeans3 <- rowMeans(logBFmat3)
logBFranges3 <- rowRanges(logBFmat3)

# One row per training size, ready for plotting.
dfmat3 <- data.frame(
  trainsize = trainsizes,
  min = logBFranges3[, 1],
  max = logBFranges3[, 2],
  mean = logBFmeans3
)
library(ggplot2)
# Range of log BF per training size (dashed blue), its end points (red), the
# mean (green) and a dashed purple line joining the means.
# Every layer, including theme_bw(), must be joined with `+`; a theme call on
# its own line is evaluated separately and never reaches the plot.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept so older ggplot2 versions render the same line.
ggplot(dfmat3, aes(x = trainsize)) +
  geom_linerange(aes(ymin = min, ymax = max), linetype = 2, color = "blue") +
  geom_point(aes(y = min), size = 3, color = "red") +
  geom_point(aes(y = max), size = 3, color = "red") +
  geom_point(aes(y = mean), size = 4, color = "green") +
  geom_line(
    aes(x = trainsize, y = mean),
    size = 3, color = "purple", linetype = 2
  ) +
  theme_bw()

Plot of the log BF for column 29.



naveedmerchant/BayesScreening documentation built on June 13, 2024, 7:56 a.m.