knitr::opts_chunk$set(echo = TRUE)
library(magrittr)
library(ggplot2)
library(dplyr)

We are going to run an experiment using our SGGP code to see how the various options affect the prediction ability of the model.

Internal options

Internal options for our experiment

This is $2^5 3^2 = 288$ options. We can do a fractional factorial experiment to save time.

First experiment (Comparer2)

I saw that MCMC and full Bayesian prediction were far slower than their alternatives. I ran a smaller experiment on my computer with the following

This took over a day on my laptop using 3 cores.

e2 <- readRDS("ComparerRun2_completed.rds")
e2$completed_runs %>% table

I look at the results and saw something weird going on. It turns out that some of the runs became awful after large number of points.

stripchart(RMSE ~ nallotted, e2$outcleandf)
stripchart(RMSE ~ nallotted + corr, e2$outcleandf, las=1)
ggplot(data=e2$outcleandf, mapping=aes(x=nallotted,y=RMSE, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(corr ~ .) + ggplot2::scale_y_log10()
ggplot(data=e2$outcleandf, mapping=aes(x=nallotted,y=score, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(corr ~ .)
ggplot(data=e2$outcleandf[e2$outcleandf$nallotted!=16384,], mapping=aes(x=nallotted,y=score, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(corr ~ .)
sapply(unique(e2$outcleandf$nallotted), function(nn) lm(score ~ corr + sel.method + factor(batchsize) + append.rimseperpoint + grid_size, e2$outcleandf %>% filter(nallotted==nn, corr!="gaussian", corr!="m52"))$coeff) %>% rbind(nallotted=unique(e2$outcleandf$nallotted)) %>% round(4)
sapply(unique(e2$outcleandf$nallotted), function(nn) lm(log(RMSE) ~ corr + sel.method + factor(batchsize) + append.rimseperpoint + grid_size, e2$outcleandf %>% filter(nallotted==nn, corr!="gaussian", corr!="m52"))$coeff) %>% rbind(nallotted=unique(e2$outcleandf$nallotted)) %>% round(4)
sapply(unique(e2$outcleandf$nallotted), function(nn) lm(log(elapsedtime,2) ~ corr + sel.method + factor(batchsize) + append.rimseperpoint + grid_size, e2$outcleandf %>% filter(nallotted==nn, corr!="gaussian", corr!="m52"))$coeff) %>% rbind(nallotted=unique(e2$outcleandf$nallotted)) %>% round(4)

Comparison 3

This is an experiment that only goes up to sample size 1024, taking stats at 512 and 1024. It does do fully Bayesian prediction (vs using the MAP) and MCMC (vs Laplace approx).

e3 <- readRDS("Comparison3_completed.rds")
e3$completed_runs %>% table
e3$calculate_effects()
ggplot(data=e3$outcleandf, mapping=aes(x=nallotted,y=RMSE, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(corr ~ .) + ggplot2::scale_y_log10()
ggplot(data=e3$outcleandf, mapping=aes(x=nallotted,y=score, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(corr ~ .)
ggplot(data=e3$outcleandf[e3$outcleandf$nallotted!=16384,], mapping=aes(x=nallotted,y=score, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(corr ~ .)
sapply(unique(e3$outcleandf$nallotted), function(nn) lm(score ~ sel.method + factor(batchsize) + append.rimseperpoint + grid_size + pred.fullBayes + use_laplaceapprox, e3$outcleandf %>% filter(nallotted==nn, corr!="gaussian", corr!="powerexp"))$coeff) %>% rbind(nallotted=unique(e3$outcleandf$nallotted)) %>% round(4)
sapply(unique(e3$outcleandf$nallotted), function(nn) lm(log(RMSE) ~ sel.method + factor(batchsize) + append.rimseperpoint + grid_size + pred.fullBayes + use_laplaceapprox, e3$outcleandf %>% filter(nallotted==nn, corr!="gaussian", corr!="powerexp"))$coeff) %>% rbind(nallotted=unique(e3$outcleandf$nallotted)) %>% round(4)
sapply(unique(e3$outcleandf$nallotted), function(nn) lm(log(elapsedtime,2) ~ sel.method + factor(batchsize) + append.rimseperpoint + grid_size + pred.fullBayes + use_laplaceapprox, e3$outcleandf %>% filter(nallotted==nn, corr!="gaussian", corr!="powerexp"))$coeff) %>% rbind(nallotted=unique(e3$outcleandf$nallotted)) %>% round(4)

Comparison 4

I ran experiments of five different functions, only using the fast prediction and fit options. I also changed the fast grid to make it faster, this is also what we currently have the default set to in our package. Two of the correlation functions were removed.

e4a <- readRDS("ComparerRun4_completed_SQTfixed.rds")
e4a$completed_runs %>% table
e4b <- readRDS("ComparerRun4b_completed.rds")
e4b$completed_runs %>% table
e4dfwg <- rbind(e4a$outcleandf, e4b$outcleandf)
summary(e4dfwg)

Each function having different scale is troublesome. Need to normalize RMSEs. Scores should be fine, maybe?

Gaussian is bad

Score should be function invariant? Not really.

ggplot(data=e4dfwg, mapping=aes(x=nallotted,y=score, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(f ~ corr, scales="free_y")

I had to rerun CauchySQT on wing weight since it had NA scores.

plyr::ddply(e4dfwg, c("corr", "f"), function(x) sum(is.na(x$score))) %>% filter(V1>0)

RMSE on linear scale

ggplot(data=e4dfwg, mapping=aes(x=nallotted,y=RMSE, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(f ~ corr, scales="free_y")

With log scale

ggplot(data=e4dfwg, mapping=aes(x=nallotted,y=RMSE, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(f ~ corr, scales="free_y") + scale_y_log10()

Remove Gaussian, Matern 5/2, Beam bending

e4df <- e4dfwg %>% filter(corr != "gaussian", corr != "m52", f!= "beambending")
e4df$score %>% summary
ggplot(data=e4df, mapping=aes(x=nallotted,y=RMSE, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(f ~ corr, scale="free_y") + ggplot2::scale_y_log10()
ggplot(data=e4df, mapping=aes(x=nallotted,y=score, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(f ~ corr, scale="free_y")
ggplot(data=e4df, mapping=aes(x=nallotted,y=CRPscore, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(f ~ corr, scale="free_y") + scale_y_log10()

Regression coefficients

sapply(unique(e4df$nallotted), function(nn) lm(score ~ corr + sel.method + factor(batchsize) + append.rimseperpoint + grid_size, e4df %>% filter(nallotted==nn))$coeff) %>% rbind(nallotted=unique(e4df$nallotted)) %>% round(4)
sapply(unique(e4df$nallotted), function(nn) lm(log(RMSE) ~ corr + sel.method + factor(batchsize) + append.rimseperpoint + grid_size, e4df %>% filter(nallotted==nn))$coeff) %>% rbind(nallotted=unique(e4df$nallotted)) %>% round(4)
sapply(unique(e4df$nallotted), function(nn) lm(log(elapsedtime,2) ~ corr + sel.method + factor(batchsize) + append.rimseperpoint + grid_size, e4df %>% filter(nallotted==nn, corr!="gaussian", corr!="m52"))$coeff) %>% rbind(nallotted=unique(e4df$nallotted)) %>% round(4)

Comparison 5

e5 <- readRDS("ComparerRun5_completed.rds")
e5df <- e5$outcleandf
e5$completed_runs %>% summary
ggplot(data=e5df, mapping=aes(x=nallotted,y=RMSE, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(f ~ corr, scale="free_y") + ggplot2::scale_y_log10()
ggplot(data=e5df, mapping=aes(x=nallotted,y=RMSE, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(f ~ corr + sel.method, scale="free_y") + ggplot2::scale_y_log10()
ggplot(data=e5df, mapping=aes(x=nallotted,y=score, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(f ~ corr, scale="free_y")
ggplot(data=e5df, mapping=aes(x=nallotted,y=CRPscore, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(f ~ corr, scale="free_y") + scale_y_log10()
ggplot(data=e5df, mapping=aes(x=nallotted,y=CRPscore, shape=factor(batchsize), color=sel.method)) + geom_point() +
  facet_grid(f ~ corr+sel.method, scale="free_y") + scale_y_log10()

Regression coefficients

sapply(unique(e5df$nallotted), function(nn) lm(score ~ corr + sel.method, e5df %>% filter(nallotted==nn))$coeff) %>% rbind(nallotted=unique(e5df$nallotted)) %>% round(4)
sapply(unique(e5df$nallotted), function(nn) lm(log(RMSE) ~ corr + sel.method, e5df %>% filter(nallotted==nn))$coeff) %>% rbind(nallotted=unique(e5df$nallotted)) %>% round(4)
sapply(unique(e5df$nallotted), function(nn) lm(log(elapsedtime,2) ~ corr + sel.method, e5df %>% filter(nallotted==nn))$coeff) %>% rbind(nallotted=unique(e5df$nallotted)) %>% round(4)

Regression coefficients by function

c1 <- plyr::ddply(e5df, c("f", "nallotted"), 
            function(x) {
              lm(score ~ corr + sel.method, x)$coeff %>% round(4)
})
ggplot(data=reshape2::melt(c1, c("f", "nallotted")) %>% filter(variable!="(Intercept)"), mapping=aes(nallotted, value)) + geom_point() +facet_grid(f ~ variable)
c2 <- plyr::ddply(e5df, c("f", "nallotted"), 
            function(x) {
              lm(log(RMSE) ~ corr + sel.method, x)$coeff %>% round(4)
})
ggplot(data=reshape2::melt(c2, c("f", "nallotted")) %>% filter(variable!="(Intercept)"), mapping=aes(nallotted, value)) + geom_point() +facet_grid(f ~ variable)
c3 <- plyr::ddply(e5df, c("f", "nallotted"), 
            function(x) {
              lm(log(elapsedtime,2) ~ corr + sel.method, x)$coeff %>% round(4)
})
ggplot(data=reshape2::melt(c3, c("f", "nallotted")) %>% filter(variable!="(Intercept)"), mapping=aes(nallotted, value)) + geom_point() +facet_grid(f ~ variable)


CollinErickson/CGGP documentation built on Feb. 6, 2024, 2:24 a.m.