Nothing
## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## -----------------------------------------------------------------------------
library(data.table)
n.segments <- 10
seg.mean.vec <- 1:n.segments
data.per.segment <- 10
data.mean.vec <- rep(seg.mean.vec, each=data.per.segment)
n.data <- length(data.mean.vec)
n.validation.sets <- 100
n.folds.vec <- c(10, 2)
prop.valid.vec <- 1/n.folds.vec
sim.result.list <- list()
if(interactive()){
for(data.seed in 1:100){
set.seed(data.seed)
data.vec <- rnorm(n.data, data.mean.vec, 0.1)
is.valid.vec.list <- list()
for(n.folds in n.folds.vec){
uniq.folds <- 1:n.folds
n.seeds <- n.validation.sets/n.folds
split.type <- sprintf("%d-fold %d times", n.folds, n.seeds)
for(seed in 1:n.seeds){
set.seed(seed)
fold.vec <- sample(rep(uniq.folds, l=n.data))
for(valid.fold in uniq.folds){
is.valid.vec.list[[split.type]][[paste(seed, valid.fold)]] <-
fold.vec==valid.fold
}
}
}
for(prop.valid in prop.valid.vec){
split.type <- sprintf("%d%% %d times", 100*prop.valid, n.validation.sets)
prop.vec <- c(subtrain=1-prop.valid, validation=prop.valid)
for(split.i in 1:n.validation.sets){
set.seed(split.i)
is.valid.vec.list[[split.type]][[split.i]] <- binsegRcpp::random_set_vec(
n.data, prop.vec) == "validation"
}
}
loss.dt <- CJ(split.i=1:n.validation.sets, type=names(is.valid.vec.list))[, {
is.valid <- is.valid.vec.list[[type]][[split.i]]
bs.model <- binsegRcpp::binseg_normal(data.vec, is.validation.vec=is.valid)
bs.model$splits[, data.table(
segments,
validation.loss)]
}, by=.(split.i, type)]
loss.stats <- loss.dt[, .(
mean.valid.loss=mean(validation.loss)
), by=.(type, segments)]
select.each.split <- loss.dt[
, .SD[which.min(validation.loss)],
by=.(type, split.i)]
selected.times <- select.each.split[, .(
times=.N
), by=.(type, segments)]
selected.segments <- rbind(
select.each.split[, .(
selected=min(segments)
), by=.(method=paste(type, "min err, min segs"))],
selected.times[, .(
selected=segments[which.max(times)]
), by=.(method=paste(type, "min err, max times"))],
loss.stats[, .(
selected=segments[which.min(mean.valid.loss)]
), by=.(method=paste(type, "mean err, min err"))]
)
sim.result.list[[data.seed]] <- data.table(
data.seed, selected.segments, n.segments)
}
sim.result <- do.call(rbind, sim.result.list)
(sim.err <- sim.result[, .(
zero.one.loss=sum(selected != n.segments),
L1.loss=sum(abs(selected-n.segments)),
L2.loss=sum((selected-n.segments)^2)
), by=method][order(zero.one.loss)])
plot(data.vec)
}
## -----------------------------------------------------------------------------
n.segments <- 20
seg.mean.vec <- 1:n.segments
data.per.segment <- 5
data.mean.vec <- rep(seg.mean.vec, each=data.per.segment)
n.data <- length(data.mean.vec)
n.validation.sets <- 200
prop.valid <- c(0.01, 0.05, 0.1, 0.25, 0.5)
if(interactive()){
sim.result <- data.table(data.seed=1:100)[, {
set.seed(data.seed)
data.vec <- rnorm(n.data, data.mean.vec, 0.1)
select.each.split <- CJ(split.i=1:n.validation.sets, prop.valid)[, {
set.seed(split.i)
prop.sets <- c(subtrain=1-prop.valid, validation=prop.valid)
is.valid <- binsegRcpp::random_set_vec(
n.data, prop.sets)=="validation"
bs.model <- binsegRcpp::binseg_normal(
data.vec, is.validation.vec=is.valid)
bs.model$splits[, .(selected=segments[which.min(validation.loss)])]
}, by=.(split.i, prop.valid)]
data.table(n.splits=1:n.validation.sets)[, {
select.each.split[split.i <= n.splits, .(
times=.N
),
by=.(prop.valid, selected)
][, .SD[which.max(times), .(selected)], by=prop.valid]
}, by=n.splits]
}, by=data.seed]
if(require(ggplot2)){
ggplot()+
scale_color_gradient(low="red", high="black")+
geom_line(aes(
n.splits, selected,
group=paste(data.seed, prop.valid),
color=prop.valid),
data=sim.result)+
scale_y_continuous(breaks=seq(0, 100, by=10))
}
accuracy.dt <- sim.result[, .(
correct=sum(selected==n.segments)
), by=.(prop.valid, n.splits)]
if(require(ggplot2)){
gg <- ggplot()+
geom_line(aes(
n.splits, correct,
group=prop.valid,
color=prop.valid),
size=2,
data=accuracy.dt)+
scale_y_continuous("number of correctly chosen data sets")+
scale_color_gradient(low="red", high="black")
if(require(directlabels)){
direct.label(gg, "right.polygons")
}else{
gg
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.