Nothing
knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(data.table) n.segments <- 10 seg.mean.vec <- 1:n.segments data.per.segment <- 10 data.mean.vec <- rep(seg.mean.vec, each=data.per.segment) n.data <- length(data.mean.vec) n.validation.sets <- 100 n.folds.vec <- c(10, 2) prop.valid.vec <- 1/n.folds.vec sim.result.list <- list() if(interactive()){ for(data.seed in 1:100){ set.seed(data.seed) data.vec <- rnorm(n.data, data.mean.vec, 0.1) is.valid.vec.list <- list() for(n.folds in n.folds.vec){ uniq.folds <- 1:n.folds n.seeds <- n.validation.sets/n.folds split.type <- sprintf("%d-fold %d times", n.folds, n.seeds) for(seed in 1:n.seeds){ set.seed(seed) fold.vec <- sample(rep(uniq.folds, l=n.data)) for(valid.fold in uniq.folds){ is.valid.vec.list[[split.type]][[paste(seed, valid.fold)]] <- fold.vec==valid.fold } } } for(prop.valid in prop.valid.vec){ split.type <- sprintf("%d%% %d times", 100*prop.valid, n.validation.sets) prop.vec <- c(subtrain=1-prop.valid, validation=prop.valid) for(split.i in 1:n.validation.sets){ set.seed(split.i) is.valid.vec.list[[split.type]][[split.i]] <- binsegRcpp::random_set_vec( n.data, prop.vec) == "validation" } } loss.dt <- CJ(split.i=1:n.validation.sets, type=names(is.valid.vec.list))[, { is.valid <- is.valid.vec.list[[type]][[split.i]] bs.model <- binsegRcpp::binseg_normal(data.vec, is.validation.vec=is.valid) bs.model$splits[, data.table( segments, validation.loss)] }, by=.(split.i, type)] loss.stats <- loss.dt[, .( mean.valid.loss=mean(validation.loss) ), by=.(type, segments)] select.each.split <- loss.dt[ , .SD[which.min(validation.loss)], by=.(type, split.i)] selected.times <- select.each.split[, .( times=.N ), by=.(type, segments)] selected.segments <- rbind( select.each.split[, .( selected=min(segments) ), by=.(method=paste(type, "min err, min segs"))], selected.times[, .( selected=segments[which.max(times)] ), by=.(method=paste(type, "min err, max times"))], loss.stats[, .( selected=segments[which.min(mean.valid.loss)] ), by=.(method=paste(type, "mean err, min err"))] ) sim.result.list[[data.seed]] <- data.table( data.seed, selected.segments, n.segments) } sim.result <- do.call(rbind, sim.result.list) (sim.err <- sim.result[, .( zero.one.loss=sum(selected != n.segments), L1.loss=sum(abs(selected-n.segments)), L2.loss=sum((selected-n.segments)^2) ), by=method][order(zero.one.loss)]) plot(data.vec) }
The code above compares several types of cross-validation for selecting the number of segments in simulated random normal data. The table above shows various error rates which compare the selected number of segments to the true number of segments in the simulation. The best methods appears to be the ones which use min err, max times.
n.segments <- 20 seg.mean.vec <- 1:n.segments data.per.segment <- 5 data.mean.vec <- rep(seg.mean.vec, each=data.per.segment) n.data <- length(data.mean.vec) n.validation.sets <- 200 prop.valid <- c(0.01, 0.05, 0.1, 0.25, 0.5) if(interactive()){ sim.result <- data.table(data.seed=1:100)[, { set.seed(data.seed) data.vec <- rnorm(n.data, data.mean.vec, 0.1) select.each.split <- CJ(split.i=1:n.validation.sets, prop.valid)[, { set.seed(split.i) prop.sets <- c(subtrain=1-prop.valid, validation=prop.valid) is.valid <- binsegRcpp::random_set_vec( n.data, prop.sets)=="validation" bs.model <- binsegRcpp::binseg_normal( data.vec, is.validation.vec=is.valid) bs.model$splits[, .(selected=segments[which.min(validation.loss)])] }, by=.(split.i, prop.valid)] data.table(n.splits=1:n.validation.sets)[, { select.each.split[split.i <= n.splits, .( times=.N ), by=.(prop.valid, selected) ][, .SD[which.max(times), .(selected)], by=prop.valid] }, by=n.splits] }, by=data.seed] if(require(ggplot2)){ ggplot()+ scale_color_gradient(low="red", high="black")+ geom_line(aes( n.splits, selected, group=paste(data.seed, prop.valid), color=prop.valid), data=sim.result)+ scale_y_continuous(breaks=seq(0, 100, by=10)) } accuracy.dt <- sim.result[, .( correct=sum(selected==n.segments) ), by=.(prop.valid, n.splits)] if(require(ggplot2)){ gg <- ggplot()+ geom_line(aes( n.splits, correct, group=prop.valid, color=prop.valid), size=2, data=accuracy.dt)+ scale_y_continuous("number of correctly chosen data sets")+ scale_color_gradient(low="red", high="black") if(require(directlabels)){ direct.label(gg, "right.polygons") }else{ gg } } }
The plot above suggests that 100 validation sets is sufficient.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.