knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width=14 )
The code below compares gradient descent learning for several values of the maxIterations parameter of the exact line search (aum::aum_line_search).
if(requireNamespace("penaltyLearning")){ data(neuroblastomaProcessed, package="penaltyLearning", envir=environment()) nb.err <- with(neuroblastomaProcessed$errors, data.frame( example=paste0(profile.id, ".", chromosome), min.lambda, max.lambda, fp, fn)) library(data.table) signal.features <- neuroblastomaProcessed$feature.mat[,c("log2.n","log.hall")] n.noise <- 20 set.seed(1) noise.features <- matrix( rnorm(n.noise*nrow(signal.features)), nrow(signal.features), n.noise) X.sc <- scale(cbind(signal.features, noise.features)) keep <- apply(is.finite(X.sc), 2, all) X.keep <- X.sc[,keep] n.folds <- 3 uniq.folds <- 1:n.folds fold.vec <- sample(rep(uniq.folds, l=nrow(X.keep))) loss.dt.list <- list() timing.dt.list <- list() for(valid.fold in uniq.folds){ index.list <- list( subtrain=fold.vec!=valid.fold, validation=fold.vec==valid.fold) diff.list <- lapply(index.list, function(set.i){ aum::aum_diffs_penalty(nb.err, rownames(X.keep)[set.i]) }) for(maxIterations in 10^seq(2, 6)){ weight.vec <- rep(0, ncol(X.keep)) improvement <- old.aum <- Inf step.number <- 0 cat(sprintf("fold=%d maxIt=%d\n", valid.fold, maxIterations)) seconds <- system.time({ while(improvement > 1e-4){ step.number <- step.number+1 valid.list <- aum::aum( diff.list$validation, X.keep[index.list$validation,] %*% weight.vec) nb.weight.search <- aum::aum_line_search( diff.list$subtrain, maxIterations=maxIterations, feature.mat=X.keep[index.list$subtrain,], weight.vec=weight.vec) loss.dt.list[[paste( valid.fold, step.number, maxIterations )]] <- data.table( valid.fold, step.number, maxIterations, set=c("subtrain", "validation"), aum=c(nb.weight.search$aum, valid.list$aum)) exact.dt <- data.table(nb.weight.search$line_search_result) exact.dt[, kink := .I/.N] best.row <- exact.dt[which.min(aum)] improvement <- old.aum-best.row$aum old.aum <- best.row$aum weight.vec <- weight.vec- best.row$step.size*nb.weight.search$gradient_weight } })[["elapsed"]] timing.dt.list[[paste(valid.fold, maxIterations)]] <- data.table( valid.fold, maxIterations, seconds) } } (timing.dt <- do.call(rbind, timing.dt.list)) print(dcast(timing.dt, maxIterations ~ valid.fold, value.var="seconds")) }
The table above shows the total computation time, in seconds, for running gradient descent to convergence, for different values of max iterations of the exact line search (rows), and for each cross-validation fold (columns). It is clear that the minimal computation time is achieved with an intermediate value of max iterations.
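As a quick check (not part of the original benchmark), the minimal sketch below sums the seconds over folds to identify which maxIterations value gives the smallest total time; it assumes the timing.dt table computed above.

## Sketch: total gradient descent time over all validation folds, for
## each maxIterations value; the smallest total is the fastest setting.
timing.dt[, .(total.seconds=sum(seconds)), by=maxIterations][order(total.seconds)]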
if(requireNamespace("penaltyLearning")){ loss.dt <- do.call( rbind, loss.dt.list)[ , max.step := max(step.number), by=.(maxIterations, valid.fold)][ , min.step := min(max.step), by=maxIterations ][step.number<=min.step] min.dt <- loss.dt[, .SD[which.min(aum)], by=.( valid.fold, maxIterations, set)] if(requireNamespace("ggplot2")){ g <- ggplot2::ggplot()+ ggplot2::facet_grid( valid.fold ~ maxIterations, labeller=ggplot2::label_both, scales="free")+ ggplot2::geom_line(ggplot2::aes( step.number, aum, color=set), data=loss.dt)+ ggplot2::geom_point(ggplot2::aes( step.number, aum, color=set), shape=1, data=min.dt)+ ggplot2::scale_y_log10() print(g) } }
The plot above has a panel for each value of max iterations of the exact line search (columns, from left to right) and for each cross-validation fold (rows, from top to bottom). The expected loss curves are evident: the subtrain AUM always decreases, whereas the validation AUM is U-shaped.
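To check the U shape numerically, the following sketch (not in the original analysis) finds, for each fold and maxIterations value, the step that minimizes validation AUM and compares it to the last step taken; an interior best step is consistent with a U-shaped validation curve. It assumes the loss.dt table computed above.

## Sketch: step number with min validation AUM, compared to the last
## step of gradient descent (assumes loss.dt from the chunk above).
loss.dt[set=="validation", .(
  best.step=step.number[which.min(aum)],
  last.step=max(step.number)
), by=.(valid.fold, maxIterations)]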
if(requireNamespace("penaltyLearning")){ mean.dt <- loss.dt[, .( mean.aum=mean(aum) ), by=.(step.number,maxIterations,set)] (selected.dt <- mean.dt[set=="validation"][which.min(mean.aum)]) if(requireNamespace("ggplot2")){ g <- ggplot2::ggplot()+ ggplot2::facet_grid( . ~ maxIterations, labeller=ggplot2::label_both, scales="free")+ ggplot2::geom_line(ggplot2::aes( step.number, mean.aum, color=set), data=mean.dt)+ ggplot2::geom_point(ggplot2::aes( step.number, mean.aum, color=set), shape=1, data=selected.dt)+ ggplot2::geom_hline(ggplot2::aes( yintercept=mean.aum, color=set), data=selected.dt[, .(mean.aum, set)])+ ggplot2::scale_y_log10() print(g) } }
The plot above shows the mean AUM over cross-validation folds, for each step (X axis) and for each value of max iterations of the exact line search (panels, from left to right). It is clear that the minimum validation AUM is about the same for each value of max iterations.
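That claim can be verified numerically with the sketch below (not in the original code), which computes the minimum of the mean validation AUM for each maxIterations value, using the mean.dt table from the previous chunk.

## Sketch: min mean validation AUM for each maxIterations value
## (assumes mean.dt from the chunk above).
mean.dt[set=="validation", .(min.mean.aum=min(mean.aum)), by=maxIterations]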
## learn a model for a real changepoint data set.
if(requireNamespace("penaltyLearning")){
  library(data.table)
  data(neuroblastomaProcessed, package="penaltyLearning", envir=environment())
  nb.err <- with(neuroblastomaProcessed$errors, data.table(
    example=paste0(profile.id, ".", chromosome),
    min.log.lambda, max.log.lambda,
    min.lambda, max.lambda,
    fp, fn, errors=fp+fn))
  signal.features <- neuroblastomaProcessed$feature.mat[,c("log2.n","log.hall")]
  n.folds <- 3
  uniq.folds <- 1:n.folds
  fold.vec <- sample(rep(uniq.folds, l=nrow(signal.features)))
  test.fold <- 1
  index.list <- list(
    train=fold.vec!=test.fold,
    test=fold.vec==test.fold)
  n.noise <- 40
  set.seed(1)
  noise.features <- matrix(
    rnorm(n.noise*nrow(signal.features)),
    nrow(signal.features), n.noise)
  input.features <- cbind(-54, 1, signal.features, noise.features)
  set.data <- lapply(index.list, function(is.set){
    L <- list(features=input.features[is.set,])
    for(denominator in c("count","rate")){
      L[[paste0(denominator, ".diffs")]] <- aum::aum_diffs_penalty(
        nb.err, rownames(L$features), denominator=denominator)
    }
    L
  })
  model <- with(set.data$train, aum::aum_linear_model_cv(
    features, count.diffs))
  plot(model)
  ## verify that the predictions are the same using either scaled or
  ## original features.
  with(set.data$train, rbind(
    predict.on.inputs=t(head(
      predict(model, features))),
    scale.then.predict=t(head(
      scale(features)[,model$keep] %*% model$weight.vec + model$intercept))))
  ## verify that the learned intercept results in min errors, at thresh=0.
  train.list <- with(set.data$train, aum::aum(
    count.diffs, predict(model, features)))
  plot(fp_before+fn_before ~ thresh, train.list$total_error)
  ## use rates instead of counts for computing AUM.
  set.seed(1)
  rate.model <- with(set.data$train, aum::aum_linear_model_cv(
    features, rate.diffs))
  plot(rate.model)
  ## alternative visualization including error bands and min loss.
  if(requireNamespace("ggplot2")){
    g <- ggplot2::ggplot()+
      ggplot2::geom_ribbon(ggplot2::aes(
        step.number, ymin=aum_mean-aum_sd, ymax=aum_mean+aum_sd, fill=set),
        alpha=0.5,
        data=rate.model$set.loss)+
      ggplot2::geom_line(ggplot2::aes(
        step.number, aum_mean, color=set),
        data=rate.model$set.loss)+
      ggplot2::geom_point(ggplot2::aes(
        step.number, aum_mean, color=set),
        data=rate.model$set.loss[, .SD[which.min(aum_mean)], by=set])+
      ggplot2::scale_y_log10()
    print(g)
  }
  ## alternative visualization showing each fold.
  if(requireNamespace("ggplot2")){
    g <- ggplot2::ggplot()+
      ggplot2::geom_line(ggplot2::aes(
        step.number, aum, color=set),
        data=rate.model$fold.loss)+
      ggplot2::geom_point(ggplot2::aes(
        step.number, aum, color=set),
        data=rate.model$fold.loss[
        , .SD[which.min(aum)], by=.(valid.fold, set)])+
      ggplot2::scale_y_log10()+
      ggplot2::facet_grid(. ~ valid.fold, labeller = "label_both")
    print(g)
  }
  ## compute test ROC curves, and compare against L1 regularized
  ## linear model with squared hinge loss.
  target.dt <- penaltyLearning::targetIntervals(nb.err, "example")
  target.mat <- target.dt[
    rownames(set.data$train$features),
    cbind(min.log.lambda, max.log.lambda),
    on="example"]
  named.features <- as.matrix(data.frame(input.features))
  ircv <- penaltyLearning::IntervalRegressionCV(
    named.features[rownames(set.data$train$features),],
    target.mat)
  pred.list <- with(set.data$test, list(
    IRCV=-predict(ircv, named.features[rownames(features),]),
    AUM.count=predict(model, features),
    AUM.rate=predict(rate.model, features),
    zero=rep(0, nrow(features))))
  roc.dt <- data.table(pred.name=names(pred.list))[, {
    aum::aum(set.data$test$rate.diffs, pred.list[[pred.name]])$total_error
  }, by=pred.name][, tp_before := 1-fn_before][]
  setkey(roc.dt, pred.name, thresh)
  if(requireNamespace("ggplot2")){
    pred.dt <- roc.dt[0 < thresh, .SD[1], by=pred.name]
    g <- ggplot2::ggplot()+
      ggplot2::ggtitle("Test set ROC curves, dot for predicted threshold")+
      ggplot2::geom_path(ggplot2::aes(
        fp_before, tp_before, color=pred.name),
        data=roc.dt)+
      ggplot2::geom_point(ggplot2::aes(
        fp_before, tp_before, color=pred.name),
        shape=21, fill="white",
        data=pred.dt)+
      ggplot2::coord_equal(xlim=c(0,0.25), ylim=c(0.75,1))+
      ggplot2::xlab("False Positive Rate")+
      ggplot2::ylab("True Positive Rate")
    print(g)
  }
  ## first and last row of each pred.
  roc.dt[, .SD[c(1,.N)], by=pred.name]
  ## visualize area under min.
  if(requireNamespace("nc")){
    roc.dt[, min_before := pmin(fp_before, fn_before)]
    roc.tall <- nc::capture_melt_single(
      roc.dt, error.type="fp|fn|min", "_before", value.name="error.value")
    err.sizes <- c(
      fp=3, fn=2, min=1)
    err.colors <- c(
      fp="red", fn="deepskyblue", min="black")
    if(requireNamespace("ggplot2")){
      g <- ggplot2::ggplot()+
        ggplot2::ggtitle("Train set AUM in green")+
        ggplot2::theme_bw()+
        ggplot2::geom_step(ggplot2::aes(
          thresh, error.value, color=error.type, linewidth=error.type),
          data=roc.tall)+
        ggplot2::geom_polygon(ggplot2::aes(
          thresh, min_before),
          fill="green",
          data=roc.dt)+
        ggplot2::facet_grid(pred.name ~ ., labeller="label_both")+
        ggplot2::xlab("Constant added to predicted values")+
        ggplot2::ylab("Error rate")+
        ggplot2::scale_color_manual(values=err.colors)+
        ## linewidth aesthetic above requires a linewidth scale.
        ggplot2::scale_linewidth_manual(values=err.sizes)
      print(g)
    }
  }
}
The comparison below examines how the timing of a grid search, for various numbers of grid points, compares with the timing of the exact line search.
X.sc <- scale(neuroblastomaProcessed$feature.mat)
keep <- apply(is.finite(X.sc), 2, all)
X.keep <- X.sc[,keep]
weight.vec <- rep(0, ncol(X.keep))
(nb.diffs <- aum::aum_diffs_penalty(nb.err, rownames(X.keep)))
if(requireNamespace("atime")){
  ls.list <- atime::atime(
    N=2^seq(1, 8, by=1),
    setup={
      step.grid <- 10^seq(-9, 1, l=N)
    },
    grid={
      pred.vec <- X.keep %*% weight.vec
      aum.list <- aum::aum(nb.diffs, pred.vec)
      pred.grad.vec <- rowMeans(aum.list$derivative_mat)
      weight.grad.vec <- t(X.keep) %*% pred.grad.vec
      data.table(
        step.size=step.grid,
        aum=sapply(step.grid, function(step){
          step.weight <- weight.vec-step*weight.grad.vec
          aum::aum(nb.diffs, X.keep %*% step.weight)$aum
        }))
    },
    exact.linear=aum::aum_line_search(
      nb.diffs, feature.mat=X.keep, weight.vec=weight.vec),
    exact.quadratic=aum::aum_line_search(
      nb.diffs, feature.mat=X.keep, weight.vec=weight.vec,
      maxIterations = nrow(nb.diffs)*(nrow(nb.diffs)-1)/2),
    result=TRUE)
  plot(ls.list)
}
The figure and table above show that the exact line search with a linear number of iterations can be computed in about the same amount of time as a grid search with 8 points.
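A simpler way to reproduce that comparison, without atime, is the hedged sketch below: it times one gradient computation plus an 8-point grid search against one exact line search with the default (linear) number of iterations, using base system.time and the X.keep, weight.vec, and nb.diffs objects defined above. Timings will of course vary by machine.

## Sketch: wall-clock time of an 8-point grid search versus the exact
## line search with the default (linear) number of iterations.
grid.seconds <- system.time({
  pred.vec <- X.keep %*% weight.vec
  aum.list <- aum::aum(nb.diffs, pred.vec)
  pred.grad.vec <- rowMeans(aum.list$derivative_mat)
  weight.grad.vec <- t(X.keep) %*% pred.grad.vec
  step.grid <- 10^seq(-9, 1, l=8)
  grid.aum <- sapply(step.grid, function(step){
    aum::aum(nb.diffs, X.keep %*% (weight.vec-step*weight.grad.vec))$aum
  })
})[["elapsed"]]
exact.seconds <- system.time({
  aum::aum_line_search(nb.diffs, feature.mat=X.keep, weight.vec=weight.vec)
})[["elapsed"]]
c(grid.8.points=grid.seconds, exact.linear=exact.seconds)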
if(requireNamespace("atime") && require(ggplot2)){ exact.results <- ls.list$measurements[ expr.name!="grid", result[[1]]$line_search_result[, prop.step := seq(1, .N)/.N], by=expr.name] exact.best <- exact.results[, .SD[which.min(aum)], by=expr.name] grid.dt <- ls.list$measurements[expr.name=="grid", { result[[1]][which.min(aum)] }, by=N] ggplot()+ geom_hline(aes( yintercept=aum, color=expr.name), data=exact.best)+ geom_point(aes( N, aum), data=grid.dt)+ scale_x_log10( "Number of grid search points", breaks=unique(grid.dt$N))+ theme(panel.grid.minor=element_blank()) }
The figure above shows that only a small number of grid points (4) is needed to find a better step size than the exact/approximate line search with a linear number of iterations (equal to the number of inputs/breakpoints/lines). It also shows that a moderate number of grid points (8 to 128, depending on the desired accuracy) is required to find a step size that is almost as good as the exact line search with a quadratic number of iterations.
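The sketch below (not in the original code) makes that comparison explicit, using the grid.dt and exact.best tables from the previous chunk: for each number of grid points N, it checks whether the best grid-search AUM beats the best AUM from each exact line search.

## Sketch: best grid-search AUM compared to the best AUM from each
## exact line search (assumes grid.dt and exact.best from above).
grid.dt[, .(
  N, grid.aum=aum,
  beats.exact.linear=aum < exact.best[expr.name=="exact.linear", aum],
  as.good.as.exact.quadratic=aum <= exact.best[expr.name=="exact.quadratic", aum]
)]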
if(requireNamespace("atime") && require(ggplot2)){ exact.quad <- exact.results[ step.size<max(grid.dt$step.size) ][expr.name=="exact.quadratic"][seq(1, .N, l=1000)] exact.lin <- exact.results[expr.name=="exact.linear"] some.exact <- rbind(exact.quad, exact.lin) gg <- ggplot()+ geom_line(aes( step.size, aum, color=expr.name, size=expr.name), data=some.exact)+ geom_point(aes( step.size, aum), data=grid.dt)+ scale_size_manual( values=c(exact.linear=1.5, exact.quadratic=0.5)) if(require(ggrepel)){ gg <- gg+ geom_text_repel(aes( step.size, aum, label=N), data=grid.dt)+ ggtitle("Best AUM for number of grid points shown") } gg }
The figure above shows the AUM as a function of step size, with colored lines for the two versions of the exact line search, and points for the grid search.
if(requireNamespace("atime")){ atimeX.list <- atime::atime( N=2^seq(1, 10, by=1), setup={ some.X <- X.keep[1:N,] (some.diffs <- aum::aum_diffs_penalty(nb.err, rownames(some.X))) max.it <- N*(N-1)/2 }, exact.quadratic=aum::aum_line_search( some.diffs, feature.mat=some.X, weight.vec=weight.vec, maxIterations = max.it), result=TRUE, seconds.limit=0.1 ) plot(atimeX.list) }
The plot above shows the time it takes to compute the full (quadratic) exact line search, for various data sizes N.
if(requireNamespace("atime")){ bestX.list <- atime::references_best(atimeX.list) plot(bestX.list) }
The figure above includes red reference lines, which make clear the quadratic time and space complexity of computing the full exact line search.
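Those quadratic reference lines are consistent with the maxIterations value used in the chunk above, N*(N-1)/2, which grows quadratically in N; the small sketch below just tabulates that arithmetic for the data sizes considered.

## Sketch: maxIterations used for the full exact line search, as a
## function of data size N; the growth is quadratic.
library(data.table)
data.table(N=2^seq(1, 10))[, max.it := N*(N-1)/2][]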
if(requireNamespace("atime") && requireNamespace("nc") && require(ggplot2) && requireNamespace("directlabels")){ (N.step.wide <- atimeX.list$measurements[, { ls.list <- result[[1]] res.dt <- ls.list$line_search_result s <- res.dt$step.size N.lines <- nrow(ls.list$line_search_input) data.table( min.step=s[2], linear.step=s[N.lines], best.step=res.dt[which.min(aum), step.size], max.step=max(s)) }, by=N]) N.step.tall <- nc::capture_melt_single( N.step.wide[max.step>0], step.type=".*?", "[.]step", value.name="step.size") gg <- ggplot()+ geom_line(aes( N, step.size, color=step.type), data=N.step.tall)+ scale_y_log10()+ scale_x_log10(limits=c(NA,max(N.step.tall$N)*2)) directlabels::direct.label(gg,"right.polygons") }
The figure above shows various step sizes from the exact line search, as a function of data size N. It is clear that, in general, larger data sizes result in smaller step sizes.