knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width=14 )
The code below compares gradient descent learning for several values of the maxIterations parameter of the exact line search (aum::aum_line_search).
if(requireNamespace("penaltyLearning")){ data(neuroblastomaProcessed, package="penaltyLearning", envir=environment()) nb.err <- with(neuroblastomaProcessed$errors, data.frame( example=paste0(profile.id, ".", chromosome), min.lambda, max.lambda, fp, fn)) library(data.table) signal.features <- neuroblastomaProcessed$feature.mat[,c("log2.n","log.hall")] n.noise <- 20 set.seed(1) noise.features <- matrix( rnorm(n.noise*nrow(signal.features)), nrow(signal.features), n.noise) X.sc <- scale(cbind(signal.features, noise.features)) keep <- apply(is.finite(X.sc), 2, all) X.keep <- X.sc[,keep] n.folds <- 3 uniq.folds <- 1:n.folds fold.vec <- sample(rep(uniq.folds, l=nrow(X.keep))) loss.dt.list <- list() timing.dt.list <- list() for(valid.fold in uniq.folds){ index.list <- list( subtrain=fold.vec!=valid.fold, validation=fold.vec==valid.fold) diff.list <- lapply(index.list, function(set.i){ aum::aum_diffs_penalty(nb.err, rownames(X.keep)[set.i]) }) for(maxIterations in 10^seq(2, 6)){ weight.vec <- rep(0, ncol(X.keep)) improvement <- old.aum <- Inf step.number <- 0 cat(sprintf("fold=%d maxIt=%d\n", valid.fold, maxIterations)) seconds <- system.time({ while(improvement > 1e-4){ step.number <- step.number+1 valid.list <- aum::aum( diff.list$validation, X.keep[index.list$validation,] %*% weight.vec) nb.weight.search <- aum::aum_line_search( diff.list$subtrain, maxIterations=maxIterations, feature.mat=X.keep[index.list$subtrain,], weight.vec=weight.vec) loss.dt.list[[paste( valid.fold, step.number, maxIterations )]] <- data.table( valid.fold, step.number, maxIterations, set=c("subtrain", "validation"), aum=c(nb.weight.search$aum, valid.list$aum)) exact.dt <- data.table(nb.weight.search$line_search_result) exact.dt[, kink := .I/.N] best.row <- exact.dt[which.min(aum)] improvement <- old.aum-best.row$aum old.aum <- best.row$aum weight.vec <- weight.vec- best.row$step.size*nb.weight.search$gradient_weight } })[["elapsed"]] timing.dt.list[[paste(valid.fold, maxIterations)]] <- data.table( valid.fold, maxIterations, seconds) } } (timing.dt <- do.call(rbind, timing.dt.list)) print(dcast(timing.dt, maxIterations ~ valid.fold, value.var="seconds")) }
The table above shows the total computation time, in seconds, for running gradient descent to convergence, for different values of max iterations of the exact line search (rows), and for each cross-validation fold (columns). It is clear that the minimal computation time is achieved with an intermediate value of max iterations.
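As a quick check (not part of the original benchmark), the minimal sketch below sums the seconds over folds to identify which maxIterations value gives the smallest total time; it assumes the timing.dt table computed above.

## Sketch: total gradient descent time over all validation folds, for
## each maxIterations value; the smallest total is the fastest setting.
timing.dt[, .(total.seconds=sum(seconds)), by=maxIterations][order(total.seconds)]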
if(requireNamespace("penaltyLearning")){ loss.dt <- do.call( rbind, loss.dt.list)[ , max.step := max(step.number), by=.(maxIterations, valid.fold)][ , min.step := min(max.step), by=maxIterations ][step.number<=min.step] min.dt <- loss.dt[, .SD[which.min(aum)], by=.( valid.fold, maxIterations, set)] if(requireNamespace("ggplot2")){ g <- ggplot2::ggplot()+ ggplot2::facet_grid( valid.fold ~ maxIterations, labeller=ggplot2::label_both, scales="free")+ ggplot2::geom_line(ggplot2::aes( step.number, aum, color=set), data=loss.dt)+ ggplot2::geom_point(ggplot2::aes( step.number, aum, color=set), shape=1, data=min.dt)+ ggplot2::scale_y_log10() print(g) } }
The plot above has a panel for each value of max iterations of the exact line search (columns, from left to right) and for each cross-validation fold (rows, from top to bottom). The expected loss curves are evident: the subtrain AUM always decreases, whereas the validation AUM is U-shaped.
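To check the U shape numerically, the following sketch (not in the original analysis) finds, for each fold and maxIterations value, the step that minimizes validation AUM and compares it to the last step taken; an interior best step is consistent with a U-shaped validation curve. It assumes the loss.dt table computed above.

## Sketch: step number with min validation AUM, compared to the last
## step of gradient descent (assumes loss.dt from the chunk above).
loss.dt[set=="validation", .(
  best.step=step.number[which.min(aum)],
  last.step=max(step.number)
), by=.(valid.fold, maxIterations)]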
if(requireNamespace("penaltyLearning")){ mean.dt <- loss.dt[, .( mean.aum=mean(aum) ), by=.(step.number,maxIterations,set)] (selected.dt <- mean.dt[set=="validation"][which.min(mean.aum)]) if(requireNamespace("ggplot2")){ g <- ggplot2::ggplot()+ ggplot2::facet_grid( . ~ maxIterations, labeller=ggplot2::label_both, scales="free")+ ggplot2::geom_line(ggplot2::aes( step.number, mean.aum, color=set), data=mean.dt)+ ggplot2::geom_point(ggplot2::aes( step.number, mean.aum, color=set), shape=1, data=selected.dt)+ ggplot2::geom_hline(ggplot2::aes( yintercept=mean.aum, color=set), data=selected.dt[, .(mean.aum, set)])+ ggplot2::scale_y_log10() print(g) } }
The plot above shows the mean AUM over cross-validation folds, for each step (X axis) and for each value of max iterations of the exact line search (panels, from left to right). It is clear that the minimum validation AUM is about the same for each value of max iterations.
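That claim can be verified numerically with the sketch below (not in the original code), which computes the minimum of the mean validation AUM for each maxIterations value, using the mean.dt table from the previous chunk.

## Sketch: min mean validation AUM for each maxIterations value
## (assumes mean.dt from the chunk above).
mean.dt[set=="validation", .(min.mean.aum=min(mean.aum)), by=maxIterations]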
## learn a model for a real changepoint data set.
if(requireNamespace("penaltyLearning")){
  library(data.table)
  data(neuroblastomaProcessed, package="penaltyLearning", envir=environment())
  nb.err <- with(neuroblastomaProcessed$errors, data.table(
    example=paste0(profile.id, ".", chromosome),
    min.log.lambda, max.log.lambda,
    min.lambda, max.lambda,
    fp, fn, errors=fp+fn))
  signal.features <- neuroblastomaProcessed$feature.mat[,c("log2.n","log.hall")]
  n.folds <- 3
  uniq.folds <- 1:n.folds
  fold.vec <- sample(rep(uniq.folds, l=nrow(signal.features)))
  test.fold <- 1
  index.list <- list(
    train=fold.vec!=test.fold,
    test=fold.vec==test.fold)
  n.noise <- 40
  set.seed(1)
  noise.features <- matrix(
    rnorm(n.noise*nrow(signal.features)),
    nrow(signal.features), n.noise)
  input.features <- cbind(-54, 1, signal.features, noise.features)
  set.data <- lapply(index.list, function(is.set){
    L <- list(features=input.features[is.set,])
    for(denominator in c("count","rate")){
      L[[paste0(denominator, ".diffs")]] <- aum::aum_diffs_penalty(
        nb.err, rownames(L$features), denominator=denominator)
    }
    L
  })
  model <- with(set.data$train, aum::aum_linear_model_cv(
    features, count.diffs))
  plot(model)
  ## verify that the predictions are the same using either scaled or
  ## original features.
  with(set.data$train, rbind(
    predict.on.inputs=t(head(
      predict(model, features))),
    scale.then.predict=t(head(
      scale(features)[,model$keep] %*% model$weight.vec + model$intercept))))
  ## verify that the learned intercept results in min errors, at thresh=0.
  train.list <- with(set.data$train, aum::aum(
    count.diffs, predict(model, features)))
  plot(fp_before+fn_before ~ thresh, train.list$total_error)
  ## use rates instead of counts for computing AUM.
  set.seed(1)
  rate.model <- with(set.data$train, aum::aum_linear_model_cv(
    features, rate.diffs))
  plot(rate.model)
  ## alternative visualization including error bands and min loss.
  if(requireNamespace("ggplot2")){
    g <- ggplot2::ggplot()+
      ggplot2::geom_ribbon(ggplot2::aes(
        step.number, ymin=aum_mean-aum_sd, ymax=aum_mean+aum_sd, fill=set),
        alpha=0.5,
        data=rate.model$set.loss)+
      ggplot2::geom_line(ggplot2::aes(
        step.number, aum_mean, color=set),
        data=rate.model$set.loss)+
      ggplot2::geom_point(ggplot2::aes(
        step.number, aum_mean, color=set),
        data=rate.model$set.loss[, .SD[which.min(aum_mean)], by=set])+
      ggplot2::scale_y_log10()
    print(g)
  }
  ## alternative visualization showing each fold.
  if(requireNamespace("ggplot2")){
    g <- ggplot2::ggplot()+
      ggplot2::geom_line(ggplot2::aes(
        step.number, aum, color=set),
        data=rate.model$fold.loss)+
      ggplot2::geom_point(ggplot2::aes(
        step.number, aum, color=set),
        data=rate.model$fold.loss[
        , .SD[which.min(aum)], by=.(valid.fold, set)])+
      ggplot2::scale_y_log10()+
      ggplot2::facet_grid(. ~ valid.fold, labeller = "label_both")
    print(g)
  }
  ## compute test ROC curves, and compare against L1 regularized
  ## linear model with squared hinge loss.
  target.dt <- penaltyLearning::targetIntervals(nb.err, "example")
  target.mat <- target.dt[
    rownames(set.data$train$features),
    cbind(min.log.lambda, max.log.lambda),
    on="example"]
  named.features <- as.matrix(data.frame(input.features))
  ircv <- penaltyLearning::IntervalRegressionCV(
    named.features[rownames(set.data$train$features),],
    target.mat)
  pred.list <- with(set.data$test, list(
    IRCV=-predict(ircv, named.features[rownames(features),]),
    AUM.count=predict(model, features),
    AUM.rate=predict(rate.model, features),
    zero=rep(0, nrow(features))))
  roc.dt <- data.table(pred.name=names(pred.list))[, {
    aum::aum(set.data$test$rate.diffs, pred.list[[pred.name]])$total_error
  }, by=pred.name][, tp_before := 1-fn_before][]
  setkey(roc.dt, pred.name, thresh)
  if(requireNamespace("ggplot2")){
    pred.dt <- roc.dt[0 < thresh, .SD[1], by=pred.name]
    g <- ggplot2::ggplot()+
      ggplot2::ggtitle("Test set ROC curves, dot for predicted threshold")+
      ggplot2::geom_path(ggplot2::aes(
        fp_before, tp_before, color=pred.name),
        data=roc.dt)+
      ggplot2::geom_point(ggplot2::aes(
        fp_before, tp_before, color=pred.name),
        shape=21, fill="white",
        data=pred.dt)+
      ggplot2::coord_equal(xlim=c(0,0.25), ylim=c(0.75,1))+
      ggplot2::xlab("False Positive Rate")+
      ggplot2::ylab("True Positive Rate")
    print(g)
  }
  ## first and last row of each pred.
  roc.dt[, .SD[c(1,.N)], by=pred.name]
  ## visualize area under min.
  if(requireNamespace("nc")){
    roc.dt[, min_before := pmin(fp_before, fn_before)]
    roc.tall <- nc::capture_melt_single(
      roc.dt, error.type="fp|fn|min", "_before", value.name="error.value")
    err.sizes <- c(
      fp=3, fn=2, min=1)
    err.colors <- c(
      fp="red", fn="deepskyblue", min="black")
    if(requireNamespace("ggplot2")){
      g <- ggplot2::ggplot()+
        ggplot2::ggtitle("Train set AUM in green")+
        ggplot2::theme_bw()+
        ggplot2::geom_step(ggplot2::aes(
          thresh, error.value, color=error.type, linewidth=error.type),
          data=roc.tall)+
        ggplot2::geom_polygon(ggplot2::aes(
          thresh, min_before),
          fill="green",
          data=roc.dt)+
        ggplot2::facet_grid(pred.name ~ ., labeller="label_both")+
        ggplot2::xlab("Constant added to predicted values")+
        ggplot2::ylab("Error rate")+
        ggplot2::scale_color_manual(values=err.colors)+
        ## linewidth aesthetic above requires a linewidth scale.
        ggplot2::scale_linewidth_manual(values=err.sizes)
      print(g)
    }
  }
}
The comparison below examines how the timing of a grid search, for various numbers of grid points, compares with the timing of the exact line search.
X.sc <- scale(neuroblastomaProcessed$feature.mat)
keep <- apply(is.finite(X.sc), 2, all)
X.keep <- X.sc[,keep]
weight.vec <- rep(0, ncol(X.keep))
(nb.diffs <- aum::aum_diffs_penalty(nb.err, rownames(X.keep)))
if(requireNamespace("atime")){
  ls.list <- atime::atime(
    N=2^seq(1, 8, by=1),
    setup={
      step.grid <- 10^seq(-9, 1, l=N)
    },
    grid={
      pred.vec <- X.keep %*% weight.vec
      aum.list <- aum::aum(nb.diffs, pred.vec)
      pred.grad.vec <- rowMeans(aum.list$derivative_mat)
      weight.grad.vec <- t(X.keep) %*% pred.grad.vec
      data.table(
        step.size=step.grid,
        aum=sapply(step.grid, function(step){
          step.weight <- weight.vec-step*weight.grad.vec
          aum::aum(nb.diffs, X.keep %*% step.weight)$aum
        }))
    },
    exact.linear=aum::aum_line_search(
      nb.diffs, feature.mat=X.keep, weight.vec=weight.vec),
    exact.quadratic=aum::aum_line_search(
      nb.diffs, feature.mat=X.keep, weight.vec=weight.vec,
      maxIterations = nrow(nb.diffs)*(nrow(nb.diffs)-1)/2),
    result=TRUE)
  plot(ls.list)
}
The figure and table above show that the exact line search with a linear number of iterations can be computed in about the same amount of time as a grid search with 8 points.
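A simpler way to reproduce that comparison, without atime, is the hedged sketch below: it times one gradient computation plus an 8-point grid search against one exact line search with the default (linear) number of iterations, using base system.time and the X.keep, weight.vec, and nb.diffs objects defined above. Timings will of course vary by machine.

## Sketch: wall-clock time of an 8-point grid search versus the exact
## line search with the default (linear) number of iterations.
grid.seconds <- system.time({
  pred.vec <- X.keep %*% weight.vec
  aum.list <- aum::aum(nb.diffs, pred.vec)
  pred.grad.vec <- rowMeans(aum.list$derivative_mat)
  weight.grad.vec <- t(X.keep) %*% pred.grad.vec
  step.grid <- 10^seq(-9, 1, l=8)
  grid.aum <- sapply(step.grid, function(step){
    aum::aum(nb.diffs, X.keep %*% (weight.vec-step*weight.grad.vec))$aum
  })
})[["elapsed"]]
exact.seconds <- system.time({
  aum::aum_line_search(nb.diffs, feature.mat=X.keep, weight.vec=weight.vec)
})[["elapsed"]]
c(grid.8.points=grid.seconds, exact.linear=exact.seconds)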
if(requireNamespace("atime") && require(ggplot2)){ exact.results <- ls.list$measurements[ expr.name!="grid", result[[1]]$line_search_result[, prop.step := seq(1, .N)/.N], by=expr.name] exact.best <- exact.results[, .SD[which.min(aum)], by=expr.name] grid.dt <- ls.list$measurements[expr.name=="grid", { result[[1]][which.min(aum)] }, by=N] ggplot()+ geom_hline(aes( yintercept=aum, color=expr.name), data=exact.best)+ geom_point(aes( N, aum), data=grid.dt)+ scale_x_log10( "Number of grid search points", breaks=unique(grid.dt$N))+ theme(panel.grid.minor=element_blank()) }
The figure above shows that only a small number of grid points (4) is needed to find a better step size than the exact/approximate line search with a linear number of iterations (equal to the number of inputs/breakpoints/lines). It also shows that a moderate number of grid points (8 to 128, depending on the desired accuracy) is required to find a step size that is almost as good as the exact line search with a quadratic number of iterations.
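The sketch below (not in the original code) makes that comparison explicit, using the grid.dt and exact.best tables from the previous chunk: for each number of grid points N, it checks whether the best grid-search AUM beats the best AUM from each exact line search.

## Sketch: best grid-search AUM compared to the best AUM from each
## exact line search (assumes grid.dt and exact.best from above).
grid.dt[, .(
  N, grid.aum=aum,
  beats.exact.linear=aum < exact.best[expr.name=="exact.linear", aum],
  as.good.as.exact.quadratic=aum <= exact.best[expr.name=="exact.quadratic", aum]
)]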
if(requireNamespace("atime") && require(ggplot2)){ exact.quad <- exact.results[ step.size<max(grid.dt$step.size) ][expr.name=="exact.quadratic"][seq(1, .N, l=1000)] exact.lin <- exact.results[expr.name=="exact.linear"] some.exact <- rbind(exact.quad, exact.lin) gg <- ggplot()+ geom_line(aes( step.size, aum, color=expr.name, size=expr.name), data=some.exact)+ geom_point(aes( step.size, aum), data=grid.dt)+ scale_size_manual( values=c(exact.linear=1.5, exact.quadratic=0.5)) if(require(ggrepel)){ gg <- gg+ geom_text_repel(aes( step.size, aum, label=N), data=grid.dt)+ ggtitle("Best AUM for number of grid points shown") } gg }
The figure above shows the AUM as a function of step size, with colored lines for the two versions of the exact line search, and points for the grid search.
if(requireNamespace("atime")){ atimeX.list <- atime::atime( N=2^seq(1, 10, by=1), setup={ some.X <- X.keep[1:N,] (some.diffs <- aum::aum_diffs_penalty(nb.err, rownames(some.X))) max.it <- N*(N-1)/2 }, exact.quadratic=aum::aum_line_search( some.diffs, feature.mat=some.X, weight.vec=weight.vec, maxIterations = max.it), result=TRUE, seconds.limit=0.1 ) plot(atimeX.list) }
The plot above shows the time it takes to compute the full (quadratic) exact line search, for various data sizes N.
if(requireNamespace("atime")){ bestX.list <- atime::references_best(atimeX.list) plot(bestX.list) }
The figure above includes red reference lines, which make clear the quadratic time and space complexity of computing the full exact line search.
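Those quadratic reference lines are consistent with the maxIterations value used in the chunk above, N*(N-1)/2, which grows quadratically in N; the small sketch below just tabulates that arithmetic for the data sizes considered.

## Sketch: maxIterations used for the full exact line search, as a
## function of data size N; the growth is quadratic.
library(data.table)
data.table(N=2^seq(1, 10))[, max.it := N*(N-1)/2][]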
if(requireNamespace("atime") && requireNamespace("nc") && require(ggplot2) && requireNamespace("directlabels")){ (N.step.wide <- atimeX.list$measurements[, { ls.list <- result[[1]] res.dt <- ls.list$line_search_result s <- res.dt$step.size N.lines <- nrow(ls.list$line_search_input) data.table( min.step=s[2], linear.step=s[N.lines], best.step=res.dt[which.min(aum), step.size], max.step=max(s)) }, by=N]) N.step.tall <- nc::capture_melt_single( N.step.wide[max.step>0], step.type=".*?", "[.]step", value.name="step.size") gg <- ggplot()+ geom_line(aes( N, step.size, color=step.type), data=N.step.tall)+ scale_y_log10()+ scale_x_log10(limits=c(NA,max(N.step.tall$N)*2)) directlabels::direct.label(gg,"right.polygons") }
The figure above shows various step sizes from the exact line search, as a function of data size N. It is clear that, in general, larger data sizes result in smaller step sizes.