Nothing
ROChange <- structure(function # ROC curve for changepoints
### Compute a Receiver Operating Characteristic curve for a penalty
### function.
(models,
### data.frame describing the number of incorrect labels as a function
### of log(lambda), with columns min.log.lambda, max.log.lambda, fp,
### fn, possible.fp, possible.fn, etc. This can be computed via
### labelError(modelSelection(...), ...)$model.errors -- see examples.
predictions,
### data.frame with a column named pred.log.lambda, the predicted
### log(penalty) value for each segmentation problem.
problem.vars=character()
### character: column names used to identify data set / segmentation
### problem.
){
possible.fp <- possible.fn <- min.log.lambda <- fp <- fn <- thresh <-
log.lambda <- pred.log.lambda <- errors <- FPR <- tp <- TPR <-
error.percent <- min.thresh <- max.thresh <- max.log.lambda <-
next.min <- problems <- n.inconsistent <- min.fp.fn <-
. <- min.adj <- fp.adj <- fn.adj <- fp.tot.diff <- fn.tot.diff <-
deriv.min <- deriv.max <-NULL
### The code above is to avoid CRAN NOTEs like
### ROChange: no visible binding for global variable
if(!(
is.character(problem.vars) &&
0 < length(problem.vars) &&
all(!is.na(problem.vars)) &&
all(problem.vars %in% names(predictions)) &&
all(problem.vars %in% names(models))
)){
stop("problem.vars should be a character vector of column names (IDs for predictions and models)")
}
exp.cols <- c(
"fp", "possible.fp", "fn", "possible.fn", "errors", "labels",
problem.vars, "min.log.lambda", "max.log.lambda")
if(!(
is.data.frame(models) &&
all(exp.cols %in% names(models))
)){
stop("models should have columns ", paste(exp.cols, collapse=", "))
}
for(col.name in exp.cols){
if(any(is.na(models[[col.name]]))){
stop(col.name, " should not be NA")
}
}
if(!(
is.data.frame(predictions) &&
0 < nrow(predictions) &&
"pred.log.lambda" %in% names(predictions)
)){
stop("predictions should be a data.frame with at least one row and a column named pred.log.lambda")
}
if(!all(is.finite(predictions[["pred.log.lambda"]]))){
stop("all predictions must be finite")
}
pred <- data.table(predictions)
bad.pred <- pred[, list(
problems=.N
), by=problem.vars][1 < problems]
if(nrow(bad.pred)){
print(pred[bad.pred, on=problem.vars])
stop("more than one prediction per problem")
}
err <- data.table(models)[pred, on=problem.vars]
setkeyv(err, c(problem.vars, "min.log.lambda"))
err.missing <- err[is.na(labels)]
if(nrow(err.missing)){
print(err.missing)
stop("some predictions do not exist in models")
}
min.max.not.Inf <- err[, list(
min=min(min.log.lambda),
max=max(max.log.lambda)
), by=problem.vars][-Inf < min | max < Inf]
if(nrow(min.max.not.Inf)){
print(min.max.not.Inf)
stop("for every problem, the smallest min.log.lambda should be -Inf, and the largest max.log.lambda should be Inf")
}
err[, next.min := c(min.log.lambda[-1], Inf), by=problem.vars]
inconsistent.counts <- err[, list(
n.inconsistent=sum(next.min != max.log.lambda)
), by=problem.vars][0 < n.inconsistent]
if(nrow(inconsistent.counts)){
print(err[inconsistent.counts, on=problem.vars])
stop("max.log.lambda should be equal to the next min.log.lambda")
}
for(col.name in c("labels", "possible.fp", "possible.fn")){
possible.ranges <- err[, {
x <- .SD[[col.name]]
list(
min=min(x),
max=max(x)
)}, by=problem.vars]
possible.inconsistent <- possible.ranges[min != max]
if(nrow(possible.inconsistent)){
print(possible.inconsistent)
stop(
col.name,
" should be constant for each problem")
}
}
negative <- err[possible.fp<0 | possible.fn<0 | labels<0]
if(nrow(negative)){
print(negative)
stop("possible.fn/possible.fp/labels should be non-negative")
}
possible.name.vec <- c(
errors="labels",
fp="possible.fp",
fn="possible.fn")
for(err.name in names(possible.name.vec)){
poss.name <- possible.name.vec[[err.name]]
poss.num <- err[[poss.name]]
err.num <- err[[err.name]]
out.of.range <- err[poss.num < err.num | err.num < 0]
if(nrow(out.of.range)){
print(out.of.range)
stop(
err.name,
" should be in [0,",
poss.name,
"]")
}
}
first.dt <- err[min.log.lambda==-Inf]
last.dt <- err[max.log.lambda==Inf]
total.dt <- first.dt[, list(
labels=sum(labels),
possible.fp=sum(possible.fp),
possible.fn=sum(possible.fn))]
if(total.dt$possible.fp==0){
stop("no negative labels")
}
if(total.dt$possible.fn==0){
stop("no positive labels")
}
thresh.dt <- err[order(min.log.lambda), {
fp.prb.diff <- diff(fp)
fn.prb.diff <- diff(fn)
any.change <- fp.prb.diff != 0 | fn.prb.diff != 0
data.table(
log.lambda=max.log.lambda[c(any.change, FALSE)],
fp.prb.diff=as.numeric(fp.prb.diff[any.change]),
fn.prb.diff=as.numeric(fn.prb.diff[any.change]))
}, by=problem.vars]
fp.fn.problems <- thresh.dt[pred, on=problem.vars, nomatch=0L]
fp.fn.problems[, thresh := log.lambda - pred.log.lambda]
uniq.thresh <- fp.fn.problems[order(thresh), list(
fp.tot.diff=sum(fp.prb.diff),
fn.tot.diff=sum(fn.prb.diff)
), by=thresh]
fp.fn.totals <- uniq.thresh[, data.table(
min.thresh=c(-Inf, thresh),
max.thresh=c(thresh, Inf),
fp = cumsum(c(sum(first.dt$fp), fp.tot.diff)),
fn = rev(cumsum(rev(c(-fn.tot.diff, sum(last.dt$fn)))))
)]
## Compute aum = area under min(fp,fn).
fp.fn.totals[, min.fp.fn := pmin(fp, fn)]
aum <- fp.fn.totals[, sum(ifelse(
min.fp.fn==0, 0, min.fp.fn*(max.thresh-min.thresh)))]
## Compute directional derivatives coming from lo and hi values. The
## main idea is that we look at what happens if the predicted value
## for a particular problem is decreased, which results in a bigger
## threshold. If that threshold/diff is relevant then it will result
## in a change in the min after the threshold, relative to the
## actual min after the threshold.
pos.or.neg.vec <- c(min=-1, max=1)
for(min.or.max in names(pos.or.neg.vec)){
pos.or.neg <- pos.or.neg.vec[[min.or.max]]
## To compute the directional derivatives of aum we need to join
## fp.fn.totals(min.thresh,max.thresh,fp,fn) to the diffs with
## individual thresholds/problems, fp.fn.problems(
## problem,log.lambda,fp.prb.diff,fn.prb.diff,pred.log.lambda,thresh)
fp.fn.join <- fp.fn.totals[
fp.fn.problems,
.(fp, fn, min.fp.fn, fp.prb.diff, fn.prb.diff),
on=structure("thresh", names=paste0(min.or.max, ".thresh"))]
## changes fp.prb.diff,fn.prb.diff occur at thresh.
## fp,fn are the totals in (min.thresh,max.thresh).
## by joining min.thresh or max.thresh = thresh,
## we get the total fp/fn after or before thresh.
for(fX in c("fp", "fn")){
## here we compute what happens to the fp/fn adjacent to the
## totals, on the other side of thresh, when we apply
## fp.prb.diff,fn.prb.diff for each thresh/problem. e.g. when
## min.or.max=="min" the fp/fn columns give the total fp/fn
## after the thresh where there is a change in the problem fp/fn
## curve, and so here we compute the fp/fn before the thresh.
set(
fp.fn.join,
j=paste0(fX, ".adj"),
value=fp.fn.join[[fX]]+
pos.or.neg*fp.fn.join[[paste0(fX, ".prb.diff")]]
)
}
## here we compute Min(FP,FN) adjacent to the totals, on the other
## side of thresh, when we apply fp.prb.diff,fn.prb.diff for each
## thresh/problem.
fp.fn.join[, min.adj := pmin(fp.adj, fn.adj)]
## finally we check to see how much Min(FP,FN) would change via
## fp.prb.diff,fn.prb.diff at each thresh/problem. this amounts to
## computing min.after-min.before the thresh.
set(
fp.fn.problems,
j=paste0("deriv.", min.or.max),
value=fp.fn.join[, pos.or.neg*(min.adj - min.fp.fn)]
)
}
## Compute TPR/FPR rates for ROC-AUC analysis.
interval.dt <- fp.fn.totals[, data.table(
total.dt,
min.thresh,
max.thresh,
fp, fn, min.fp.fn)]
interval.dt[, errors := fp+fn]
interval.dt[, FPR := fp/possible.fp]
interval.dt[, tp := possible.fn - fn]
interval.dt[, TPR := tp/possible.fn]
interval.dt[, error.percent := 100*errors/labels]
dist00.vec <- interval.dt[c(1, .N), sqrt(FPR^2+TPR^2)]
indices <- if(dist00.vec[1]<dist00.vec[2]){
1:nrow(interval.dt)
}else{
nrow(interval.dt):1
}
sorted.dt <- interval.dt[indices]
roc.polygon <- sorted.dt[, {
has11 <- FPR[.N]==1 & TPR[.N]==1
has00 <- FPR[1]==0 & TPR[1]==0
list(
FPR=c(if(!has00)0, FPR, if(!has11)1, 1),
TPR=c(if(!has00)0, TPR, if(!has11)1, 0)
)
}]
# if this is a sequence from q=1 to Q subscripts are:
left <- roc.polygon[-.N] # _q
right <- roc.polygon[-1] # _{q+1}
##value<< named list of results:
list(
roc=interval.dt, ##<< a data.table with one row for each point on
##the ROC curve
thresholds=rbind(##<< two rows of roc which correspond to the
##predicted and minimal error thresholds
data.table(
threshold="predicted",
interval.dt[min.thresh < 0 & 0 <= max.thresh, ]),
data.table(threshold="min.error", interval.dt[which.min(errors), ])),
auc.polygon=roc.polygon, ##<<a data.table with one row for
##each vertex of the polygon used to
##compute AUC
auc=##<<numeric Area Under the ROC curve
sum((right$FPR-left$FPR)*(right$TPR+left$TPR)/2),
aum=aum, ##<< numeric Area Under Min(FP,FN)
aum.grad=##<< data.table with one row for each prediction, and
##columns hi/lo bound for the aum
##generalized gradient.
fp.fn.problems[pred, .(
lo=sum(deriv.min, na.rm=TRUE),
hi=sum(deriv.max, na.rm=TRUE)
), by=.EACHI, on=problem.vars]
)
##end<<
}, ex=function(){
library(penaltyLearning)
library(data.table)
data(neuroblastomaProcessed, envir=environment())
## Get incorrect labels data for one profile.
pid <- 11
pro.errors <- neuroblastomaProcessed$errors[
profile.id==pid][order(chromosome, min.log.lambda)]
dcast(pro.errors, n.segments ~ chromosome, value.var="errors")
## Get the feature that corresponds to the BIC penalty = log(n),
## meaning log(penalty) = log(log(n)).
chr.vec <- paste(c(1:4, 11, 17))
pid.names <- paste0(pid, ".", chr.vec)
BIC.feature <- neuroblastomaProcessed$feature.mat[pid.names, "log2.n"]
pred <- data.table(pred.log.lambda=BIC.feature, chromosome=chr.vec)
## edit one prediction so that it ends up having the same threshold
## as another one, to illustrate an aum sub-differential with
## un-equal lo/hi bounds.
err.changes <- pro.errors[, {
.SD[c(NA, diff(errors) != 0), .(min.log.lambda)]
}, by=chromosome]
(ch.vec <- err.changes[, structure(min.log.lambda, names=chromosome)])
other <- "11"
(diff.other <- ch.vec[[other]]-pred[other, pred.log.lambda, on=.(chromosome)])
pred["1", pred.log.lambda := ch.vec[["1"]]-diff.other, on=.(chromosome)]
pred["4", pred.log.lambda := 2, on=.(chromosome)]
ch.vec[["1"]]-pred["1", pred.log.lambda, on=.(chromosome)]
result <- ROChange(pro.errors, pred, "chromosome")
library(ggplot2)
## Plot the ROC curves.
ggplot()+
geom_path(aes(FPR, TPR), data=result$roc)+
geom_point(aes(FPR, TPR, color=threshold), data=result$thresholds, shape=1)
## Plot the number of incorrect labels as a function of threshold.
ggplot()+
geom_segment(aes(
min.thresh, errors,
xend=max.thresh, yend=errors),
data=result$roc)+
geom_point(aes((min.thresh+max.thresh)/2, errors, color=threshold),
data=result$thresholds,
shape=1)+
xlab("log(penalty) constant added to BIC penalty")
## Plot area under Min(FP,FN).
err.colors <- c(
"fp"="red",
"fn"="deepskyblue",
"min.fp.fn"="black")
err.sizes <- c(
"fp"=3,
"fn"=2,
"min.fp.fn"=1)
roc.tall <- melt(result$roc, measure.vars=names(err.colors))
area.rects <- data.table(
chromosome="total",
result$roc[0<min.fp.fn])
(gg.total <- ggplot()+
geom_vline(
xintercept=0,
color="grey")+
geom_rect(aes(
xmin=min.thresh, xmax=max.thresh,
ymin=0, ymax=min.fp.fn),
data=area.rects,
alpha=0.5)+
geom_text(aes(
min.thresh, min.fp.fn/2,
label=sprintf(
"Area Under Min(FP,FN)=%.3f ",
result$aum)),
data=area.rects[1],
hjust=1,
color="grey50")+
geom_segment(aes(
min.thresh, value,
xend=max.thresh, yend=value,
color=variable, size=variable),
data=data.table(chromosome="total", roc.tall))+
scale_size_manual(values=err.sizes)+
scale_color_manual(values=err.colors)+
theme_bw()+
theme(panel.grid.minor=element_blank())+
scale_x_continuous(
"Prediction threshold")+
scale_y_continuous(
"Incorrectly predicted labels",
breaks=0:10))
## Add individual error curves.
tall.errors <- melt(
pro.errors[pred, on=.(chromosome)],
measure.vars=c("fp", "fn"))
gg.total+
geom_segment(aes(
min.log.lambda-pred.log.lambda, value,
xend=max.log.lambda-pred.log.lambda, yend=value,
size=variable, color=variable),
data=tall.errors)+
facet_grid(chromosome ~ ., scales="free", space="free")+
theme(panel.spacing=grid::unit(0, "lines"))+
geom_blank(aes(
0, errors),
data=data.table(errors=c(1.5, -0.5)))
print(result$aum.grad)
if(interactive()){#this can be too long for CRAN.
## Plot how Area Under Min(FP,FN) changes with each predicted value.
aum.dt <- pred[, {
data.table(log.pen=seq(0, 4, by=0.5))[, {
chr <- paste(chromosome)
new.pred.dt <- data.table(pred)
new.pred.dt[chr, pred.log.lambda := log.pen, on=.(chromosome)]
with(
ROChange(pro.errors, new.pred.dt, "chromosome"),
data.table(aum))
}, by=log.pen]
}, by=chromosome]
bounds.dt <- melt(
result$aum.grad,
measure.vars=c("lo", "hi"),
variable.name="bound",
value.name="slope")[pred, on=.(chromosome)]
bounds.dt[, intercept := result$aum-slope*pred.log.lambda]
ggplot()+
geom_abline(aes(
slope=slope, intercept=intercept),
size=1,
data=bounds.dt)+
geom_text(aes(
2, 2, label=sprintf("directional derivatives = [%d, %d]", lo, hi)),
data=result$aum.grad)+
scale_color_manual(
values=c(
predicted="red",
new="black"))+
geom_point(aes(
log.pen, aum, color=type),
data=data.table(type="new", aum.dt))+
geom_point(aes(
pred.log.lambda, result$aum, color=type),
shape=1,
data=data.table(type="predicted", pred))+
theme_bw()+
theme(panel.spacing=grid::unit(0, "lines"))+
facet_wrap("chromosome", labeller=label_both)+
coord_equal()+
xlab("New log(penalty) value for chromosome")+
ylab("Area Under Min(FP,FN)
using new log(penalty) for this chromosome
and predicted log(penalty) for others")
}
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.