R/utility.function.R

Defines functions xlimits ylimits static.labels only.unique.vals chull.points ahull.points default.ahull make.tiebreaker vertical.qp outside.chull outside.ahull extreme.points label.pieces in1box enlarge.box ignore.na bumpup draw.rects draw.polygons midrange far.from.others.borders

Documented in ahull.points bumpup chull.points default.ahull draw.polygons draw.rects enlarge.box extreme.points far.from.others.borders ignore.na in1box label.pieces make.tiebreaker midrange only.unique.vals outside.ahull outside.chull static.labels vertical.qp xlimits ylimits

### Find the point on each curve which maximizes the distance to the
### plot border or to another curve.
far.from.others.borders <- function(all.groups,...,debug=FALSE){
  group.data <- split(all.groups, all.groups$group)
  group.list <- list()
  for(groups in names(group.data)){
    ## Run linear interpolation to get a set of points on which we
    ## could place the label (this is useful for e.g. the lasso path
    ## where there are only a few points plotted).
    one.group <- group.data[[groups]]
    approx.list <- with(one.group, approx(x, y))
    if(debug){
      with(approx.list, grid.points(x, y, default.units="cm"))
    }
    group.list[[groups]] <- data.frame(
      approx.list,
      groups,
      label=one.group$label[1])
  }
  output.list <- list()
  for(group.i in seq_along(group.list)){
    one.group <- group.list[[group.i]]
    ## From Mark Schmidt: "For the location of the boxes, I found the
    ## data point on the line that has the maximum distance (in the
    ## image coordinates) to the nearest data point on another line or
    ## to the image boundary."
    dist.mat <- matrix(NA, length(one.group$x), 3)
    colnames(dist.mat) <- c("x","y","other")
    ## dist.mat has 3 columns: the first two are the shortest distance
    ## to the nearest x and y border, and the third is the shortest
    ## distance to another data point.
    for(xy in c("x", "y")){
      xy.vec <- one.group[,xy]
      xy.mat <- rbind(xy.vec, xy.vec)
      lim.fun <- get(sprintf("%slimits", xy))
      diff.mat <- xy.mat - lim.fun()
      dist.mat[,xy] <- apply(abs(diff.mat), 2, min)
    }
    other.groups <- group.list[-group.i]
    other.df <- do.call(rbind, other.groups)
    for(row.i in 1:nrow(dist.mat)){
      r <- one.group[row.i,]
      other.dist <- with(other.df, (x-r$x)^2 + (y-r$y)^2)
      dist.mat[row.i,"other"] <- sqrt(min(other.dist))
    }
    shortest.dist <- apply(dist.mat, 1, min)
    picked <- calc.boxes(one.group[which.max(shortest.dist),])
    ## Mark's label rotation: "For the angle, I computed the slope
    ## between neighboring data points (which isn't ideal for noisy
    ## data, it should probably be based on a smoothed estimate)."
    left <- max(picked$left, min(one.group$x))
    right <- min(picked$right, max(one.group$x))
    neighbors <- approx(one.group$x, one.group$y, c(left, right))
    slope <- with(neighbors, (y[2]-y[1])/(x[2]-x[1]))
    picked$rot <- 180*atan(slope)/pi
    output.list[[group.i]] <- picked
  }
  output <- do.call(rbind, output.list)
  ##browser()
  output
}

label.endpoints <- function
### Make a Positioning Method that labels a certain x value.
(FUN,
### FUN(d$x) should return an index of which point to label. for
### example you can use which.min or which.max.
 HJUST
### hjust of the labels.
 ){
  stopifnot(is.function(FUN))
  stopifnot(length(HJUST)==1)
  stopifnot(is.numeric(HJUST))
  stopifnot(is.finite(HJUST))
  function(d,...)gapply(d,function(d,...){
    i <- FUN(d$x)==d$x
    if(length(i)==0){
      data.frame()
    }else{
      sub.df <- d[i,]
      if(nrow(sub.df) > 1){
        y.target <- mean(range(sub.df$y))
        sub.df <- sub.df[1,]
        sub.df$y <- y.target
      }
      sub.df$hjust <- HJUST
      sub.df$vjust <- 0.5
      sub.df
    }
  })
### A Positioning Method like first.points or last.points.
}

dl.combine <- structure(function # Combine output of several methods
### Apply several Positioning methods to the original data frame.
(...
### Several Positioning Methods.
 ){
  FUNS <- list(...)
  pf <- function(d,...){
    dfs <- lapply(FUNS,apply.method,d,...)
    res <- data.frame()
    for(df in dfs){
      ## if cex is undefined, we will get NAs which will not be
      ## plotted.
      if(!"cex"%in%names(df)){
        df$cex <- 1
      }

      ## we need to do merge to keep all the columns around.
      if(nrow(res))res <- merge(df,res,all=TRUE)
      else res <- df
    }
    res
  }
  pf
### A Positioning Method that returns the combined data frame after
### applying each specified Positioning Method.
},ex=function(){

  ## Simple example: label the start and endpoints
  if(require(nlme) && require(lattice)){
    ratplot <- xyplot(
      weight~Time|Diet,BodyWeight,groups=Rat,type='l',layout=c(3,1))
    both <- dl.combine("first.points","last.points")
    rat.both <- direct.label(ratplot,"both")
    print(rat.both)
    ## same as repeated call to direct.label:
    rat.repeated <-
      direct.label(direct.label(ratplot,"last.points"),"first.points")
    print(rat.repeated)
  }
  
  ## same with ggplot2:
  if(require(nlme) && require(ggplot2)){
    rp2 <- qplot(
      Time,weight,data=BodyWeight,geom="line",facets=.~Diet,colour=Rat)
    print(direct.label(direct.label(rp2,"last.points"),"first.points"))
    print(direct.label(rp2,"both"))
  }

  ## more complex example: first here is a function for computing the
  ## lasso path.
  mylars <- function
  ## Least angle regression algorithm for calculating lasso solutions.
  (x,
   ## Matrix of predictor variables.
   y,
   ## Vector of responses.
   epsilon=1e-6
   ## If correlation < epsilon, we are done.
   ){
    xscale <- scale(x) # need to work with standardized variables
    b <- rep(0,ncol(x))# coef vector starts at 0
    names(b) <- colnames(x)
    ycor <- apply(xscale,2,function(xj)sum(xj*y))
    j <- which.max(ycor) # variables in active set, starts with most correlated
    alpha.total <- 0
    out <- data.frame()
    while(1){## lar loop
      xak <- xscale[,j] # current variables
      r <- y-xscale%*%b # current residual
      ## direction of parameter evolution
      delta <- solve(t(xak)%*%xak)%*%t(xak)%*%r
      ## Current correlations (actually dot product)
      intercept <- apply(xscale,2,function(xk)sum(r*xk))
      ## current rate of change of correlations
      z <- xak%*%delta
      slope <- apply(xscale,2,function(xk)-sum(z*xk))
      ## store current values of parameters and correlation
      out <- rbind(out,data.frame(variable=colnames(x),
                                  coef=b,
                                  corr=abs(intercept),
                                  alpha=alpha.total,
                                  arclength=sum(abs(b)),
                                  coef.unscaled=b/attr(xscale,"scaled:scale")))
      if(sum(abs(intercept)) < epsilon)#corr==0 so we are done
        return(transform(out,s=arclength/max(arclength)))
      ## If there are more variables we can enter into the regression,
      ## then see which one will cross the highest correlation line
      ## first, and record the alpha value of where the lines cross.
      d <- data.frame(slope,intercept)
      d[d$intercept<0,] <- d[d$intercept<0,]*-1
      d0 <- data.frame(d[j[1],])# highest correlation line
      d2 <- data.frame(rbind(d,-d),variable=names(slope))#reflected lines
      ## Calculation of alpha for where lines cross for each variable
      d2$alpha <- (d0$intercept-d2$intercept)/(d2$slope-d0$slope)
      subd <- d2[(!d2$variable%in%colnames(x)[j])&d2$alpha>epsilon,]
      subd <- subd[which.min(subd$alpha),]
      nextvar <- subd$variable
      alpha <- if(nrow(subd))subd$alpha else 1
      ## If one of the coefficients would hit 0 at a smaller alpha
      ## value, take it out of the regression and continue.
      hit0 <- xor(b[j]>0,delta>0)&b[j]!=0
      alpha0 <- -b[j][hit0]/delta[hit0]
      takeout <- length(alpha0)&&min(alpha0) < alpha
      if(takeout){
        i <- which.min(alpha0)
        alpha <- alpha0[i]
      }
      b[j] <- b[j]+alpha*delta ## evolve parameters
      alpha.total <- alpha.total+alpha
      ## add or remove a variable from the active set
      j <- if(takeout)j[j!=which(names(i)==colnames(x))]
           else c(j,which(nextvar==colnames(x)))
    }
  }

  ## Calculate lasso path, plot labels at two points: (1) where the
  ## variable enters the path, and (2) at the end of the path.
  if(require(lars) && require(lattice)){
    data(diabetes,envir=environment())
    dres <- with(diabetes,mylars(x,y))
    P <- xyplot(coef~arclength,dres,groups=variable,type="l")
    mylasso <- dl.combine("lasso.labels", "last.qp")
    plot(direct.label(P,"mylasso"))
  }

})

gapply.fun <- structure(function # Direct label groups independently
### Makes a function you can use to specify the location of each group
### independently.
(expr
### Expression that takes a subset of the d data frame, with data from
### only a single group, and returns the direct label position.
 ){
  foo <- substitute(expr)
  f <- function(d,...)eval(foo)
  src <- paste("gapply.fun(",paste(deparse(foo),collapse="\n"),")",sep="")
  pf <- structure(function(d,...)gapply(d,f,...),"source"=src)
  pf
### A Positioning Function.
},ex=function(){
  complicated <- list(dl.trans(x=x+10),
                      gapply.fun(d[-2,]),
                      rot=c(30,180))
  if(require(lattice)){
    direct.label(dotplot(VADeaths,type="o"),complicated,TRUE)
  }
})

dl.trans <- structure(function # Direct label data transform
### Make a function that transforms the data. This is for conveniently
### making a function that calls transform on the data frame, with the
### arguments provided. See examples.
(...
### Arguments to pass to transform.
 ){
  L <- as.list(match.call())[-1]
  pf <- function(d,...)do.call("transform",c(list(d),L))
  pf
### A Positioning Function.
},ex=function(){
  complicated <- list(dl.trans(x=x+10),
                      gapply.fun(d[-2,]),
                      rot=c(30,180))
  if(require(lattice)){
    direct.label(dotplot(VADeaths,type="o"),complicated,TRUE)
  }
})

dl.move <- structure(function # Manually move a direct label
### Sometimes there is 1 label that is placed oddly by another
### Positioning Function. This function can be used to manually place
### that label in a good spot.
(group,
### Group to change.
 x,
### Horizontal position of the new label.
 y,
### Vertical position of the new label. If missing(y) and !missing(x)
### then we will calculate a new y value using linear interpolation.
 ...
### Variables to change for the specified group
 ){
  L <- list(...)
  pos <- list()
  if(!missing(x))pos$x <- x
  if(!missing(y))pos$y <- y
  pf <- function(d,...,axes2native){
    native <- axes2native(do.call(data.frame,pos))
    ## first convert user-specified axes units to cm
    for(var in names(pos)){
      u <- unit(native[[var]],"native")
      L[[var]] <- convertUnit(u,"cm",var,"location",var,"location")
    }
    v <- d$groups==group
    for(N in names(L))
      d[v,N] <- L[[N]]
    ## maybe generalize this to be symmetric on x and y one day?
    if("x" %in% names(L) && (!"y" %in% names(L))){
      orig <- attr(d,"orig.data")
      orig <- orig[orig$label==group,]
      ## do linear interpolation to find a good y-value
      f <- with(orig,approxfun(x,y))
      d[v,"y"] <- f(L$x)
    }
    d
  }
  pf
### A Positioning Function that moves a label into a good spot.
},ex=function(){
  if(require(ggplot2) && require(lattice)){
    scatter <- xyplot(jitter(cty)~jitter(hwy),mpg,groups=class,aspect=1)
    dlcompare(list(scatter),
              list("extreme.grid",
                   `+dl.move`=list(extreme.grid,dl.move("suv",15,15))))
    p <- qplot(log10(gamma),rate,data=svmtrain,group=data,colour=data,
               geom="line",facets=replicate~nu)
    adjust.kif <- dl.move("KIF11",-0.9,hjust=1,vjust=1)
    dlcompare(list(p+xlim(-8,7)),
              list("last.points",
                   `+dl.move`=list(last.points,adjust.kif)))
  }
})

### Jitter the label positions.
dl.jitter <- dl.trans(x=jitter(x),y=jitter(y))

calc.boxes <- function
### Calculate boxes around labels, for collision detection.
(d,
 debug=FALSE,
 ...
 ){
  vp <- current.viewport()
  convert <- function(str.prop, worh=str.prop){
    conv <- get(paste("convert",worh,sep=""))
    stri <- get(paste("string", str.prop, sep=""))
    as.numeric(sapply(seq_along(d$groups),function(i){
      if("cex"%in%names(d))vp$gp <- gpar(cex=d$cex[i])
      pushViewport(vp)
      if(debug)grid.rect() ##highlight current viewport
      cm <- conv(stri(as.character(d$label[i])),"cm")
      popViewport()
      cm
    }))
  }
  ## abs since we have a weird bug with ggplot2 sometimes
  d$w <- abs(convert("Width"))
  d$h <- abs(convert("Height"))
  d$descent <- abs(convert("Descent", "Height"))
  calc.borders(d)
}

### Calculate big boxes around the means of each cluster.
big.boxes <- list("get.means","calc.boxes","enlarge.box")

### Point halfway between the min and max
midrange <- function(x){
  r <- range(x)
  (r[2]-r[1])/2+r[1]
}

### Point in the middle of the min and max for each group.
visualcenter <- gapply.fun(dl.summarize(d,x=midrange(x),y=midrange(y)))

### Positioning Function for the mean of each cluster of points.
get.means <-
  gapply.fun(dl.summarize(d,x=mean(x),y=mean(y)))

calc.borders <- function
### Calculate bounding box based on newly calculated width and height.
(d,
### Data frame of point labels, with new widths and heights in the w
### and h columns.
 ...
### ignored.
 ){
  for(just in c("hjust","vjust")){
    if(!just %in% names(d)){
      d[,just] <- 0.5
    }
  }
  if(!"descent" %in% names(d)){
    d$descent <- 0
  }
  d$top <- d$y+(1-d$vjust)*d$h
  d$bottom <- d$y-d$vjust*d$h - d$descent
  d$right <- d$x+(1-d$hjust)*d$w
  d$left <- d$x-d$hjust*d$w
  d
}

polygon.method <- function
### Make a Positioning Method that places non-overlapping speech
### polygons at the first or last points.
(top.bottom.left.right,
### Character string indicating what side of the plot to label.
  offset.cm=0.1,
### Offset from the polygon to the most extreme data point.
  padding.cm=0.05,
### Padding inside the polygon.
  custom.colors=NULL
### Positioning method applied just before draw.polygons, can set
### box.color and text.color for custom colors.
){
  if(is.null(custom.colors)){
    custom.colors <- gapply.fun({
      rgb.mat <- col2rgb(d[["colour"]])
      d$text.color <- with(data.frame(t(rgb.mat)), {
        gray <- 0.3*red + 0.59*green + 0.11*blue
        ifelse(gray/255 < 0.5, "white", "black")
      })
      d
    })
  }
  opposite.side <- c(
    left="right",
    right="left",
    top="bottom",
    bottom="top")[[top.bottom.left.right]]
  direction <- if(
    top.bottom.left.right %in% c("bottom", "left")
  ) -1 else 1
  min.or.max <- if(
    top.bottom.left.right %in% c("top", "right")
  ) max else min
  if(top.bottom.left.right %in% c("left", "right")){
    min.or.max.xy <- "x"
    qp.target <- "y"
    qp.max <- "top"
    qp.min <- "bottom"
    padding.h.factor <- 2
    padding.w.factor <- 1
    limits.fun <- ylimits
    reduce.method <- "reduce.cex.lr"
  }else{
    min.or.max.xy <- "y"
    qp.target <- "x"
    qp.max <- "right"
    qp.min <- "left"
    padding.h.factor <- 1
    padding.w.factor <- 2
    limits.fun <- xlimits
    reduce.method <- "reduce.cex.tb"
  }
  list(
    paste0(top.bottom.left.right, ".points"),
    function(d,...){
      ## set the end of the speech polygon to the original data point.
      for(xy in c("x", "y")){
        extra.coord <- sprintf(# e.g. left.x
          "%s.%s", opposite.side, xy)
        d[[extra.coord]] <- d[[xy]]
      }
      ## set the speech polygon position to the min or max of all
      ## label positions. e.g. max
      d[[min.or.max.xy]] <- min.or.max(d[[min.or.max.xy]]) + offset.cm*direction
      d
    },
    "calc.boxes",
    reduce.method,
    function(d, ...){
      d$h <- d$h + padding.cm * padding.h.factor
      d$w <- d$w + padding.cm * padding.w.factor
      d
    },
    "calc.borders",
    qp.labels(
      qp.target,
      qp.min,
      qp.max,
      make.tiebreaker(min.or.max.xy, qp.target),
      limits.fun),
    "calc.borders",
    custom.colors,
    "draw.polygons")
}

### Draw polygons around label positions.
draw.polygons <- function(d,...){
  for(side in c("left", "right", "top", "bottom")){
    for(xy in c("x", "y")){
      col.name <- paste0(side, ".", xy)
      if(!col.name %in% names(d)){
        d[[col.name]] <- NA
      }
    }
  }
  if(! "box.color" %in% names(d)){
    d$box.color <- "black"
  }
  if(! "text.color" %in% names(d)){
    d$text.color <- "white"
  }
  for(i in 1:nrow(d))with(d[i,], {
    L <- list(
      x=c(left.x, left, top.x, right, right.x, right, bottom.x, left),
      y=c(left.y, top, top.y, top, right.y, bottom, bottom.y, bottom))
    for(xy.name in names(L)){
      xy <- L[[xy.name]]
      L[[xy.name]] <- xy[!is.na(xy)]
    }
    grid::grid.polygon(
      L$x, L$y,
      default.units="cm",
      gp=grid::gpar(col=box.color, fill=colour),
      name="directlabels.draw.polygon"
    )
  })
  d$colour <- d$text.color
  d
}

### Positioning Function that draws boxes around label positions. Need
### to have previously called calc.boxes. Does not edit the data
### frame.
draw.rects <- function(d,...){
  if(is.null(d$box.color))d$box.color <- "black"
  if(is.null(d$fill))d$fill <- "white"
  for(i in 1:nrow(d)){
    with(d[i,], grid.rect(
      gp = gpar(col = box.color, fill = fill),
      vp = viewport(x, y, w, h, "cm", c(hjust, vjust), angle=rot),
      name="directlabels.draw.rects"
    ))
  }
  d
}

### Sequentially bump labels up, starting from the bottom, if they
### collide with the label underneath.
bumpup <- function(d,...){
  ## If there is only 1, then there is no collision detection to do.
  if(nrow(d) == 1)return(d)
  d <- calc.boxes(d)[order(d$y),]
  "%between%" <- function(v,lims)lims[1]<v&v<lims[2]
  obox <- function(x,y){
    tocheck <- with(x,c(left,(right-left)/2+left,right))
    tocheck %between% with(y,c(left,right))
  }
  for(i in 2:nrow(d)){
    dif <- d$bottom[i]-d$top[i-1]
    ## here we are trying to test if box i can possibly collide with
    ## the box below it! Originally we checked if the bottom points of
    ## this box fall in the box below it, but this causes problems
    ## since we are reassigning box positions. If all boxes start at
    ## the same place, 2 will get moved up, 3 will not since its
    ## bottom points are no longer inside box 2. Solution: Look at box
    ## left and right limits and see if they collide!

    ## GOTCHA: If all the boxes are exactly the same size, on top of
    ## each other, then if we only examine left and right points of
    ## each box, none of the boxes will be detected as
    ## overlapping. One way to fix this is change > to >= in %between%
    ## but this is a bad idea since you can have boxes right next to
    ## each other that we don't want to move, that would be detected
    ## as overlapping. Solution: use the midpoint of the box as well!
    overlap <- c(obox(d[i,],d[i-1,]),obox(d[i-1,],d[i,]))
    if(dif < 0 && any(overlap)){
      d$bottom[i] <- d$bottom[i]-dif
      d$top[i] <- d$top[i]-dif
      d$y[i] <- d$y[i]-dif
    }
  }
  d
}

### Remove rows for which either x or y is NA
ignore.na <- function(d,...){
  not.na <- is.finite(d$x)
  if("y"%in% names(d)){
    not.na <- not.na & is.finite(d$y)
  }
  d[not.na,]
}

qp.labels <- structure(function# Make a Positioning Method for non-overlapping lineplot labels
### Use a QP solver to find the best places to put the points on a
### line, subject to the constraint that they should not overlap.
(target.var,
### Variable name of the label target.
 lower.var,
### Variable name of the lower limit of each label bounding box.
 upper.var,
### Variable name of the upper limit of each label bounding box.
 order.labels=function(d)order(d[,target.var]),
### Function that takes the data.frame of labels and returns an
### ordering, like from the order function. That ordering will be used
### to reorder the rows. This is useful to e.g. break ties when two
### groups have exactly the same value at the endpoint near the label.
 limits=NULL
### Function that takes the data.frame of labels an returns a numeric
### vector of length 2. If finite, these values will be used to add
### constraints to the QP: limits[1] is the lower limit for the first
### label's lower.var, and limits[2] is the upper limit for the last
### labels's upper.var. Or NULL for no limits.
 ){
  ## Reality checks. These also have the side effect of forcing
  ## evaluation of all the arguments in the returned closure.
  stopifnot(is.function(order.labels))
  essential <- list(target.var,upper.var,lower.var)
  for(v in essential){
    stopifnot(is.character(v))
    stopifnot(length(v)==1)
  }
  stopifnot(is.function(limits)||is.null(limits))

  function(d,...){

    ## If there is only 1 label, there is no collision detection to
    ## do, so just return it.
    if(nrow(d)==1)return(d)

    ##browser()

    ## Reality checks.
    for(v in essential){
      if(! v %in% names(d)){
        stop("need to have calculated ",v)
      }
    }

    ## sorts data so that target_1 <= target_2 <= ... <= target_n.
    d <- d[order.labels(d),]

    ## check limits to see if there is enough space, given specified
    ## cex.
    if(is.function(limits)){
      l <- limits(d)
      stopifnot(is.numeric(l))
      stopifnot(length(l)==2)
      stopifnot(l[1]<l[2])

      h.available <- l[2] - l[1]
      h <- d[,upper.var]-d[,lower.var]
      h.occupied <- sum(h)
      if(h.occupied > h.available){ ## then the feasible set is empty.
        ## total hack:
        cex <- h.available / h.occupied  * 0.9
        if("cex" %in% names(d)){
          d$cex <- d$cex * cex
        }else{
          d$cex <- cex
        }
        d <- calc.boxes(d)
      }
    }

    ## These are the standard form matrices described in the
    ## directlabels poster.
    target <- d[,target.var]
    k <- nrow(d)
    D <- diag(rep(1,k))
    Ik <- diag(rep(1,k-1))
    A <- rbind(0,Ik)-rbind(Ik,0)
    y.up <- d[,upper.var]
    y.lo <- d[,lower.var]
    b0 <- (y.up-target)[-k] + (target-y.lo)[-1]

    ## limit constraints.
    if(is.function(limits)){
      if(is.finite(l[1])){
        c.vec <- rep(0,k)
        c.vec[1] <- 1
        A <- cbind(A,c.vec)
        b0 <- c(b0,l[1]+target[1]-y.lo[1])
      }
      if(is.finite(l[2])){
        c.vec <- rep(0,k)
        c.vec[k] <- -1
        A <- cbind(A,c.vec)
        b0 <- c(b0,y.up[k]-target[k]-l[2])
      }
    }

    ##print(A)
    ##print(b0)
    ##browser()
    sol <- solve.QP(D,target,A,b0)
    d[,target.var] <- sol$solution
    d
  }
### Positioning Method that adjusts target.var so there is no overlap
### of the label bounding boxes, as specified by upper.var and
### lower.var.
},ex=function(){
  SegCost$error <- factor(SegCost$error,c("FP","FN","E","I"))
  if(require(ggplot2)){
    fp.fn.colors <- c(FP="skyblue",FN="#E41A1C",I="black",E="black")
    fp.fn.sizes <- c(FP=2.5,FN=2.5,I=1,E=1)
    fp.fn.linetypes <- c(FP="solid",FN="solid",I="dashed",E="solid")
    err.df <- subset(SegCost,type!="Signal")

    kplot <- ggplot(err.df,aes(segments,cost))+
      geom_line(aes(colour=error,size=error,linetype=error))+
      facet_grid(type~bases.per.probe)+
      scale_linetype_manual(values=fp.fn.linetypes)+
      scale_colour_manual(values=fp.fn.colors)+
      scale_size_manual(values=fp.fn.sizes)+
      scale_x_continuous(limits=c(0,20),breaks=c(1,7,20),minor_breaks=NULL)+
      theme_bw()+theme(panel.margin=grid::unit(0,"lines"))

    ## The usual ggplot without direct labels.
    print(kplot)

    ## Get rid of legend for direct labels.
    no.leg <- kplot+guides(colour="none",linetype="none",size="none")

    ## Default direct labels.
    direct.label(no.leg)

    ## Explore several options for tiebreaking and limits. First let's
    ## make a qp.labels Positioning Method that does not tiebreak.
    no.tiebreak <- list("first.points",
                        "calc.boxes",
                        qp.labels("y","bottom","top"))
    direct.label(no.leg, no.tiebreak)

    ## Look at the weird labels in the upper left panel. The E curve is
    ## above the FN curve, but the labels are the opposite! This is
    ## because they have the same y value on the first points, which are
    ## the targets for qp.labels. We need to tiebreak.
    qp.break <- qp.labels("y","bottom","top",make.tiebreaker("x","y"))
    tiebreak <- list("first.points",
                     "calc.boxes",
                     "qp.break")
    direct.label(no.leg, tiebreak)

    ## Enlarge the text size and spacing.
    tiebreak.big <- list("first.points",
                         cex=2,
                         "calc.boxes",
                         dl.trans(h=1.25*h),
                         "calc.borders",
                         "qp.break")
    direct.label(no.leg, tiebreak.big)

    ## Even on my big monitor, the FP runs off the bottom of the screen
    ## in the top panels. To avoid that you can specify a limits
    ## function.

    ## Below, the ylimits function uses the limits of each panel, so
    ## labels appear inside the plot region. Also, if you resize your
    ## window so that it is small, you can see that the text size of the
    ## labels is decreased until they all fit in the plotting region.
    qp.limited <-  qp.labels("y","bottom","top",make.tiebreaker("x","y"),ylimits)
    tiebreak.lim <- list("first.points",
                         cex=2,
                         "calc.boxes",
                         dl.trans(h=1.25*h),
                         "calc.borders",
                         "qp.limited")
    direct.label(no.leg, tiebreak.lim)
  }
})


### Make text bounding box larger by some amount.
enlarge.box <- function(d,...){
  if(!"h"%in%names(d))stop("need to have already calculated height and width.")
  calc.borders(within(d,{
    w <- w+h
    h <- h+h
  }))
}

in1which <- function
### Calculate which points fall in a box.
(p,
### data frame of points with columns x and y and many rows.
 box
### data frame of 1 row with columns left right top bottom.
 ){
  p$x>=box$left & p$x<=box$right & p$y<=box$top & p$y>=box$bottom
}

### Calculate how many points fall in a box.
in1box <- function(p,box)sum(in1which(p,box))

### Make a Positioning Method that will, for every piece, select
### points and assign a vjust value.
label.pieces <- function(FUN,VJUST){
  function(d,...){
    processed <- gapply(d,function(d,...)d[FUN(d$y),],groups="piece")
    transform(processed,hjust=0.5,vjust=VJUST)
  }
}

inside <- function
### Calculate for each box how many points are inside.
(boxes,
### Data frame of box descriptions, each row is 1 box, need columns
### left right top bottom.
 points
### Data frame of points, each row is 1 point, need columns x y.
 ){
  sapply(1:nrow(boxes),function(i)in1box(points,boxes[i,]))
### Vector of point counts for each box.
}

dl.summarize <- function
### summarize which preserves important columns for direct labels.
(OLD,
### data frame
 ...
 ){
  rownames(OLD) <- NULL
  NEW <- unique(transform(OLD,...))
  to.copy <- names(OLD)[!names(OLD)%in%names(NEW)]
  for(N in to.copy)
    NEW[,N] <- OLD[,N]
  NEW
}

gapply <- function
### apply a Positioning Method to every group. works like ddply from
### plyr package, but the grouping column is always called groups, and
### the Positioning Method is not necessarily a function (but can be).
(d,
### data frame with column groups.
 method,
### Positioning Method to apply to every group separately.
 ...,
### additional arguments, passed to Positioning Methods.
 groups="groups"
### can also be useful for piece column.
 ){
  stopifnot(is.data.frame(d))
  dfs <- split(d,as.character(d[[groups]]))
  f <- function(d,...){
    res <- apply.method(method,d,columns.to.check=c("x","y"),...)
    if(nrow(res)){
      res[[groups]] <- d[[groups]][1]
      res[["label"]] <- d[["label"]][1]
    }
    res
  }
  results <- lapply(dfs,f,...)
  if(any(!sapply(results,is.data.frame))){
    print(results)
    stop("Positioning Method did not return data.frame")
  }
  do.call(rbind,results)
### data frame of results after applying FUN to each group in d.
}

### Label the points furthest from the middle for each group.
extreme.points <- function(d,...){
  d$dist.from.center <- sqrt((d$x-midrange(d$x))^2+(d$y-midrange(d$y))^2)
  gapply(d,function(d,...)d[which.max(d$dist.from.center),])
}

edges.to.outside <- function
### Given a list of edges from the convex or alpha hull, and a list of
### cluster centers, calculate a point near to each cluster on the
### outside of the hull.
(edges,centers,debug=FALSE,...){
  if(debug){
    with(centers,grid.points(
      x,y,pch="+",default.units="cm",
      name="directlabels.points.edges.to.outside"
    ))
    with(edges,grid.segments(
      x1,y1,x2,y2,default.units="cm",
      name="directlabels.segments.edges.to.outside"
    ))
  }
  closepts <- gapply(centers,project.onto.segments,edges,debug=debug,...)
  closepts$vjust <- ifelse(closepts$y-centers$y>0,0,1)
  closepts$hjust <- ifelse(closepts$x-centers$x>0,0,1)
  r <- apply.method("big.boxes",closepts)
  r$x <- (r$right-r$left)/2+r$left
  r$y <- (r$top-r$bottom)/2+r$bottom
  r$hjust <- 0.5
  r$vjust <- 0.5
  r
}

### Calculate closest point on the alpha hull with size of the boxes,
### and put it outside that point.
outside.ahull <- function(d,...){
  edges.to.outside(ahull.points(d),visualcenter(d),...)
}

### Calculate closest point on the convex hull and put it outside that
### point. Assume d is the center for each point cloud and then use
### orig.data to calculate hull.
outside.chull <- function(d,...){
  edges.to.outside(chull.points(d),visualcenter(d),...)
}

project.onto.segments <- function
### Given a point and a set of line segments representing a convex or
### alpha hull, calculate the closest point on the segments.
(m,
### m is 1 row, a center of a point cloud, we need to find the
### distance to the closest point on each segment of the convex
### hull.
 h,
### Data frame describing the line segments of the convex or alpha
### hull.
 debug=FALSE,
 ...
### ignored
 ){
  h$s <- (h$y2-h$y1)/(h$x2-h$x1)
  ## the closest point on the line formed by expanding this line
  ## segment (this expression is calculated by finding the minimum
  ## of the distance function).
  h$xstar <- (m$x + m$y*h$s + h$x1*h$s^2 - h$s*h$y1)/(h$s^2+1)
  h$minval <- apply(cbind(h$x1,h$x2),1,min)
  h$maxval <- apply(cbind(h$x1,h$x2),1,max)
  ## xopt is the closest point on the line segment
  h$xopt <- ifelse(h$xstar<h$minval,h$minval,
                   ifelse(h$xstar>h$maxval,h$maxval,h$xstar))
  h$yopt <- h$s*(h$xopt-h$x1)+h$y1
  ## distance to each point on line segment from the center
  h$d <- (m$x-h$xopt)^2+(m$y-h$yopt)^2
  i <- which.min(h$d)
  result <- with(h[i,],data.frame(x=xopt,y=yopt))
  if(debug){
    grid.segments(
      m$x,m$y,result$x,result$y,default.units="cm",
      name="directlabels.segments.project.onto.segments"
    )
  }
  result
}

### Make a Positioning Function from a set of points on a vertical
### line that will be spaced out using qp.labels.
vertical.qp <- function(M){
  avoid.collisions <-
    qp.labels("y","bottom","top",make.tiebreaker("x","y"),ylimits)
  list(M,"reduce.cex.lr",avoid.collisions)
}

### Make a tiebreaker function that can be used with qp.labels.
make.tiebreaker <- function(x.var,tiebreak.var){
  force(x.var)
  force(tiebreak.var)
  function(d,...){
    orig <- attr(d,"orig.data")
    xvals <- unique(orig[,x.var])
    x <- unique(d[,x.var])
    if(length(x)>1){
      stop("labels are not aligned")
    }
    xvals <- xvals[order(abs(xvals-x))]
    group.dfs <- split(orig,orig$groups)
    glist <- lapply(d$groups,function(g){
      df <- group.dfs[[as.character(g)]]
      group.x <- df[,x.var]
      group.y <- df[,tiebreak.var]
      all.unique <- length(unique(group.x)) == length(group.x)
      ## approx gives the following error if we only have one point:
      ## need at least two non-NA values to interpolate
      x.not.missing <- !is.na(group.x)
      y.not.missing <- !is.na(group.y)
      not.missing <- sum(x.not.missing & y.not.missing)
      if(all.unique && 1 < not.missing){
        ## rule=2 means to use the most extreme value instead of the
        ## default NA, for any points that are outside the range -
        ## this is required to get a good ordering in some cases.
        approx(group.x, group.y, xvals, rule=2)$y
      }else{
        iord <- order(abs(group.x-x))
        closest <- iord[1]
        rep(group.y[closest], length(xvals))
      }
    })
    m <- do.call(cbind,glist)
    ## useful for debugging:
    ##print(m)
    L <- lapply(1:nrow(m),function(i)m[i,])
    do.call(order,L)
  }
}

### Calculate the default alpha parameter for ashape based on the
### average size of label boxes.
default.ahull <- function(d,...){
  labels <- apply.method("big.boxes",d,...)
  mean(unlist(labels[,c("w","h")]))
}

### Calculate the points on the ashape.
ahull.points <- function(d,...,ahull=default.ahull(d)){
  xy <- unique(d[,c("x","y")])
  as <- alphahull::ashape(xy,alpha = ahull)
  as.data.frame(as$edges)
}

### Calculate the points on the convex hull.
chull.points <- function(d,...){
  bpts <- d[with(d,chull(x,y)),]
  r <- data.frame(i1=1:nrow(bpts),i2=c(2:nrow(bpts),1))
  r$x1 <- bpts$x[r$i1]
  r$y1 <- bpts$y[r$i1]
  r$x2 <- bpts$x[r$i2]
  r$y2 <- bpts$y[r$i2]
  r
}

check.for.columns <- function
### Stop if a data.frame does not have some columns.
(d,
### data.frame to check.
 must.have
### column names to check.
 ){
  stopifnot(is.character(must.have))
  for(N in must.have){
    if(! N %in% names(d)){
      stop("data must have a column named ",N)
    }
  }
}

apply.method <- function # Apply a Positioning Method
### Run a Positioning Method list on a given data set. This function
### contains all the logic for parsing a Positioning Method and
### sequentially applying its elements to the input data to obtain the
### label positions.
(method,
### Direct labeling Positioning Method. Starting from the data frame
### of points to plot for the panel, the elements of the Positioning
### Method list are applied in sequence, and then each row of the
### resulting data frame is used to draw a direct label. The
### elements of a Positioning Method list can be
### \itemize{
### \item a Positioning Function is any function(d,...) which takes a
### data.frame d with columns x,y,groups and returns another
### data.frame representing the positions of the desired direct
### labels. For a description of all the columns that are interpreted
### for drawing direct labels, see \code{\link{drawDetails.dlgrob}}.
### For example, maxvar.points is a Positioning Function that returns
### a data.frame with columns x,y,groups,hjust,vjust.
### \item a character vector of length 1 is treated as the name of an
### R object. For example, specifying "maxvar.points" means to look up
### the variable called maxvar.points and use that. Using the name of
### a Positioning Function is preferable to specifying the Positioning
### Function itself, since then the name is visible in the Positioning
### Method list, which is more interpretable when debugging.
### \item a named list element is used to add or update variables in
### the data.frame of direct labels to plot. For example
### list("first.points",cex=1.5) means take only the first points of
### every group and then set the cex column to 1.5.
### \item an element of a Positioning Method list can be another
### Positioning Method list, in which case the elements of the inner
### list are applied.
### }
 d,
### Data frame to which we apply the Positioning Method. The x and y
### columns should be in centimeters (cm), so that Positioning Methods
### can easily calculate the L2/Euclidean/visual distance between
### pairs of points.
 columns.to.check=c("x","y","groups","label"),
### After applying each Positioning Method list element, we check for
### the presence of these columns, and if not found we stop with an
### error.
 ...,
### Named arguments, passed to Positioning Functions.
 debug=FALSE
### If TRUE, print each Positioning Method list elmenent and the
### direct label data.frame that results from its evaluation.
 ){
  attr(d,"orig.data") <- d ##DONT DELETE: if the first Positioning
                           ##Method needs orig.data, this needs to be
                           ##here!
  check.for.columns(d,columns.to.check)
  if(!is.list(method))method <- list(method)
  isconst <- function(){
    m.var <- names(method)[1]
    !(is.null(m.var)||m.var=="")
  }
  islist <- function()is.list(method[[1]])
  isref <- function()(!isconst())&&is.character(method[[1]])
  while(length(method)){
    if(debug)print(method[1])##not [[1]] --- named items!
    ##browser()
    ## Resolve any names or nested lists
    while(islist()||isref()){
      if(islist()){
        method <- c(method[[1]],method[-1])
      }else{ #must be character -> get the fun(s)
        if(length(method[[1]])>1){
          warning("using first element of character vector")
          method[[1]] <- method[[1]][1]
        }
        method <- c(get(method[[1]]),method[-1])
      }
    }
    if(isconst())
      d[[names(method)[1]]] <- method[[1]]
    else{ #should be a Positioning Function
      old <- d
      group.dfs <- split(d,d$groups)
      group.specific <- lapply(group.dfs,only.unique.vals)
      to.restore <- Reduce(intersect,lapply(group.specific,names))
      d <- method[[1]](d,debug=debug,...)
      if(length(d)==0){#NULL or list()
        return(data.frame())
      }else{
        check.for.columns(d,columns.to.check)
        if("groups" %in% names(d)){
          ## do not restore if they are present in the returned list!
          to.restore <- to.restore[!to.restore %in% names(d)]
          for(N in to.restore){
            d[[N]] <- NA
            group.vec <- paste(unique(d$groups))
            for(g in group.vec){
              old.val <- group.specific[[g]][,N]
              if(is.factor(old.val))old.val <- paste(old.val)
              d[d$groups==g,N] <- old.val
            }
          }
        }
      }
      attr(d,"orig.data") <-
        if(is.null(attr(old,"orig.data")))old
        else attr(old,"orig.data")
    }
    if(debug){
      print(d)
    }
    method <- method[-1]
  }
  d
### The final data frame returned after applying all of the items in
### the Positioning Method list, with x and y in units of cm.
}

### Create a 1-row data.frame consisting of only the columns for which
### there is only 1 unique value.
only.unique.vals <- function(d,...){
  unique.vals <- lapply(d,unique)
  n.vals <- sapply(unique.vals,length)
  do.call(data.frame,unique.vals[n.vals==1])
}

### to hard-code label positions...
static.labels <- function(x,y,groups,...){
  L <- list(...)
  force(x)
  force(y)
  force(groups)
  function(d,...,axes2native){
    native <- axes2native(data.frame(x,y))
    L$x <- convertX(unit(native$x,"native"),"cm",valueOnly=TRUE)
    L$y <- convertY(unit(native$y,"native"),"cm",valueOnly=TRUE)
    L$groups <- groups
    do.call(data.frame,L)
  }
}

### Return the positions of the plot vertical limits in cm, for use as
### the limit argument to qp.labels.
ylimits <- function(...){
  convertY(unit(c(0,1),"npc"),"cm",valueOnly=TRUE)
}

### Return the positions of the plot horizontal limits in cm, for use
### as the limit argument to qp.labels.
xlimits <- function(...){
  convertX(unit(c(0,1),"npc"),"cm",valueOnly=TRUE)
}

reduce.cex <- structure(function
### If edges of the text are going out of the plotting
### region, then decrease cex until it fits. We call calc.boxes
### inside, so you should set cex before using this.
(sides
### string: lr (left and right) or tb (top and bottom).
){
  if(sides=="lr"){
    hi <- "right"
    lo <- "left"
    limits.fun <- xlimits
  }else{
    hi <- "top"
    lo <- "bottom"
    limits.fun <- ylimits
  }
  function(d,...){
    d <- calc.boxes(d)
    l <- limits.fun()
    positive.part <- function(x)ifelse(x>0,x,0)
    hi.pp <- positive.part(d[, hi]-l[2])
    lo.pp <- positive.part(l[1]-d[, lo])
    w <- d[, hi]-d[, lo]
    if(is.null(d$cex)){
      d$cex <- 1
    }
    d$cex <- (w-hi.pp)/w * (w-lo.pp)/w * d$cex
    calc.boxes(d)
  }
},ex=function(){

  if(require(lars) && require(ggplot2)){
    data(diabetes,package="lars",envir=environment())
    X <- diabetes$x
    colnames(X) <- paste(colnames(X), colnames(X))
    fit <- lars(X,diabetes$y,type="lasso")
    beta <- scale(coef(fit),FALSE,1/fit$normx)
    arclength <- rowSums(abs(beta))
    path.list <- list()
    for(variable in colnames(beta)){
      standardized.coef <- beta[, variable]
      path.list[[variable]] <-
        data.frame(step=seq_along(standardized.coef),
                   arclength,
                   variable,
                   standardized.coef)
    }
    path <- do.call(rbind, path.list)
    p <- ggplot(path,aes(arclength,standardized.coef,colour=variable))+
      geom_line(aes(group=variable))
    ## the legend isn't very helpful.
    print(p)
    ## add direct labels at the end of the lines.
    direct.label(p, "last.points")
    ## on my screen, some of the labels go off the end, so we can use
    ## this Positioning Method to reduce the text size until the labels
    ## are on the plot.
    direct.label(p, list("last.points",reduce.cex("lr")))
    ## the default direct labels for lineplots are similar.
    direct.label(p)
  }

})

### If edges of the text are going left or right out of the plotting
### region, then decrease cex until it fits.
reduce.cex.lr <- reduce.cex("lr")

### If edges of the text are going over the top or bottom of the
### plotting region, then decrease cex until it fits.
reduce.cex.tb <- reduce.cex("tb")

empty.grid <- function
### Label placement method for scatterplots that ensures labels are
### placed in different places. A grid is drawn over the whole
### plot. Each cluster is considered in sequence and assigned to the
### point on this grid which is closest to the point given by
### the input data points. Makes use of attr(d,"orig.data").
(d,
### Data frame of target points on the scatterplot for each label.
 debug=FALSE,
### Show debugging info on the plot?
 ...
### ignored.
 ){
  NREP <- 10
  orig <- attr(d,"orig.data")
  all.points <- orig[,c("x","y")]
  if(any(table(d$groups)>1))d <- get.means(d)
  label.targets <- d
  ranges <- list(x=convertX(unit(c(0,1),"npc"),"cm",valueOnly=TRUE),
                 y=convertY(unit(c(0,1),"npc"),"cm",valueOnly=TRUE))
  gl <- function(v){
    s <- seq(min(all.points[,v]),max(all.points[,v]),l=NREP)
    if(expand){
      dif <- s[2]-s[1]
      s <- seq(min(ranges[[v]])-expand*dif,
               max(ranges[[v]])+expand*dif,
               l=NREP+2*expand)
    }
    list(centers=s,diff=s[2]-s[1])
  }
  hgrid <- function(x,w){
    hboxes <- floor(diff(ranges[[x]])/r[,w])
    (-expand:(hboxes+expand-1))*r[,w]+r[,w]/2+min(ranges[[x]])
  }
  if(debug)with(label.targets,{
    grid.points(
      x,y,default.units="cm",gp=gpar(col="green"),
      name="directlabels.points.empty.grid.label.targets"
    )
  })
  draw <- function(g){
    gridlines <- with(g,list(x=unique(c(left,right)),y=unique(c(top,bottom))))
    drawlines <- function(a,b,c,d,name)
      grid.segments(a,b,c,d,"cm",name=name,gp=gpar(col="grey"))
    with(gridlines,drawlines(
      min(x),y,max(x),y,"directlabels.segments.empty.grid.vertical"))
    with(gridlines,drawlines(
      x,min(y),x,max(y),"directlabels.segments.empty.grid.horizontal"))
  }
  res <- data.frame()
  label.targets <-
    label.targets[order(nchar(as.character(label.targets$groups))),]
  for(v in label.targets$groups){
    r <- label.targets[label.targets$groups==v,]
    no.points <- data.frame()
    expand <- 0
    while(nrow(no.points)==0){
      boxes <- if("left"%in%names(label.targets)){
        list(x=hgrid("x","w"),y=hgrid("y","h"),w=r$w,h=r$h)
      }else{
        L <- sapply(c("x","y"),gl,simplify=FALSE)
        list(x=L$x$centers,y=L$y$centers,w=L$x$diff,h=L$y$diff)
      }
      boxes <- calc.borders(do.call(expand.grid,boxes))
      boxes <- cbind(boxes,data=inside(boxes,all.points))
      no.points <- transform(subset(boxes,data==0))
      expand <- expand+1 ## look further out if we can't find any labels inside
    }
    if(debug)draw(boxes)

    ## TDH 29 Aug 2012. For every box, figure out the class of the
    ## point which is its nearest neighbor.
    no.points$nearest <- NA
    for(i in 1:nrow(no.points)){
      b <- no.points[i,]
      d.orig <- with(orig,(b$x-x)^2+(b$y-y)^2)
      no.points[i,"nearest"] <- as.character(orig$groups[which.min(d.orig)])
    }
    ## Only consider boxes that are closest to this class.
    closest <- no.points[no.points$nearest == rownames(r),]
    if(nrow(closest) == 0){
      closest <- no.points
    }
    closest$len <- with(closest,(r$x-x)^2+(r$y-y)^2)
    best <- closest[closest$len == min(closest$len), ][1, ]

    res <- rbind(res,transform(r,x=best$x,y=best$y))
    ## add points to cloud
    newpts <- with(best,{
      expand.grid(x=seq(left,right,l=3),
                  y=seq(top,bottom,l=3))
    })
    all.points <- rbind(all.points,newpts)
  }
  if(debug)with(all.points,grid.points(
    x,y,default.units="cm",name="directlabels.points.empty.grid.all.points"))
  res
### Data frame with columns groups x y, 1 line for each group, giving
### the positions on the grid closest to each cluster.
}
tdhock/directlabels documentation built on April 21, 2024, 11:32 a.m.