R/plotluck.R

Defines functions sample.plotluck.testopts sample.plotluck print.plotluck_multi plotluck.multi mplot add.conditional.layer redundant.factor.color add.facet.grid add.facet.wrap format.facets determine.plot.type.2 determine.plot.type.1 determine.plot.type.0 plotluck info.threshold info.options process.weights correct.varnames parse.formula parse.formula.term sample.data plotluck.options gplt.blank gplt.spine3 gplt.spine norm.sum gplt.histogram gplt.density gplt.box gplt.violin gplt.dot gplt.heat gplt.hex gplt.scatter geom_vline_center geom_point_center add.axis.transform add.smooth.line add.color.legend add.color.fill get.palette estimate.label.overlap is.grid.like cond.entropy.data quantize cond.entropy entropy marginalize preprocess.factors discretize nclass.FD.modified discretize.few.unique limit.factor.levels order.factors order.factor.by.value order.factor.by.freq replace.by.central group.central weighted.mode Mode get.trans.fun log_mod_trans log_mod_breaks decode.int.power encode.int.power safe_log

Documented in plotluck plotluck.options sample.plotluck

#' @import ggplot2
#' @importFrom plyr ddply summarise .
#' @importFrom stats as.formula median quantile runif xtabs
#' @importFrom grDevices colorRampPalette hcl
#' @importFrom stats na.omit na.pass
#'
safe_log <- function(x,...) {
  ifelse(x<=0,0,log(x,...))
}

# calculate the floor or ceiling of the exponent of a number
# note: for log-modulus (offset=1), sign of returned value signifies sign of
#   input value x; therefore, different exponents cannot be represented for |x| < 1.
# note: for log-modulus, in order to distinguish between -10^0, 0, and 10^0,
#   we encode -10^x=-x, -10=-1, -1=0, 0=1, 1=2, 10^x=x+2, ..

encode.int.power <- function(x, base=10, floor=TRUE, offset=0) {

  # if floor=FALSE, compute ceiling
  # -1 * floor(-1*x) = ceiling(x)
  floor.factor <- ifelse(floor, 1, -1)

  if (offset == 0) {
    # regular log transform
    y <- floor.factor * floor(floor.factor * safe_log(x, base))
  } else {
    # log-modulus transform
    # no fractional values!
    x <- floor.factor * floor(floor.factor * x)

    y <- floor.factor * floor(floor.factor * sign(x) * safe_log(abs(x), base))

    # encoding as described in note above
    y <- y + ifelse(x >= 1, 2, ifelse(x >= 0, 1, 0))
  }
  y
}


decode.int.power <- function(x, base=10, offset=0) {
  if (offset == 0) {
    # regular log transform
    base^x
  } else {
    # log-modulus transform
    ifelse(x <= 0, - base^abs(x),
           ifelse(x == 1, 0,
                  base^abs(x-2)))
  }
}


# calculate breaks for log/log-modulus scale
log_mod_breaks <- function(n=5, base=10, offset=0)
{
  function(x) {

    rng <- range(x, na.rm=TRUE)

    if (any(is.infinite(rng))) {
      # Note: known issue with log scaling
      # https://github.com/hadley/ggplot2/issues/930
      # seems to be happening mostly with scatter plot, density/hex less affected
      warning('Skipping tick mark computation due to existing ggplot issue.
                 Try using hex or density instead of scatter plot or histogram.')
      return(1)
    }

    min.pos <- encode.int.power(rng[1], base=base, floor=TRUE,  offset=offset)
    max.pos <- encode.int.power(rng[2], base=base, floor=FALSE, offset=offset)

    # the range of powers that the values span
    total  <- abs(max.pos - min.pos)

    if (total + 1 <= n/2) {
      # less than half of requested ticks:
      # introduce subdivisions by lowering base
      sub <- floor(n/(total + 1))
      base <- base^(1/sub)
      min.pos <- encode.int.power(rng[1], base=base, floor=TRUE,  offset=offset)
      max.pos <- encode.int.power(rng[2], base=base, floor=FALSE, offset=offset)
      by <- 1
    } else {
      # enough or more dynamic range than breaks requested
      by <- max(1, floor(total/(n-1)))
    }

    sapply(seq(min.pos, max.pos, by),
           function(x) decode.int.power(x,base=base, offset=offset))
  }
}


# log (offset=0) or log-modulus (offset=1) scale
# John & Draper 1980; see e.g. http://blogs.sas.com/content/iml/2014/07/14/log-transformation-of-pos-neg/
log_mod_trans <- function(base = exp(1), offset=0)
{
  eps<-1e-100
  if (offset == 0) {
    # regular log transform
    trans <- function(x) log(x, base)
    inv   <- function(x) base^x
    domain = c(eps, Inf)
  } else {
    # log-modulus transform
    trans <- function(x) sign(x) * log(abs(x) + offset, base)
    inv   <- function(x) sign(x) * (base^abs(x) - offset)
    domain = c(-Inf, Inf)
  }
  scales::trans_new(paste0("log-mod-", format(base)),
                    trans,
                    inv,
                    log_mod_breaks(base=base, offset=offset),
                    domain=domain)
}


# calculate axis scale transform based on value distribution
# either:
# - identity
# - log-10
# - log-10-modulus: sign(x) * log(1+|x|)
# John & Draper 1980; see http://blogs.sas.com/content/iml/2014/07/14/log-transformation-of-pos-neg/

get.trans.fun <- function(data, x, expansion.thresh=2, label=x, verbose=FALSE) {

  if (!is.numeric(data[[x]]) ||
      length(unique(data[[x]])) <= 3) {
    return(list(scales::identity_trans(), label))
  }

  q <- quantile(data[[x]], c(0, 0.25, 0.5, 0.75, 1), na.rm=TRUE)

  # heuristic:
  # - compute the ratio of the plotting range occupied by the
  #   'core' part of the distribution between the lower and upper
  #   quartile
  # - if range is positive only, candidate transform is log
  # - if range includes negative values, candidate transform is log-modulus
  # - compute this ratio for the case that transform is applied
  # - decide for transform is this ratio is greater than expansion.thresh

  range.core     <- c(q[4], q[2])
  range.complete <- c(q[5], q[1])
  ratio.before   <- diff(range.core) / diff(range.complete)

  sgn <- ifelse(q[5] <= 0,
                -1,
                ifelse(q[1] > 0,
                       1,
                       0))

  # offset == 1 is interpreted as using log-modulus, instead of log
  if (sgn == 1) {
    offset <- 0
  } else {
    offset <- 1
  }

  fun.trans <- log_mod_trans(base=10, offset=offset)

  if (ratio.before == 0) {
     ratio.after <- 1
  } else {
     range.core.after     <- fun.trans$trans(range.core)
     range.complete.after <- fun.trans$trans(range.complete)
     ratio.after          <- diff(range.core.after) / diff(range.complete.after)
  }
  if (verbose) {
    if (ratio.after <= expansion.thresh * ratio.before) {
      cat(sprintf('Not applying logarithmic axis scaling for %s; expansion ratio is %f, trans.log.thresh = %f\n',
                  x, ratio.after/ratio.before, expansion.thresh))
    } else {
      cat(sprintf('Applying logarithmic axis scaling for %s; expansion ratio is %f, trans.log.thresh = %f\n',
                  x, ratio.after/ratio.before, expansion.thresh))
    }
  }
  if (ratio.after > expansion.thresh * ratio.before) {
    list(fun.trans,  sprintf('%s (log scale)', label))
  } else {
    list(scales::identity_trans(), label)
  }
}


Mode <- function(x) {
  ux <- unique(x)
  # if not returning a data frame, ddply will create an integer result column
  data.frame(ux[which.max(tabulate(match(x, ux)))])
}

weighted.mode <- function(data, x, w) {
  wtd <- plyr::ddply(data, x, function(d) sum(d[[w]], na.rm=TRUE))
  idx <- which.max(wtd[[2]])
  # need to rename to make later merge possible
  names(wtd)[1] <- '.wtd.mode.'
  wtd[idx, 1, drop=FALSE]
}

# calculate measure of central tendency per group.
# for numeric data[[x]], apply (weighted or unweighted) method as specified.
# for ordered factors, compute the (weighted or unweighted) mean or median over
#    the sorted integer levels, then round at the end.
# for unordered factors, always compute the mode (most frequent value)
# ordered.as.num - treat ordered factors as integers, and return a numeric average
# (in the case that the centers are used for sorting only, this prevents the rounding loss)
group.central <- function(data, x, group.vars, w='NULL', method=c('median', 'mean'),
                          col.name='.center.', ordered.as.num=FALSE) {

  method <- match.arg(method)

  is.num <- is.numeric(data[[x]])
  is.ord <- (!is.num) && is.ordered(data[[x]])

  lev <- NULL
  grp.center <- NULL

  if (is.ord) {
    # treat ordinals as integer
    lev <- levels(data[[x]])
    data[[x]] <- as.integer(data[[x]])
  }

  if (w == 'NULL' || length(unique(data[[w]])) == 1) {
    if (is.num || is.ord) {
      fun <- ifelse(method=='median', median, mean)
      grp.center <- plyr::ddply(data, group.vars, function(d) fun(d[[x]], na.rm=TRUE))
    } else {
      # known issue: for ordering, it would be better to take also the frequency of the mode
      # into account
      grp.center <- plyr::ddply(data, group.vars, function(d) Mode(d[[x]]))
    }
  } else {
    if (is.num || is.ord) {
      if (method == 'median') {
        # workaround for problems in wtd.quantile:
        # - if all values are NA, wtd.mean throws error: 'zero non-NA points'
        # - if weights are integer, sum can lead to NA due to overflow
        f <- function(d) if (all(is.na(d[[x]]))) { NA } else { Hmisc::wtd.quantile(d[[x]],
                                                                                   weights=as.numeric(d[[w]]), probs=0.5,
                                                                                   type='i/(n+1)', na.rm=TRUE, normwt=TRUE) }
        grp.center <- plyr::ddply(data, group.vars, f)
      } else {
        grp.center <- plyr::ddply(data, group.vars, function(d) Hmisc::wtd.mean(d[[x]], d[[w]], na.rm=TRUE))
      }
    } else { # !(is.num || is.ord)
      grp.center <- plyr::ddply(data, group.vars, function(d) weighted.mode(d, x, w))
    }
  }

  # for ordinals, reattach levels, but keep order
  if (is.ord && (!ordered.as.num)) {
    grp.center[[length(grp.center)]] <- ordered(lev[round(grp.center[[length(grp.center)]])], levels=lev, exclude=NULL)
  }

  names(grp.center)[length(grp.center)] <- col.name
  grp.center
}

# make all values of data[[x]] identically equal to the group mean
replace.by.central <- function(data, x, g, w) {
  x.avg <- group.central(data, x, g, w, method='mean', col.name='.replace.center.')
  data <- merge(data, x.avg)
  data[[x]] <- data[[length(data)]]
  data[[length(data)]] <- NULL
  data
}

# return data frame with the levels of unordered factor x resorted by (weighted) frequency
order.factor.by.freq <- function(data, x, w='NULL', decreasing=FALSE, verbose=FALSE) {
  x.data <- data[[x]]
  if (is.numeric(x.data) || is.ordered(x.data)) {
    # do not change ordered factors
    return(data)
  }

  if (w=='NULL') {
    if (verbose) {
      cat(sprintf('Ordering %s levels by number of lines\n', x))
    }
    tab <- table(x.data, useNA='ifany')
  } else {
    if (verbose) {
      cat(sprintf('Ordering %s levels by sum of %s\n', x, w))
    }
    tab <- xtabs(as.formula(sprintf('%s ~ %s', w, x)), data, exclude=NULL, na.action=na.pass)
  }
  ord <- order(tab, decreasing=decreasing)
  data[[x]] <- factor(x.data, levels=levels(x.data)[ord],
                      exclude=NULL)
  return(data)
}


# return data frame with the levels of unordered factor x resorted by (weighted) central tendency of dependent variable y
order.factor.by.value <- function(data, x, y, w='NULL', decreasing=FALSE, verbose=FALSE) {

  x.data <- data[[x]]
  if (is.numeric(x.data) || is.ordered(x.data) || length(unique(data[[x]])) == 1) {
    # do not change ordered factors
    return(data)
  }

  if (is.null(y) || y == 'NULL' || y == x) {
    return(order.factor.by.freq(data, x, w, decreasing))
  }

  y.data <- data[[y]]

  if (verbose) {
    cat(sprintf('Ordering %s levels by %s\n', x, y))
  }
  # we are displaying median lines for distributions, so the sorting should agree with this
  centers <- group.central(data, y, x, w, method='median', ordered.as.num=TRUE)

  # if medians are equal for two or more classes, resolve ties by using the mean
  if (length(unique(centers$.center.)) != nrow(centers)) {
    centers <- group.central(data, y, x, w, method='mean', ordered.as.num=TRUE)
  }
  if (!is.numeric(centers$.center.)) {
    centers$.center. <- as.integer(centers$.center.)
  }
  ord <- order(centers$.center., decreasing=decreasing)

  data[[x]] <- factor(x.data,
                      levels= centers[ord, x],
                      exclude=NULL)
  data
}

# reorder unordered factor levels of x,y,z to highlight dependencies

order.factors <- function(data, x, y, z='NULL', w='NULL', verbose=FALSE) {

  # if at least one factor is ordered, sort the other factors accordingly
  # if none is ordered, sort according to z (if it exists), otherwise y

  if (z == 'NULL') {
    v.all <- c(y,x)
  } else {
    v.all <- c(z,y,x)
  }
  v.sort <- NULL
  for (v in v.all) {
    if (is.numeric(data[[v]]) || is.ordered(data[[v]])) {
      v.sort <- v
      break
    }
  }
  if (is.null(v.sort)) {
    v.sort <- v.all[1]
    data <- order.factor.by.freq(data, v.sort, w)
  }

  for (v in setdiff(v.all, v.sort)) {
    if ((!is.numeric(data[[v]])) && (!is.ordered(data[[v]]))) {
      # v is sortable
      data <- order.factor.by.value(data, v, v.sort, w, verbose=verbose)
    }
  }
  data
}

# limit the number of levels of a factor so as to not overcrowd the display
# - for unordered factors, keep the highest ones, group lower levels into '.other.'
# - for ordered factors, form about equidistant subgroups, in order
# - If there a slightly more levels than desired (according to tol), still don't
#   keep it, as quantizing makes the factor less intelligible
# - by.frequency keep the most frequent levels (according to count or weight)
#
limit.factor.levels <- function(data, x, w='NULL',
                                max.levels=30,
                                tol=max.levels*1.5,
                                by.frequency=TRUE,
                                reverse=FALSE) {

  x.data <- data[[x]]

  if (is.numeric(x.data)) {
    return(data)
  }

  # note: there can be unused levels!
  u <- length(levels(x.data))

  if (u <= tol) {
    return(data)
  }
  message(sprintf('Factor variable %s has too many levels (%d), truncating to %d', x, u, max.levels))

  if (!is.ordered(x.data)) {
    ord <- seq(length(levels(x.data)))
    if(by.frequency) {
      # keep the most frequent levels in the same order, combine the rest into '.other.',
      data.tmp <- data
      data.tmp[[x]] <- as.integer(data.tmp[[x]])
      tab <- NULL
      if (w=='NULL') {
        tab <- table(data.tmp[[x]], useNA='ifany')
      } else {
        tab <- xtabs(as.formula(sprintf('%s ~ %s', w, x)), data.tmp,  exclude=NULL, na.action=na.pass)
      }
      # make sure that ties are broken according to previous order!
      tab <- tab[order(tab, as.numeric(names(tab)), decreasing=FALSE)]
    }
    levels.ord <- levels(x.data)[as.integer(names(tab))]
    ll <- length(levels.ord)
    if (!reverse) {
      levels.reduced <- levels.ord[seq(max(1,ll-max.levels+1),ll)]
    } else {
      levels.reduced <- levels.ord[seq(min(max.levels,ll))]
    }
    # note: we want to maintain the original order of levels!
    idx.trunc <- !(x.data %in% levels.reduced)
    x.data <- factor(x.data, levels=c('.other.', levels(x.data)),exclude=NULL)
    x.data[idx.trunc] <- '.other.'
    data[[x]] <- factor(x.data, exclude=NULL)

  } else {  # is.ordered(data[[x]])

    #  Note: The weighted case is very hard to get right for very unequal weights.
    #  Ignoring for now.

    breaks <- 1 + (seq(0,max.levels)/max.levels) * (length(levels(x.data))-1)

    # dig.lab: in cut(), avoids scientific formatting for integers
    dig.lab=50
    x.data <- cut(as.integer(x.data), breaks=breaks,
                  right=FALSE, include.lowest=TRUE, ordered.result=TRUE, dig.lab=dig.lab)

    # recreate level information by parsing those generated by 'cut'
    # make lists of elements, e.g.:
    #  - original levels 'a', 'b', 'c', 'd'
    #  - binned level '[2,4)' ->  'c,d'
    x.data <- ordered(x.data)

    reconstruct.levels <- function(interval, levels) {
      left.bracket <- substr(interval,1,1)
      right.bracket <- substr(interval,nchar(interval),nchar(interval))
      eps <- 0
      if (left.bracket == '(') {
        eps <- 1e-10
      }
      if (right.bracket == ')') {
        eps <- -1e-10
      }
      left.idx  <- ceiling(eps + as.numeric(gsub('[(\\[]([0-9.]+),.*', '\\1', interval)))
      right.idx <- floor(eps + as.numeric(gsub('.*,([0-9.]+).', '\\1', interval)))

      # catch if level doesn't fit interval pattern - this shouldn't happen!
      if (is.na(left.idx) || is.na(right.idx)) {
        return(interval);
      }
      return(paste(levels[left.idx:right.idx], sep='', collapse=','))
    }

    levels(x.data) <- sapply(levels(x.data), reconstruct.levels, levels=levels(data[[x]]))
    x.data <- x.data[,drop=T]
    data[[x]] <- x.data
  }
  data
}

# if one numerical axis has very few values, it sometimes makes sense to treat them as a factor
discretize.few.unique <- function(data, x, few.unique.as.factor=5, verbose=FALSE) {
  if (!is.numeric(data[[x]])) {
    return(data)
  }
  non.na <- !is.na(data[[x]])
  # works for both data frames and tibbles
  u <- nrow(unique(data[non.na,x,drop=FALSE]))

  if (u <= few.unique.as.factor &&
      nrow(data) > u * u) { # heuristic similar to histogram binning
    data[[x]] <- ordered(data[[x]], exclude=NULL)
    info.threshold(verbose, '%s has only %d levels, treating as factor', few.unique.as.factor, x, u)
  }
  data
}


# determine number of histogram bins
# note: this is a modification the 'nclass.FD' function, which sometimes returns '1'
# for very uneven distributions.

nclass.FD.modified <- function(x, max.bins=200) {
  u <- unique(x)
  if (length(u) == 1) {
    # only one value
    return(1L)
  }
  # minimum reasonable bin size
  h <- 0.5 * min(abs(diff(u)), na.rm=TRUE)

  # for very few unique values, show all of them individually
  if (length(u) > 10) {
    h2 <- 2 * stats::IQR(x, na.rm=TRUE)
    if (h2 == 0)
      h2 <- 2 * stats::mad(x, constant=2, na.rm=TRUE)
    if (h2 > 0)
      h <- max(h, h2 * length(x)^(-1/3))
  }

  # capping the number of bins, since for some features the estimated bins are very high causing the hist() function to throw an error
  num.bins <- ceiling(diff(range(u, na.rm=TRUE))/h)
  num.bins <- min(max.bins,
                  max(min(5, length(u), max.bins, length(x)/log2(length(x))),
                      num.bins))
  num.bins
}


# break a numeric variable into intervals
# - method=quantile - bins with equal number of samples
# - method=histogram - equidistant intervals
# - max.breaks - requested number of breaks
# - tol - if there are less than that many distinct values to
#     begin with, don't do additional binning
# - estimate.breaks=FALSE - use exactly max.breaks many breaks

discretize <- function(data, x,
                       w='NULL',
                       max.breaks,
                       tol=max.breaks*1.5,
                       estimate.breaks=TRUE,
                       method=c('quantile', 'histogram')) {
  x.data <- data[[x]]

  if (!is.numeric(x.data)) {
    return(data)
  }

  u <- length(unique(x.data))

  if (missing(max.breaks)) {
    max.breaks = ceiling(min(u-1, sqrt(nrow(data))))
  }
  max.breaks = floor(min(max.breaks, u-1, sqrt(nrow(data))))
  max.breaks <- min(max.breaks, nrow(data))


  if (u - 1 <= max(tol, max.breaks)) {
    return(data)
  }

  method <- match.arg(method)

  breaks <- NULL
  # compute number of breaks
  if (method == 'quantile') {
    probs <- seq(0, max.breaks) / max.breaks
    if (w=='NULL') {
      breaks <- quantile(x.data, probs=probs, na.rm=TRUE)
    } else {
      breaks <- Hmisc::wtd.quantile(x.data, weights=data[[w]],
                                    probs=probs, na.rm=TRUE, normwt=TRUE,
                                    type='i/(n+1)')
    }
    data[[x]] <- ordered(cut(x.data, breaks=unique(breaks), include.lowest=TRUE),
                         exclude=NULL)

  } else { # method == 'histogram'
    n.breaks <- max.breaks
    if (estimate.breaks) {
      n.breaks <- nclass.FD.modified(x.data, max.breaks)
    }
    breaks <- graphics::hist(x.data, breaks=n.breaks, plot=FALSE)$breaks
    # leave type numeric
    step<-breaks[2] - breaks[1]
    data[[x]] <- round((x.data-breaks[1])/step)*step+breaks[1]
  }

  data
}

# - convert character vectors into factors
# - with na.rm=TRUE, remove rows containing NA
# - from here on, don't exclude any levels from factors any more
# - check for all-NA and constant values
# - convert character vectors into factors
# - convert logicals into ordered factors

preprocess.factors <- function(data, vars, na.rm=FALSE) {

  for (x in vars) {

    x.data <- data[[x]]
    non.na <- !is.na(x.data)
    if (length(which(non.na)) == 0) {
      stop(sprintf('Variable %s has only missing values, giving up', x))
    }

    u <- length(unique(x.data[non.na]))
    if (u == 1) {
      stop(sprintf('Variable %s has only single level ("%s"), giving up', x, unique(x.data[non.na])))
    }

    if (u == 2) {
      # binary variables are trivially ordered
      data[[x]] <- ordered(x.data)
    } else if (!is.numeric(x.data)) {

      if (!is.factor(x.data)) {
        # i.e., character or logical
        x.data <- factor(x.data, exclude=NULL)
      }

      # suppress unused levels
      # for NA treatment, see http://r.789695.n4.nabble.com/droplevels-inappropriate-change-td4723942.html
      # droplevels(x.data, exclude=exclude.factor) does not seem to be working correctly
      data[[x]] <- x.data[,drop=TRUE]
    }
  }

  if (na.rm) {
    factor.with.na.level <- any(sapply(seq(length(data)),
                                       function(x) {is.factor(data[[x]]) && any(is.na(levels(data[[x]])))}))
    if (factor.with.na.level) {
      message('Option "na.rm=TRUE", but data contains factor(s) with an NA level. Keeping these ...')
    }
    data <- na.omit(data)

    if (nrow(data) == 0) {
      stop(sprintf('Variable(s) have only missing values, giving up'))
    }

  }

  # if na.rm==TRUE, all NA's have been removed. In either case, from now on all levels should be
  # included

  for (i in seq(length(data))) {
    if (is.factor(data[[i]])) {
      data[[i]] <- factor(data[[i]], exclude=NULL, ordered=is.ordered(data[[i]]))
      # note: rename <NA> - else ggplot doesn't show a color legend
      idx<-which(is.na(levels(data[[i]])))
      if (length(idx) > 0) {
        levels(data[[i]])[idx] <- '?'
      }
    }
  }

  data
}


marginalize <- function(data, var.list, w='NULL') {
  rhs <- paste(var.list, sep='', collapse='+')
  prop.table(xtabs(as.formula(sprintf('%s ~ %s', ifelse(w=='NULL', c(''), w), rhs)), data, exclude=NULL, na.action=na.pass))
}

# entropy: H(vars) = - \sum p(vars)log2(p(vars))
# assumption: vars are factors
entropy <- function(data, vars, w) {
  tab  <- marginalize(data, vars, w)
  log.tab <- log2(tab)
  log.tab[tab==0] <- 0 # guard for -Inf
  -sum(tab * log.tab)
}


# conditional entropy: H(y|x) = H(x,y) - H(x)
# assumption: x, y are factors
cond.entropy <- function(data, x, y, w) {
  if (x == y) {
    0
  } else {
    hxy <- entropy(data, c(x, y), w)
    hx <- entropy(data, x, w)
    hxy - hx
  }
}

quantize <- function(data, v, target='NULL', w, n.levels, estimate.breaks=FALSE, method='histogram') {
  if (is.numeric(data[[v]])) {
    data <- discretize(data, v, w=w,
                       max.breaks=n.levels, tol=n.levels, estimate.breaks, method=method)
  } else {
    if (target != 'NULL') {
      data <- order.factor.by.value(data, v, target, w=w)
    }
    data <- limit.factor.levels(data, v, w,
                                max.levels=n.levels,
                                tol=n.levels)
  }
  data
}


# return a list of conditional entropies H(target|x), for each column in the data frame.
# for comparability, all variables are quantized into the same number of bins
cond.entropy.data <- function(data, target, given, w='NULL') {
  n.row <- nrow(data)
  n.levels <- floor(max(2, min(log2(n.row), n.row/10))) # heuristic

  cond.entropy.quantized <- function(data, x, y, w, n.levels) {
    if (x == y) {
      0
    } else {
      x.data <- quantize(data, x, y, w, n.levels)
      # note: we need to make a copy for the special case of x=w
      names(x.data)[names(x.data)==x]<-'.x'
      if (w != 'NULL') {
        x.data[[w]] <- data[[w]]
      }
      cond.entropy(x.data, '.x', y, w)
    }
  }

  entropies <- matrix(nrow=length(data), ncol=length(data))
  colnames(entropies) <- names(data)
  rownames(entropies) <-names(data)

  if (!missing(target)) {
    # single row/column
    data <- quantize(data, target, w=w, n.levels=n.levels)
    entropies[target,] <- sapply(names(data), function(x)
      cond.entropy.quantized(data, x, target, w, n.levels))
  } else {
    for (target in names(data)) {
      data <- quantize(data, target, w=w, n.levels=n.levels)
      if (!missing(given)) {
        entropies[target, given] <- cond.entropy.quantized(data, x=given, y=target, w, n.levels)
      } else {
        # all vs all
        entropies[target,] <- sapply(names(data), function(x)
          cond.entropy.quantized(data, x, target, w, n.levels))
      }

    }
  }
  entropies
}


# heuristic to decide whether to plot a heat map:
# - axes x, y must be either numeric, or ordered factors
# - axes must have at least min.size distinct values
# - at least a fraction min.coverage of the grid points must have data
is.grid.like <- function(data, x, y, z, min.size=3, min.coverage=0.5) {
  if (!is.numeric(data[[z]]) ||
      (!(is.numeric(data[[x]]) || is.ordered(data[[x]]))) ||
      (!(is.numeric(data[[y]]) || is.ordered(data[[y]])))) {
    return(FALSE)
  }
  ux <- length(unique(data[[x]]))
  if (ux < min.size) {
    return(FALSE)
  }
  uy <- length(unique(data[[y]]))
  if (uy < min.size) {
    return(FALSE)
  }
  u <- nrow(unique(data[,c(x, y)]))
  u >= min.coverage * ux * uy
}

# some common themes
theme_panel_num_x <-
  theme(panel.grid.major.x=element_line(color='black', linetype='dotted'),
        panel.grid.minor.x=element_line(color='black',linetype='dotted'))
theme_panel_num_y <-
  theme(panel.grid.major.y=element_line(color='black', linetype='dotted'),
        panel.grid.minor.y=element_line(color='black',linetype='dotted'))
# points on the grid line are hard to see; remove grid line for factors
theme_panel_fac_x <- theme(panel.grid.major.x=element_blank(),
                           panel.grid.minor.x=element_blank())
theme_panel_fac_y <- theme(panel.grid.major.y=element_blank(),
                           panel.grid.minor.y=element_blank())
# for factors, write horizontal axis labels at an angle to avoid overlap
theme_slanted_text_x <- theme(axis.text.x=element_text(angle=-45, hjust=0, vjust=1))


# detect if x axis text might overlap
estimate.label.overlap <- function(labels, breaks=seq(length(labels))) {
   if (length(breaks) < 2) {
      return(FALSE)
   }
   lb <- length(breaks)

   # account for some empty space at the boundaries
   mrg.left <- (breaks[2]-breaks[1])/2
   mrg.right <- (breaks[lb]-breaks[lb-1])/2
   rg <- breaks[lb] - breaks[1] + mrg.left + mrg.right

   for (i in seq(lb)) {
      space.left  <- ifelse(i==1,  2*mrg.left,  (breaks[i]-breaks[i-1]))/rg
      space.right <- ifelse(i==lb, 2*mrg.right, (breaks[i+1]-breaks[i]))/rg
      label.width <- as.numeric(grid::convertX(unit(1, 'strwidth', labels[i]),'npc'))
      if (label.width > min(space.left, space.right)) {
         return(TRUE)
      }
   }
   return(FALSE)
}


# add color and/or fill scale layers to plot
# - use qualitative (sequential) brewer scale for (un)ordered factors, if
#   palette size supports it.
#   Otherwise, extend it using colorRampPalette.
# - if too few colors, use qualitative (first few colors of brewer palette might not look good by themselves!)

get.palette <- function(x, palette.brewer.seq='YlGn', palette.brewer.qual='Set1',
                        adjust.size=FALSE) {

   is.num <- is.numeric(x)
   is.ord <- is.ordered(x)
   if (is.num) {
      u <- length(unique(x))
   } else {
      # note: there can be unused levels
      u <- length(levels(x))
   }
   if (u <= 2) {
      # treat as qualitative
      is.ord <- FALSE
      is.num <- FALSE
   }
   name <- ifelse(is.num || is.ord, palette.brewer.seq, palette.brewer.qual)

   sz <- RColorBrewer::brewer.pal.info[name, 'maxcolors']
   if (is.na(sz)) {
      stop('invalid palette name: ', name)
   }

   pal <- RColorBrewer::brewer.pal(sz, name)

   if (adjust.size) {
      if ((is.num || is.ord) && u < sz) {
         # note: the first few colors of the brewer palettes are very light, sometimes
         # hard to see. If less values are needed than the palette size, we want to
         # rather drop the lightest ones instead of the darkest ones.
         pal <- pal[seq(sz-u+1, sz)]
      }
      if ((!is.num) && u > sz) {
         # not enough colors, interpolate
         # note: maybe do something different for qualitative scale?
         pal <- colorRampPalette(pal)(u)
      }
      names(pal) <- levels(x)
   }
   pal
}

add.color.fill <- function(p, data, x, aesth=c('color', 'fill'),
                           palette.brewer.seq='YlGn',
                           palette.brewer.qual='Set1') {

   na.value <- 'darkgray'
   if ('color' %in% aesth) {
      p <- p + aes_string(color=x)
   }
   if ('fill' %in% aesth) {
      p <- p + aes_string(fill=x)
   }

   pal <- get.palette(data[[x]], palette.brewer.seq, palette.brewer.qual, TRUE)
   lp <- length(pal)

   if (is.numeric(data[[x]])) {
      if ('color' %in% aesth) {
         p <- p + scale_color_gradientn(colors=pal[c(1,lp)])
      }
      if ('fill' %in% aesth) {
         p <- p + scale_fill_gradientn(colors=pal[c(1,lp)])
      }
   } else { # !is.num

      if ('color' %in% aesth) {
         p <- p + scale_color_manual(values=pal, na.value=na.value)
      }
      if ('fill' %in% aesth) {
         p <- p + scale_fill_manual(values=pal, na.value=na.value)
      }
   }
   p
}

# place legend either at the bottom or on the right, wherever it occupies less space
# pack as tightly as possible
add.color.legend <- function(p, data, x, aesth=c('color','fill')) {

  aesth=match.arg(aesth)
  p <- p + theme(legend.key.width=unit(0.5,'lines'))
  if (is.numeric(data[[x]])) {
    # color bars always on the right margin, to avoid number label overwriting
    if (aesth == 'color') {
      p <- p + guides(color=guide_colorbar(direction='vertical'), fill='none')
    } else {
      p <- p + guides(fill=guide_colorbar(direction='vertical'), color='none')
    }
    p <-  p + theme(legend.position='right', legend.background=element_blank())
    return(p)
  }

  # rough estimate of key symbol plus spacing; in the default theme.gray(), net key size is 1.2 lines
  size.key <- as.numeric(grid::convertX(unit(1.6, 'lines'), 'npc'))
  title.length <- as.numeric(grid::convertX(unit(1, 'strwidth', x), 'npc'))
  legend.margin <- as.numeric(unit(0.2, 'cm'))

  # vertical placement
  ll <- length(levels(data[[x]]))
  npc.length <- sapply(levels(data[[x]]), function(x) grid::convertX(unit(1, 'strwidth', x), 'npc'))
  npc.length <- sort(npc.length, decreasing=TRUE)
  ncol.vert <- ceiling(ll*size.key)
  nrow.vert <- ceiling(ll/ncol.vert)
  npc.width.vert <- legend.margin + max(title.length, ncol.vert * (size.key + npc.length[1]))

  # horizontal placement
  # try out how many legend columns we can fit underneath the graph
  ncol.horz <- 0
  npc.horz <- title.length
  while (ncol.horz < ll && (npc.horz + npc.length[ncol.horz+1] < 1)) {
    ncol.horz <- ncol.horz + 1
    npc.horz <- npc.horz + npc.length[ncol.horz] + size.key
  }
  nrow.horz <- ceiling(ll/ncol.horz)
  npc.height.horz <- legend.margin + nrow.horz * size.key
  aspect.ratio <- as.numeric(grid::convertX(unit(1, 'npc'),'cm')) /
    as.numeric(grid::convertY(unit(1, 'npc'),'cm'))
  if (npc.width.vert < aspect.ratio * npc.height.horz) {
    # vertical - by column, starting with first level at the bottom
    if (aesth == 'color') {
      p <- p + guides(color=guide_legend(direction='vertical', byrow=FALSE,
                                         reverse=TRUE, ncol=ncol.vert), fill='none')
    } else {
      p <- p + guides(fill=guide_legend(direction='vertical', byrow=FALSE,
                                        reverse=TRUE, ncol=ncol.vert), color='none')
    }
    p + theme(legend.position='right', legend.background=element_blank())
  } else {
    # horizontal - by row
    if (aesth == 'color') {
      p <- p + guides(color=guide_legend(direction='horizontal', byrow=TRUE,
                                         reverse=FALSE, nrow=nrow.horz), fill='none')
    } else {
      p <- p + guides(fill=guide_legend(direction='horizontal', byrow=TRUE,
                                        reverse=FALSE, nrow=nrow.horz), color='none')
    }
    p + theme(legend.position='bottom', legend.background=element_blank())
  }
}

add.smooth.line <- function(p, w='NULL', fill.smooth='NULL') {

  if (w == 'NULL') {
    # note: for more than 1000 points, ggplot2 uses mgcv package,
    # leads to weird dependency issue
    if (fill.smooth == 'NULL') {
      p + geom_smooth(alpha=0.2, na.rm=TRUE)
    } else {
      p + geom_smooth(alpha=0.2, na.rm=TRUE, color=fill.smooth, fill=fill.smooth)
    }
  } else {
    if (fill.smooth == 'NULL') {
      p + geom_smooth(aes_string(weight=w), alpha=0.2, na.rm=TRUE)
    } else {
      p + geom_smooth(aes_string(weight=w), alpha=0.2, na.rm=TRUE, color=fill.smooth, fill=fill.smooth)
    }
  }
}

add.axis.transform <- function(p, data, x, ax=c('x','y'), trans.log.thresh=2, verbose=FALSE) {
  if (!is.numeric(data[[x]])) {
    p
  } else {
    ax <- match.arg(ax)
    trans <- get.trans.fun(data, x, trans.log.thresh, verbose=verbose)
    if (ax == 'x') {
      # note: "scale_x_continuous(trans=trans.x[[1]], name=trans.x[[2]])" doesn't allow
      # to overwrite the label later
      p + scale_x_continuous(trans=trans[[1]]) + xlab(trans[[2]])
    } else {
      p + scale_y_continuous(trans=trans[[1]]) + ylab(trans[[2]])
    }
  }
}

# note: these layers work, but in case a log transform is applied,
# the transformed data is received here - we need the original one.
StatCenterX <- ggproto("StatCenterX", Stat,
                       required_aes = c("x"),
                       default_aes = aes(xintercept = ..x..),

                       setup_params = function(data, params) {
                          if (is.null(params$weight)) {
                             params$weight <- 1
                          }
                          params
                       },

                       compute_group = function(data, scales, weight) {
                          grp <- setdiff(names(data), c('x','weight'))
                          data <- group.central(data, 'x', grp, w='weight', method='median', col.name='x')
                          data
                       }
)

StatCenterY <- ggproto("StatCenterY", Stat,
                      required_aes = c("y", "group"),

                      setup_params = function(data, params) {
                         if (is.null(params$weight)) {
                            params$weight <- 1
                         }
                         params
                      },

                      compute_group = function(data, scales, weight) {
                         grp <- setdiff(names(data), c('y','weight'))
                         data <- group.central(data, 'y', grp, w='weight', method='median', col.name='y')
                         data$x <- data$group
                         data
                      }
)

geom_point_center <- function(mapping = NULL, data = NULL,
                              position = position_dodge(width=0.9),
                              shape = 1, color = "black", size = 2, fill = NA,  alpha = NA,
                              show.legend = NA, inherit.aes = TRUE, na.rm = TRUE, ...)
{
   layer(data = data, mapping = mapping, stat = StatCenterY, geom = GeomPoint,
         position = position, show.legend = show.legend, inherit.aes = inherit.aes,
         params = list(shape=shape, color=color, size=size, fill=fill, alpha=alpha,
                       na.rm = na.rm, ...))
}

# different from base geom_vline(), adapt color according to group, because of inherit.aes=TRUE
geom_vline_center <- function(mapping = NULL, data = NULL,
                              position = 'identity',
                              color = NULL, size = 0.7, alpha = NA,
                              linetype='dashed',
                              show.legend = NA, inherit.aes = TRUE, na.rm = TRUE, ...)
{
   layer(data = data, mapping = mapping, stat = StatCenterX, geom = GeomVline,
         position = position, show.legend = show.legend, inherit.aes = inherit.aes,
         params = list(size=size, alpha=alpha, linetype=linetype,
                       na.rm = na.rm, ...))
}

geom_vline_inheritable <- function (mapping = NULL, data = NULL, ..., xintercept, na.rm = FALSE,
          show.legend = NA)
{
   if (!missing(xintercept)) {
      data <- data.frame(xintercept = xintercept)
      mapping <- aes(xintercept = xintercept)
      show.legend <- FALSE
   }
   layer(data = data, mapping = mapping, stat = StatIdentity,
         geom = GeomVline, position = PositionIdentity, show.legend = show.legend,
         inherit.aes = TRUE, params = list(na.rm = na.rm, ...))
}


# 2D scatter plot of numeric or factor variables
# - overlay smoothing line if both variables are numeric
# - dedupe.scatter='area' - unique repeated points, count frequency
# - dedupe.scatter='jitter' - plot each repeated point separately, add
#   jitter if there are more than min.points.jitter with identical coordinates
# - counts/weights are drawn with a shaded circle of proportional area
# - jitter.x, jitter.y - the amount of jittering as a multiple of resolution
# - trans.log.thresh - threshold to decide on log-transform
# - fill.smooth - fill color for smoothing line
gplt.scatter <- function(data, x, y, w='NULL',
                         dedupe.scatter=c('area','jitter'),
                         min.points.jitter=3,
                         jitter.x=0.4,
                         jitter.y=0.4,
                         trans.log.thresh=2,
                         max.factor.levels=30,
                         fill.smooth='NULL',
                         verbose=FALSE,
                         ...) {

  dedupe.scatter <- match.arg(dedupe.scatter)
  flip <- FALSE
  if (is.numeric(data[[x]]) && (!is.numeric(data[[y]]))) {
    # HACK: ggplot2 does not implement vertical dodging, therefore we
    # use coord_flip() as a workaround
    flip <- TRUE
    tmp <- x
    x <- y
    y <- tmp
  }

  num.x <- is.numeric(data[[x]])
  num.y <- is.numeric(data[[y]])
  ord.x <- is.ordered(data[[x]])
  ord.y <- is.ordered(data[[y]])

  data <- order.factors(data, x, y, w=w, verbose=verbose)
  for (v in c(x,y)) {
    data <- limit.factor.levels(data, v, w=w,
                                max.levels=max.factor.levels)
  }

  if (dedupe.scatter == 'area') {
    # record frequency/total weight for each unique row of the data
    if (w == 'NULL') {
      # equivalent to 'uniq -c'
      data <- plyr::ddply(data, names(data), function(D) nrow(D))
      names(data)[length(data)] <- '.freq.'
      w <- '.freq.'
    } else {
      data <- plyr::ddply(data, setdiff(names(data), w), function(D) sum(D[[w]], na.rm=TRUE))
      names(data)[length(data)] <- w
    }
  }

  # dodging

  if (!num.x) {
    # example involving dodging:
    # i2<-iris
    # i2$c<-factor(cut(i2$Petal.Width,3))
    # names(i2)[6]<-'pw.quant'
    # plotluck(Petal.width~Species|pw.quant,i2, opts=plotluck.options(geom='scatter'))
    pos <- position_dodge(0.5)
  } else {
    pos <- position_identity()
  }

  # if there are a lot of points, better make them transparent
  alpha <- max(0.3, 0.8 - 0.8/2000 * nrow(data))

  p <- ggplot(data, aes_string(x=x, y=y, weight=w))

  if (w != 'NULL' && length(unique(data[[w]])) > 1) {
    # use point size to represent count/weight
    p <- p + geom_point(aes_string(size=w), alpha=alpha,
                        position=pos, na.rm=TRUE, ...) +
      scale_size(guide='none')
  } else {
    # use jittering
    if (max(table(data[, c(x, y)], useNA='ifany')) >= min.points.jitter) {
      # test for repeated points, optionally jitter
      # - if both x and y are numeric, jitter in both directions
      # - if one is a factor, only jitter in this direction
      #   (enough empty space between levels)
      if (num.x && !num.y) {
        jitter.x <- 0
        # resolution(data[[y]], zero=FALSE) == 1
      } else if (!num.x && num.y) {
        # resolution(data[[x]], zero=FALSE) == 1
        jitter.y <- 0
      } else {
        jitter.x <- jitter.x * resolution(data[[x]], zero=FALSE)
        jitter.y <- jitter.y * resolution(data[[y]], zero=FALSE)
      }
    } else {
      jitter.x <- 0
      jitter.y <- 0
    }

    p <- p + geom_point(alpha=alpha, position=position_jitter(width=jitter.x, height=jitter.y),
                        na.rm=TRUE, ...)
  }

  # draw smoothing line
  if (num.x && num.y) {
    p <- add.smooth.line(p, w, fill.smooth)
  }

  # axis transformation
  p <- add.axis.transform(p, data, x, 'x', trans.log.thresh, verbose=verbose)
  p <- add.axis.transform(p, data, y, 'y', trans.log.thresh, verbose=verbose)

  if (!flip) {
    if (num.x) {
      p <- p + theme_panel_num_x + theme_panel_num_y
    } else {
      p <- p + theme_panel_num_y
      if (length(levels(data[[x]])) < 10) {
        # grid lines for factors only necessary when very many
        p <- p + theme_panel_fac_x
      } else {
        p <- p + theme(panel.grid.major.x=element_line(color='black', linetype='dotted'),
                       panel.grid.minor.x=element_blank())
      }
      if (estimate.label.overlap(levels(data[[x]]))) {
        p <- p + theme_slanted_text_x
      }
    }
  } else {
    p <- p + coord_flip()
    if (num.x) {
      p <- p + theme_panel_num_y + theme_panel_num_x
    } else {
      p <- p + theme_panel_num_x
      if (length(levels(data[[x]])) < 10) {
        p <- p + theme_panel_fac_y
      } else {
        p <- p + theme(panel.grid.major.y=element_line(color='black', linetype='dotted'),
                       panel.grid.minor.y=element_blank())
      }
    }
  }
  list(plot=p, data=data)
}


# hexbin plot with overlayed smoothing line
gplt.hex <- function(data, x, y, w='NULL', trans.log.thresh=2,
                     fill.smooth='NULL', verbose=FALSE, ...) {

  p <- ggplot(data, aes_string(x=x, y=y)) +
    geom_hex(na.rm=TRUE, ...)

  p <- add.smooth.line(p, w, fill.smooth)

  # axis transformation
  p <- add.axis.transform(p, data, x, 'x', trans.log.thresh, verbose=verbose)
  p <- add.axis.transform(p, data, y, 'y', trans.log.thresh, verbose=verbose)

  p <- p +
    # log scaling of color often reveals more details
    scale_fill_gradientn(colors=c(hcl(66,60,95), hcl(128,100,45)), trans='log', guide='none') +
    theme(legend.position='right') +
    theme_panel_num_x + theme_panel_num_y

  list(plot=p, data=data)
}


# heat map
# point grid of discretized values, with optional discretized color (z)
# the color is a representation of an appropriate central tendency measure
# for the point bin (mode for factors, median for ordinal and numeric vectors)
gplt.heat <- function(data, x, y, z='NULL', w='NULL', trans.log.thresh=2,
                        max.factor.levels=30,
                        resolution.heat=30,
                        palette.brewer.seq='YlGn',
                        palette.brewer.qual='Set1',
                        verbose=FALSE,
                        ...) {

  num.x <- is.numeric(data[[x]])
  ord.x <- is.ordered(data[[x]])
  num.y <- is.numeric(data[[y]])
  ord.y <- is.ordered(data[[y]])

  num.z <- FALSE
  ord.z <- FALSE
  ex.z <- (!is.null(z)) && (z != 'NULL')
  if (ex.z) {
    num.z <- is.numeric(data[[z]])
    ord.z <- is.ordered(data[[z]])
    if (!num.z && !ord.z) {
      data <- order.factor.by.freq(data, z, w)
    }
  }

  data <- order.factors(data, x, y, z, w, verbose=verbose)

  # quantize variables
  for (v in c(x,y)) {
    if (is.numeric(data[[v]])) {
      data <- discretize(data, v, w=w, max.breaks=resolution.heat, estimate.breaks=FALSE,
                         method='histogram')
    } else {
      data <- limit.factor.levels(data, v, w=w,
                                  max.levels=min(resolution.heat,max.factor.levels),
                                  tol=max.factor.levels)
    }
  }

  g.vars <- names(data)
  if (ex.z) {
    g.vars <- setdiff(g.vars, z)
  }

  if (w != 'NULL') {
    g.vars <- setdiff(g.vars, w)
  }

  # HACK: remove precomputed columns
  g.vars <- setdiff(g.vars, '.center.')

  # replace z values with group center
  if (ex.z) {
    data <- replace.by.central(data, z, g.vars, w)

    breaks <- length(get.palette(data[[z]], palette.brewer.seq,
                               palette.brewer.qual)) - 1

    if (num.z) {
      # if z distribution is very skewed, better to quantize than to histogram
      trans.z <- get.trans.fun(data, z, trans.log.thresh)
      method <- 'histogram'
      if (trans.z[[1]]$name != 'identity') {
        method <= 'quantile'
      }

      # limited by color palette
      data <- discretize(data, z, w=w, max.breaks=breaks, tol=breaks,
                         estimate.breaks=FALSE, method=method)
    } else {
      data <- limit.factor.levels(data, z, w,
                                  max.levels=breaks,
                                  tol=breaks)
    }
    g.vars <- c(g.vars, z)
  }

  # sum up counts or weights
  if (w == 'NULL') {
    data <- plyr::ddply(data, g.vars, function(D) nrow(D))
    names(data)[length(data)] <- '.freq.'
    w <- '.freq.'
  } else {
    data <- plyr::ddply(data, g.vars, function(D) sum(D[[w]], na.rm=TRUE))
    names(data)[length(data)] <- w
  }

  # scale the area of the rectangle such that:
  # - the largest one is equal to the grid length
  # - the smallest one is still visible
  # - the area is proportional to the weight

  res.x <- resolution(as.numeric(data[[x]]))
  res.y <- resolution(as.numeric(data[[y]]))

  w.max <- sqrt(max(data[[w]]))
  w.min <- sqrt(min(data[[w]]))
  w.rg <- w.max - w.min
  if (w.rg == 0) {
     sz <- rep(1, nrow(data))
  } else {
     sz.max <- 1
     sz.min <- max(0.1, w.min/w.max)
     sz <- sz.min + (sqrt(data[[w]]) - w.min) * (sz.max - sz.min)  / w.rg
  }

  data$width <- res.x * sz
  data$height <- res.y * sz

  p <- ggplot(data, aes_string(x=x, y=y, height='height', width='width', color=z, fill=z, weight=w)) +
    geom_tile(na.rm=TRUE, ...)

  # axis transformation
  # known issue: logarithmic scaling can make the rectangles overlap!
  # plotluck(name ~ sleep_total + brainwt, data = msleep)
  # disabling for now ...
  #p <- add.axis.transform(p, data, x, 'x', trans.log.thresh, verbose=verbose)
  #p <- add.axis.transform(p, data, y, 'y', trans.log.thresh, verbose=verbose)

  if (ex.z) {
    p <- add.color.fill(p, data, z,
                        palette.brewer.seq=palette.brewer.seq,
                        palette.brewer.qual=palette.brewer.qual)
    p <- add.color.legend(p, data, z, 'fill')
  }

  # show gridlines
  p <- p + theme_panel_num_x + theme_panel_num_y
  if (!num.x && estimate.label.overlap(levels(data[[x]]))) {
    p <- p + theme_slanted_text_x
  }

  list(plot=p, data=data)
}

# cleveland dot plot or bar plot with stat_bin
# no log transforms for bars; see http://www.perceptualedge.com/articles/b-eye/dot_plots.pdf
gplt.dot <- function(data, x, w='NULL', vertical=TRUE,
                     max.factor.levels=30, geom=c('dot', 'bar'), verbose=FALSE, ...) {

  geom <- match.arg(geom)

  if (!is.ordered(data[[x]])) {
    data <- order.factor.by.freq(data, x, w)
  }

  data <- limit.factor.levels(data, x, w=w,
                              max.levels=max.factor.levels)

  ylab <- 'count'
  if (w != 'NULL') {
    ylab <- w
  }

  p <- ggplot(data, aes_string(x=x, weight=w))
  if (geom == 'dot') {
    # if there are only few points, plot direction is not obvious to the viewer
    if (!vertical) {
      shp <- '^'
    } else {
      shp <- '>'
    }
    p <- p + geom_point(stat='count', shape=shp, size=6, na.rm=TRUE, ...)
  } else {
    p <- p + geom_bar(position=position_dodge(), alpha=0.8, width=0.2, na.rm=TRUE, ...)
  }

  if (!vertical) {
    p <- p + theme_panel_num_y + ylab(ylab)

    if (length(levels(data[[x]])) < 10) {
      p <- p + theme_panel_fac_x
    } else {
      p <- p + theme(panel.grid.major.x=element_line(color='black', linetype='dotted'),
                     panel.grid.minor.x=element_blank())
    }

    if (estimate.label.overlap(levels(data[[x]]))) {
      p <- p + theme_slanted_text_x
    }

  } else { # horizontal
    p <- p + coord_flip() + theme_panel_num_x + ylab(ylab)
    if (length(levels(data[[x]])) < 10) {
      p <- p + theme_panel_fac_y
    } else {
      p <- p + theme(panel.grid.major.y=element_line(color='black', linetype='dotted'),
                     panel.grid.minor.y=element_blank())
    }
  }
  list(plot=p, data=data)
}


# violin plot with overlayed median point
# assumption: x is factor, y is numeric
gplt.violin <- function(data, x, y, w='NULL',
                        trans.log.thresh=2,
                        max.factor.levels=30,
                        verbose=FALSE,
                        ...) {

   flip <- FALSE
   if (is.numeric(data[[x]])) {
      if (is.numeric(data[[y]])) {
         stop('Violin plot requires one factor variable')
      }
      flip <- TRUE
      tmp <- x
      x <- y
      y <- tmp
   }
   # note: geom_violin throws an error if all values are equal
   # workaround: add very small jitter
   data[[y]] <- data[[y]] +
      1E-8 * min(data[[y]], na.rm=TRUE) * (runif(nrow(data)) - 0.5)

   if (!is.ordered(data[[x]])) {
      data <- order.factor.by.value(data, x, y, w=w, verbose=verbose)
   }

   data <- limit.factor.levels(data, x, w=w,
                               max.levels=max.factor.levels)

   p <- ggplot(data, aes_string(x=x, y=y, weight=w, ymax=max(y,na.rm=TRUE))) +
      geom_violin(scale='width', alpha=0.8, na.rm=TRUE, ...)

   if ('.center.' %in% names(data)) {
      # dodge does not work correctly when width is not specified
      # see https://github.com/hadley/ggplot2/issues/525
      p <- p + geom_point(mapping=aes_(y=~.center.),
                          position=position_dodge(width=0.9),
                          size=2, shape=1, na.rm=TRUE)
   }

   # '+ geom_point_center()' is unfortunately not working correctly for the case that a
   # group (y-value) AND a color is specified, e.g.,
   # plotluck(Petal.Length~Species|Petal.Width,iris,opts=plotluck.options(geom='violin',max.factor.levels.color=10))
   # In this case, compute_group() in StatCenterX is only called once per y-value.

   # axis transformation
   p <- add.axis.transform(p, data, y, 'y', trans.log.thresh, verbose=verbose)

   if (!flip) {
      p <- p + theme_panel_fac_x + theme_panel_num_y

      if (estimate.label.overlap(levels(data[[x]]))) {
         p <- p + theme_slanted_text_x
      }

   } else {
      p <- p + coord_flip() + theme_panel_fac_y + theme_panel_num_x
   }
   list(plot=p, data=data)
}


# box plot
# assumption: x is factor, y is numeric
gplt.box <- function(data, x, y, w='NULL',
                     trans.log.thresh=2,
                     max.factor.levels=30,
                     verbose=FALSE,
                     ...) {

  flip <- FALSE
  if (is.numeric(data[[x]])) {
    if (is.numeric(data[[y]])) {
      stop('Box plot requires one factor variable')
    }
    flip <- TRUE
    tmp <- x
    x <- y
    y <- tmp
  }

  if (!is.ordered(data[[x]])) {
    data <- order.factor.by.value(data, x, y, w, verbose=verbose)
  }

  data <- limit.factor.levels(data, x, w=w,
                              max.levels=max.factor.levels)

  p <- ggplot(data, aes_string(x=x, y=y, weight=w, ymax=max(y,na.rm=TRUE))) +
    geom_boxplot(alpha=0.8, na.rm=TRUE, ...)

  # axis transformation
  p <- add.axis.transform(p, data, y, 'y', trans.log.thresh, verbose=verbose)

  if (!flip) {
    p <- p + theme_panel_fac_x + theme_panel_num_y

    if (estimate.label.overlap(levels(data[[x]]))) {
      p <- p + theme_slanted_text_x
    }
  } else {
    p <- p + coord_flip() + theme_panel_fac_y + theme_panel_num_x
  }
  list(plot=p, data=data)
}


# density plot with overlayed vertical median lines
gplt.density <- function(data, x, w='NULL',
                         trans.log.thresh=2,
                         verbose=FALSE,
                         ...) {

   ylab <- 'density'
   if (w != 'NULL') {
      ylab <- sprintf('%s density', w)
   }

   p <- ggplot(data, aes_string(x=x, weight=w)) +
      geom_density(aes_(y=~after_stat(scaled)), alpha=0.6, adjust=0.5, trim=TRUE, na.rm=TRUE, ...) +
      geom_rug(na.rm=TRUE)
      # unfortunately the following does not work for log transformation -
      # the layer received the transformed data + geom_vline_center()

   if ('.center.' %in% names(data)) {
      p <- p + geom_vline_inheritable(aes_(xintercept=~.center.),
                                      linetype='dashed', size=0.7, na.rm=TRUE)
   }

   # axis transformation
   p <- add.axis.transform(p, data, x, 'x', trans.log.thresh, verbose=verbose) +
      ylab(ylab) +
      # density numbers themselves not very meaningful
      theme(axis.ticks.y=element_blank(),
            axis.text.y=element_blank()) +
      theme_panel_num_x
   list(plot=p, data=data)
}

# histogram plot with overlayed vertical median lines
gplt.histogram <- function(data, x, w='NULL',
                           n.breaks=NA,
                           trans.log.thresh=2,
                           verbose=FALSE,
                           ...) {

  if (is.numeric(n.breaks)) {
    n.breaks <- n.breaks
  } else {
    n.breaks <- nclass.FD.modified(data[[x]])
  }
  bin.width <- diff(range(data[[x]], na.rm=TRUE))/n.breaks
  p <- ggplot(data, aes_string(x=x, weight=w)) +
    geom_histogram(binwidth=bin.width, position='identity', alpha=0.6, na.rm=TRUE, ...) +
    geom_rug(na.rm=TRUE) + geom_vline_center()

  # axis transformation
  p <- add.axis.transform(p, data, x, 'x', trans.log.thresh, verbose=verbose)

  ylab <- 'count'
  if (w != 'NULL') {
    ylab <- w
  }

  p <- p + ylab(ylab) +
    theme_panel_num_x + theme_panel_num_y
  list(plot=p, data=data)
}


# helper function
norm.sum <- function(x) {
   x[is.na(x)] <- 1e-20
   x <- x / sum(x)
   x
}

# spine plot of two variables
# note: We could use mosaicplot, but we want to consistently stick with ggplot
#    another option would be the prodplots package, but currently seems
#    to be in development stage
#
# - plot.margin.x - horizontal gap between rectangles for x-values
# - no gaps between y-rectangles
#
# note: additional columns other than x,y,w are dropped; this precludes later
# faceting on them. They could be preserved, however this function is already
# complicated as it is.

gplt.spine <- function(data, x, y, w='NULL',
                       plot.margin.x=0.05,
                       max.factor.levels=10,
                       palette.brewer.seq='YlGn',
                       palette.brewer.qual='Set1',
                       verbose=FALSE,
                       ...) {

  # ordering and factor truncation
  data <- order.factors(data, x, y, w=w, verbose=verbose)
  for (v in c(x, y)) {
    data <- quantize(data, v, w=w, n.levels=max.factor.levels)
    if (is.numeric(data[[v]])) {
      data[[v]] <- ordered(data[[v]], exclude=NULL)
    }
  }

  x.lev <- length(levels(data[[x]]))
  y.lev <- length(levels(data[[y]]))

  xy.tab  <- marginalize(data, c(x, y), w)
  x.tab   <- marginalize(data, x, w)
  y.cond  <- sweep(xy.tab, 1, x.tab, FUN='/')

  x.tab.df             <- as.data.frame(x.tab, responseName='x.mrg')
  x.tab.df$x.mrg.cum   <- c(0, cumsum(x.tab.df$x.mrg)[1:(x.lev-1)])
  x.tab.df$x.cnt       <- seq(0, x.lev-1)
  x.tab.df$x.center    <- with(x.tab.df, x.mrg.cum  + x.mrg/2 + x.cnt * plot.margin.x)

  y.cond.df            <- as.data.frame(aperm(y.cond), responseName='y.cond')
  #y.cond.df$y.cond.cum <- as.vector(apply(y.cond, 1, function(x) {c(0, cumsum(x)[1:(y.lev-1)])}))

  # note: if there is no data for an x/y combination, make y coordinates non-NA for displaying
  # 'zero-area' rects
  y.cond.df <- plyr::ddply(y.cond.df, x, plyr::here(transform), y.cond=norm.sum(y.cond))
  y.cond.df <- plyr::ddply(y.cond.df, x, transform, y.cond.cum=cumsum(y.cond))
  y.cond.df$y.cond.cum <- y.cond.df$y.cond.cum - y.cond.df$y.cond

  plot.data <- y.cond.df
  plot.data <- merge(plot.data, x.tab.df)

  plot.data$left     <- plot.data$x.mrg.cum  + plot.data$x.cnt * plot.margin.x
  plot.data$right    <- plot.data$left       + plot.data$x.mrg
  plot.data$bottom   <- plot.data$y.cond.cum
  plot.data$top      <- plot.data$bottom     + plot.data$y.cond

  # remove empty rects
  #idx <- plot.data$bottom < plot.data$top & plot.data$left < plot.data$right
  #plot.data <- plot.data[idx,]

  p <- ggplot(plot.data, aes_(xmin=~left, xmax=~right, ymin=~bottom, ymax=~top)) +
    # make outline color same as fill color - black outline will hide colors
    # for very narrow stripes
    geom_rect(aes_string(color=y, fill=y), size=1, alpha=0.6, na.rm=TRUE, ...) +
    scale_x_continuous(breaks=x.tab.df$x.center, labels=levels(data[[x]])) +
    theme(axis.ticks.x=element_blank()) +
    # y reflected in color legend; however, we do want the label when part of a multiplot!
    xlab(x)  + ylab(y) +
    theme_panel_fac_y

  p <- add.color.fill(p, data, y,
                      palette.brewer.seq=palette.brewer.seq,
                      palette.brewer.qual=palette.brewer.qual)
  p <- add.color.legend(p, data, y, 'fill')

  if (estimate.label.overlap(labels=levels(data[[x]]), breaks=x.tab.df$x.center)) {
    p <- p + theme_slanted_text_x
  }
  list(plot=p, data=data)
}

# spine plot of three variables
# - plot.margin.x - horizontal gap between rectangles for x-values
# - plot.margin.y - vertical gap between rectangles for y-values
# - no gaps between z-rectangles
gplt.spine3 <- function(data, x, y, z, w='NULL',
                        plot.margin.x=0.05,
                        plot.margin.y=0.02,
                        max.factor.levels=10,
                        palette.brewer.seq='YlGn',
                        palette.brewer.qual='Set1',
                        verbose=FALSE,
                        ...) {

  # ordering and factor truncation
  data <- order.factors(data, x, y, z, w=w, verbose=verbose)
  for (v in c(x, y, z)) {
    data <- quantize(data, v, w=w, n.levels=max.factor.levels)
    if (is.numeric(data[[v]])) {
      data[[v]] <- ordered(data[[v]], exclude=NULL)
    }
  }

  x.lev <- length(levels(data[[x]]))
  y.lev <- length(levels(data[[y]]))
  z.lev <- length(levels(data[[z]]))

  xyz.tab <- marginalize(data, c(x, y, z), w)
  xy.tab  <- marginalize(data, c(x, y), w)
  x.tab   <- marginalize(data, x, w)
  y.tab   <- marginalize(data, y, w)
  y.cond  <- sweep(xy.tab, 1, x.tab, FUN='/')
  z.cond  <- sweep(xyz.tab, 1:2, xy.tab, FUN='/')

  x.tab.df             <- as.data.frame(x.tab, responseName='x.mrg')
  x.tab.df$x.mrg.cum   <- c(0, cumsum(x.tab.df$x.mrg)[1:(x.lev-1)])
  x.tab.df$x.cnt       <- seq(0, x.lev-1)
  x.tab.df$x.center    <- with(x.tab.df, x.mrg.cum  + x.mrg/2 + x.cnt * plot.margin.x)

  y.tab.df             <- as.data.frame(y.tab, responseName='y.mrg')
  y.tab.df$y.mrg.cum   <- c(0, cumsum(y.tab.df$y.mrg)[1:(y.lev -1)])
  y.tab.df$y.cnt       <- seq(0, y.lev-1)
  y.tab.df$y.center    <- with(y.tab.df, y.mrg.cum  + y.mrg/2 + y.cnt * plot.margin.y)

  y.cond.df            <- as.data.frame(aperm(y.cond), responseName='y.cond')
  y.cond.df$y.cond.cum <- as.vector(apply(y.cond, 1, function(x) {c(0, cumsum(x)[1:(y.lev-1)])}))

  z.cond.df            <- as.data.frame(aperm(z.cond), responseName='z.cond')
  # note: if there is no data for an x/y combination, make z coordinates
  # non.NA and evenly spaced for displaying 'zero-area' rects
  z.cond.df <- plyr::ddply(z.cond.df, c(y,x), plyr::here(transform), z.cond=norm.sum(z.cond))
  z.cond.df <- plyr::ddply(z.cond.df, c(y,x), transform, z.cond.cum=cumsum(z.cond))
  z.cond.df$z.cond.cum <- z.cond.df$z.cond.cum - z.cond.df$z.cond

  #z.cond.df$z.cond.cum <- as.vector(apply(z.cond, 2:1,
  #                                         function(x) {c(0, cumsum(x)[1:(z.lev-1)])}))

  plot.data <- z.cond.df
  plot.data <- merge(plot.data, y.cond.df)
  plot.data <- merge(plot.data, x.tab.df)
  plot.data <- merge(plot.data, y.tab.df)

  plot.data$left     <- plot.data$x.mrg.cum  + plot.data$z.cond.cum * plot.data$x.mrg + plot.data$x.cnt * plot.margin.x
  plot.data$right    <- plot.data$left       + plot.data$z.cond * plot.data$x.mrg
  plot.data$bottom   <- plot.data$y.cond.cum + plot.data$y.cnt * plot.margin.y
  plot.data$top      <- plot.data$bottom     + plot.data$y.cond

  p <- ggplot(plot.data, aes_(xmin=~left, xmax=~right, ymin=~bottom, ymax=~top)) +
    # make outline color same as fill color - black outline will hide colors
    # for very narrow stripes
    # size=1: make sure empty rects are still visible
    geom_rect(aes_string(color=z, fill=z), size=1, alpha=0.6, na.rm=TRUE, ...) +
    scale_y_continuous(breaks=y.tab.df$y.center, labels=levels(data[[y]])) +
    scale_x_continuous(breaks=x.tab.df$x.center, labels=levels(data[[x]]))

  p <- add.color.fill(p, data, z,
                      palette.brewer.seq=palette.brewer.seq,
                      palette.brewer.qual=palette.brewer.qual)
  p <- add.color.legend(p, data, z, 'fill')

  p <- p + xlab(x) + ylab(y) +
    theme_panel_fac_y +
    theme(axis.ticks.x=element_blank(),
          axis.ticks.y=element_blank())

  # write labels slanted to mitigate overlap
  if (estimate.label.overlap(labels=levels(data[[x]]), breaks=x.tab.df$x.center)) {
    p <- p + theme_slanted_text_x
  }
  list(plot=p, data=data)
}


# blank plot with optional message text
gplt.blank <- function(text=NULL, ...) {
  p <- ggplot(data.frame(x=c(0,1), y=c(0,1)), aes_(x=~x, y=~y)) +
    geom_blank(...) +
    labs(x=NULL, y=NULL) +
    xlim(0,1) +
    ylim(0,1) +
    theme(axis.ticks.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.y=element_blank(),
          axis.text.y=element_blank(),
          panel.background=element_blank())

  if (!is.null(text)) {
    p <- p + annotate('text', x=0.5, y=0.5, label=text, hjust=0.5, vjust=0.5)
  }
  list(plot=p, data=NULL)
}


#' Create options structure for \code{plotluck}
#'
#' @param opts An (optional) named list to start with. Anything not specified in ... will be inherited from opts.
#' @param ... Parameters to override default settings
#' @return A named list of options, usable as argument to function \code{\link{plotluck}}.
#'
#'  \code{plotluck} accepts a list of options to modify its behavior. Calling
#'  \code{plotluck.options} without arguments produces a list with the default
#'  values. Specifying any number of attribute/value pairs overrides them
#'  selectively.
#'
#'\tabular{lll}{
#' \strong{Option}\tab\strong{Default}\tab\strong{Comment}\cr
#' \code{na.rm} \tab \code{FALSE} \tab Do not show missing factor values as separate level.\cr
#' \code{geom} \tab \code{"auto"} \tab Override type of plot; the available types for a given formula and variables can be inspected with \code{verbose=TRUE}.\cr
#' \code{sample.max.rows} \tab \code{100000} \tab If the data set has more rows than that, sample it down.\cr
#' \code{trans.log.thresh} \tab \code{2} \tab Threshold for logarithmic axis scaling. Visible magnification factor of the central region of the distribution.\cr
#' \code{n.breaks.histogram} \tab \code{NA} \tab Override the number of histogram breaks.\cr
#' \code{min.points.hex} \tab \code{5000} \tab Minimum data points required to display a hexbin plot.\cr
#' \code{min.points.density} \tab \code{20} \tab Minimum data points required to display a density or histogram plot.\cr
#' \code{min.points.violin} \tab \code{20} \tab Minimum data points required to display a violin or box plot.\cr
#' \code{resolution.heat} \tab \code{30} \tab Grid spacing for heat maps.\cr
#' \code{dedupe.scatter} \tab \code{'area'} \tab To represent multiple instances of the same coordinates in scatter plot: scale the point size, or use jitter?\cr
#' \code{min.points.jitter} \tab \code{3} \tab Minimum number of coordinate duplicates to start jittering points.\cr
#' \code{jitter.x} \tab \code{0.4} \tab Amount of jitter to apply in horizontal direction, as a fraction of resolution.\cr
#' \code{jitter.y} \tab \code{0.4} \tab  Amount of jitter to apply in vertical direction, as a fraction of resolution.\cr
#' \code{few.unique.as.factor} \tab \code{5} \tab If a numeric variable has less than that many unique values, make it an ordered factor.\cr
#' \code{max.factor.levels} \tab \code{30} \tab For factors with more than that many levels, least frequent ones will be pruned into "other".\cr
#' \code{max.factor.levels.color} \tab \code{3} \tab Maximum number of factor levels that can be represented as colors in the same plot.\cr
#' \code{max.factor.levels.violin} \tab \code{20} \tab Maximum number of levels to plot violins; rather switch to box plot. \cr
#' \code{max.factor.levels.spine.x} \tab \code{20} \tab Maximum number of levels to plot in x-direction in a spine plot. \cr
#' \code{max.factor.levels.spine.y} \tab \code{10} \tab Maximum number of levels to plot in y-direction in a spine plot. \cr
#' \code{max.factor.levels.spine.z} \tab \code{5} \tab Maximum number of levels to represent as colors in a spine plot. \cr
#' \code{spine.plot.margin.x} \tab \code{0.05} \tab Horizontal gap between rectangles in a spine plot. \cr
#' \code{spine.plot.margin.y} \tab \code{0.02} \tab Vertical gap between rectangles in a spine plot. \cr
#' \code{facet.max.cols} \tab \code{10} \tab Maximum number of facet columns for conditional variables. \cr
#' \code{facet.max.rows} \tab \code{10} \tab Maximum number of facet rows for conditional variables. \cr
#' \code{facet.num.wrap} \tab \code{6} \tab Preferred number of facets for single conditional variable. \cr
#' \code{facet.num.grid} \tab \code{3} \tab Preferred number of facets for each of two conditional variables. \cr
#' \code{prefer.factors.vert} \tab \code{TRUE} \tab In mixed numeric/factor plots, use vertical axis for the factor. \cr
#' \code{fill.default} \tab \code{"deepskyblue"} \tab Default fill color for density and histogram plots. \cr
#' \code{palette.brewer.seq} \tab \code{"YlGn"} \tab Sequential brewer palette name. \cr
#' \code{palette.brewer.qual} \tab \code{"Set1"} \tab Qualitative brewer palette name. \cr
#' \code{multi.entropy.order} \tab \code{TRUE} \tab Use estimated conditional entropy to order multi-plots. \cr
#' \code{multi.max.rows} \tab \code{6} \tab Maximum number of rows for multi-plots. \cr
#' \code{multi.max.cols} \tab \code{6} \tab Maximum number of columns for multi-plots. \cr
#' \code{multi.in.grid} \tab \code{TRUE} \tab In multi-plots, make a page with multiple plots, or generate each one separately. \cr
#' \code{verbose} \tab \code{FALSE}\tab Print information about plot types, ordering, scaling, etc. \cr}
#'
#' @note \code{plotluck}'s aim is to provide a function that is usable
#'  "out-of-the-box", with no or very little manual tweaking. If you find
#'  yourself needing to change option values repeatedly or find the presets to
#'  be suboptimal, please contact the author.
#'
#' @seealso \code{\link{plotluck}}
#' @export
#'
#' @examples
#' # list all default options
#' plotluck.options()
#'
#' data(iris)
#' # default with violin plot
#' plotluck(iris, Petal.Length~Species)
#'
#' # use box-and-whiskers plot instead
#' plotluck(iris, Petal.Length~Species, opts=plotluck.options(geom='box'))
#'
#' @export
plotluck.options <- function(opts,...) {
  if (missing(opts)) {
    opts <- list(
      na.rm=FALSE,
      geom='auto',
      sample.max.rows=100000,
      trans.log.thresh=2,
      n.breaks.histogram=NA,
      min.points.hex=5000,
      min.points.density=20,
      min.points.violin=20,
      min.points.jitter=3,
      jitter.x=0.4,
      jitter.y=0.4,
      resolution.heat=30,
      dedupe.scatter='area',
      few.unique.as.factor=5,
      max.factor.levels=30,
      max.factor.levels.color=3,
      max.factor.levels.violin=20,
      max.factor.levels.spine.x=20,
      max.factor.levels.spine.y=10,
      max.factor.levels.spine.z=5,
      spine.plot.margin.x=0.05,
      spine.plot.margin.y=0.02,
      facet.max.cols=10,
      facet.max.rows=10,
      facet.num.wrap=6,
      facet.num.grid=3,
      prefer.factors.vert=TRUE,
      fill.default='deepskyblue',
      palette.brewer.seq='YlGn',
      palette.brewer.qual='Set1',
      multi.entropy.order=TRUE,
      multi.max.rows=6,
      multi.max.cols=6,
      multi.in.grid=TRUE,
      verbose=FALSE)
  }
  overrides <- list(...)
  unknown <- setdiff(names(overrides), names(opts))
  if (length(unknown) > 0) {
    stop(sprintf('Unknown options: %s', paste(unknown, sep='',collapse =', ')))
  }
  opts[names(overrides)] <- overrides
  opts
}


sample.data <- function(data, w='NULL', max.rows) {
  n.row <- nrow(data)
  if (n.row > max.rows) {
    if (w == 'NULL') {
      data <- data[sample(1:n.row, max.rows),, drop=FALSE]
    } else {
      # note: weighted sampling itself can be quite slow
      data <- data[sample(1:n.row, max.rows, prob=data[[w]]),, drop=FALSE]
    }
  }
  data
}

# expected input formula: 'x', 'x*y', 'x+y', 'x+1'
# output: list of occurring variables, ignoring constants
parse.formula.term <- function(form) {
  if (inherits(form, 'numeric')) {
    return(character())
  }
  if (inherits(form, 'name')) {
    return(as.character(form))
  }
  if (as.character(form[[1]]) %in% c('+', '*')) {
    res <- character()
    if (inherits(form[[2]], 'name')) {
      res <- as.character(form[[2]])
    } else if (!inherits(form[[2]], 'numeric')) {
      stop('Invalid formula: at most two dependent or conditional variables allowed')
    }
    if (inherits(form[[3]], 'name')) {
      res <- c(res, as.character(form[[3]]))
    } else if (!inherits(form[[3]], 'numeric')) {
      stop('Invalid formula: at most two dependent or conditional variables allowed')
    }
    return(res)
  }
  stop('Invalid formula')
}

# expected input: conditional formula with up to three variables
# output: list of lists (response, independent variable, conditionals)
parse.formula <- function(form) {
  if (as.character(form[[1]]) != '~') {
    stop('Invalid formula')
  }
  node <- form[[2]]
  if (!inherits(node,'name')) {
    stop('Invalid formula: only one dependent variable allowed')
  }
  resp <- as.character(node)
  indep <- character()
  cond <- character()

  node <- form[[3]]
  if (!inherits(node,'name') && as.character(node[[1]])  == '|') {
    indep <- parse.formula.term(node[[2]])
    cond <- parse.formula.term(node[[3]])
  } else {
    indep <- parse.formula.term(node)
  }

  if (resp != '.' && (resp %in% indep || resp %in% cond)) {
    stop('Invalid formula: variable cannot be both dependent and independent')
  }
  if (length(intersect(indep,cond))>0) {
    stop('Variables can only be used once')
  }
  if (length(indep)+length(cond) > 2) {
    stop('Invalid formula: at most 3 variables allowed')
  }
  return(list(resp, indep, cond))
}

# try to match user input to column names, if not exact
correct.varnames <- function(x, data) {
  if (length(x) == 0 || length(x) == 1 && x == '.') {
    return(x)
  }
  for (i in seq(length(x))) {
    idx <- pmatch(tolower(x[i]), tolower(names(data)))
    if (!is.na(idx)) {
      x[i] <- names(data)[idx]
    } else {
      stop(sprintf('No such variable: %s', x[i]))
    }
  }
  x
}

# process weights
process.weights <- function(data, weights) {
  if (weights == 'NULL') {
    return(data)
  }

  if  (!is.numeric(data[[weights]])) {
    stop('Weight must be numeric')
  }
  if  (any(data[[weights]]<0)) {
    stop('Weight must be non-negative')
  }
  weight.na <- is.na(data[[weights]])
  if  (any(weight.na)) {
    message('Weight is NA for %d instances, deleting', length(which(weight.na)))
    data <- data[!weight.na,]
  }
  # if weights are integer type, Hmisc::wtd.quantile() can lead to NA due to overflow
  data[[weights]] <- as.double(data[[weights]])
  data
}

info.options <- function(chosen, choices, verbose=FALSE) {
  if (verbose) {
    choices <- setdiff(choices, 'auto')
    cat(sprintf("Choosing geom='%s' out of possible options: '%s'\n", chosen, paste0(choices,collapse=', ')))
  }
}

info.threshold <- function(cond, msg, threshold, ...) {
  if (cond) {
    cat(sprintf(msg, ...), sprintf('(%s = %s)\n', deparse(match.call()$threshold), threshold))
  }
}

#' "I'm feeling lucky" for ggplot
#'
#' The purpose of \code{plotluck} is to let the user focus on \emph{what} to plot,
#' and automate the \emph{how}. Given a dependency formula with up to three
#' variables, it tries to choose the most suitable type of plot. It also automates
#' sampling large datasets, correct handling of observation weights, logarithmic
#' axis scaling, ordering and pruning of factor levels, and overlaying smoothing
#' curves or median lines.
#'
#' @param data a data frame.
#' @param formula an object of class \code{\link[stats]{formula}}: a symbolic description
#'  of the relationship of up to three variables.
#' \tabular{lll}{
#' \strong{Formula}\tab\strong{Meaning}\tab\strong{Plot types}\cr
#' \code{y~1}\tab Distribution of single variable\tab Density, histogram, scatter, dot, bar\cr
#' \code{y~x}\tab One explanatory variable\tab Scatter, hex, violin, box, spine, heat\cr
#' \code{y~x+z}\tab Two explanatory variables\tab heat, spine\cr
#' \code{y~1|z} or \code{y~x|z}\tab One conditional variable\tab Represented through coloring or facetting\cr
#' \code{y~1|x+z}\tab Two conditional variables\tab Represented through facetting\cr}
#' In addition to these base plot types, the dot symbol \code{"."} can also be used,
#' and denotes all variables in the data frame. This gives rise to a lattice or
#' series of plots (use with caution, can be slow).
#' \tabular{lll}{
#' \strong{Formula}\tab\strong{Meaning}\cr
#' \code{.~1}\tab Distribution of each variable in the data frame, separately\cr
#' \code{y~.}\tab Plot \code{y} against each variable in the data frame\cr
#' \code{.~x}\tab Plot each variable in the data frame against \code{x}\cr
#' \code{.~.}\tab Plot each variable in the data frame against each other.\cr}
#'  See also section "Generating multiple plots at once" below.
#' @param weights observation weights or frequencies (optional).
#' @param opts a named list of options (optional); See also \code{\link{plotluck.options}}.
#' @param ... additional parameters to be passed to the respective ggplot2 geom objects.
#' @return a ggplot object, or a plotluck.multi object if the dot symbol was used.
#' @export
#' @keywords hplot
#' @keywords aplot
#' @keywords dplot
#' @concept automation
#' @concept visualization
#' @concept plotting
#' @concept exploratory data analysis
#' @concept ggplot
#' @concept ggplot2
#' @concept heat map
#' @concept density plot
#' @concept violin plot
#' @concept hexbin
#' @concept histogram
#' @concept bar plot
#' @concept box plot
#' @concept spine plot
#' @concept mosaic plot
#' @concept scatter plot
#' @concept heat map
#'
#' @seealso \code{\link{plotluck.options}}, \code{\link{sample.plotluck}}, \code{\link[ggplot2]{ggplot}}
#'
#' @section Determining the type of plot: Besides the shape of the formula, the
#'   algorithm takes into account the type of variables as either numeric, ordered,
#'   or unordered factors. Often, it makes sense to treat ordered factors similarly
#'   as numeric types.
#'
#'   One-variable numeric (resp. factor) distributions are usually represented by
#'   density (resp. Cleveland dot) charts, but can be overridden to histograms or
#'   bar plots using the \code{geom} option. Density plots come with an overlaid
#'   vertical median line.
#'
#'   For two numerical variables, by default a scatter plot is produced, but for
#'   high numbers of points a hexbin is preferred (option \code{min.points.hex}).
#'   These plots come with a smoothing line and standard deviation.
#'
#'   The relation between two factor variables can be depicted best by spine
#'   (a.k.a., mosaic) plots, unless they have too many levels (options
#'   \code{max.factor.levels.spine.x}, \code{max.factor.levels.spine.y},
#'   \code{max.factor.levels.spine.z}). Otherwise, a heat map is produced.
#'
#'   For a mixed-type (factor/numeric) pair of variables, violin (overridable
#'   to box) plots are generated. However, if the resulting graph would contain
#'   too many (more than \code{max.factor.levels.violin}) violin plots in a row,
#'   the algorithm switches automatically. The number of bins of a histogram can
#'   be customized with \code{n.breaks.histogram}. The default setting, \code{NA},
#'   applies a heuristic estimate.
#'
#'   The case of a response two dependent variables (`y~x+z`) is covered by
#'   either a spine plot (if all are factors) or a heat map.
#'
#'   In many cases with few points for one of the aggregate plots, a scatter
#'   looks better (options \code{min.points.density}, \code{min.points.violin},
#'   \code{min.points.hex}).
#'
#'   If each factor combination occurs only once in the data set, we resort to
#'   bar plots.
#
#' @section Conditional variables: Conditional variables are represented by either
#'   trying to fit into the same graph using coloring (\code{max.factor.levels.color}),
#'   or by facetting (preferred dimensions \code{facet.num.wrap} (resp.
#'   \code{facet.num.grid}) for one resp. two variables). Numeric vectors are
#'   discretized accordingly. Facets are laid out horizontally or vertically
#'   according to the plot type, up to maximum dimensions of \code{facet.max.rows}
#'   and \code{facet.max.cols}.
#'
#' @section Reordering of factor levels: To better illustrate the relation
#'  between an independent factor variable and a dependent numerical variable
#'  (or an ordered factor), levels are reordered according to the value of the
#'  dependent variable. If no other numeric or ordered variable exists, we
#'  sort by frequency.
#'
#' @section Instance weights: Argument \code{weights} allows to specify weights
#'  or frequency counts for each row of data. All plots and summary statistics
#'  take weights into account when supplied. In scatter and heat maps, weights
#'  are indicated either by a shaded disk with proportional area (default) or by
#'  jittering (option \code{dedupe.scatter}), if the number of duplicated points
#'  exceeds \code{min.points.jitter}. The amount of jittering can be controlled
#'  with \code{jitter.x} and \code{jitter.y}.
#'
#' @section Axis scaling: \code{plotluck} supports logarithmic and log-modulus
#'  axis scaling. log-modulus is considered if values are both positive and
#'  negative; in this case, the transform function is \code{f(x) = sign(x) *
#'  log(1+abs(x))}.
#'
#'  The heuristic to apply scaling is based on the proportion of total display
#'  range that is occupied by the 'core' region of the distribution between the
#'  lower and upper quartiles; namely, the fact whether the transform could
#'  magnify this region by a factor of at least \code{trans.log.thresh}.
#'
#' @section Missing values: By default, missing (\code{NA} or \code{NaN}) values
#'  in factors are are shown as a special factor level code{"?"}. They can be
#'  removed by setting \code{na.rm=TRUE}. Conventionally, missing numeric values
#'  are not shown.
#'
#' @section Sampling: For very large data sets, plots can take a very long time
#'  (or even crash R). \code{plotluck} has a built-in stop-gap: If the data
#'  comprises more than \code{sample.max.rows}, it will be sampled down to that
#'  size (taking into account \code{weights}, if supplied).
#'
#' @section Factor preprocessing:  Character (resp. logical) vectors are converted to
#'  unordered (resp. ordered) factors.
#'
#'  Frequently, when numeric variables have very few values despite sufficient
#'  data size, it helps to treat these values as the levels of a factor; this is
#'  governed by option \code{few.unique.as.factor}.
#'
#'  If an unordered factor has too many levels, plots can get messy. In this
#'  case, only the \code{max.factor.levels} most frequent ones are retained,
#'  while the rest are merged into a default level \code{".other."}.
#'
#' @section Coloring: If \code{color} or \code{fill} aesthetics are used to
#'  distinguish different levels or ranges of a variable, the color scheme adjusts
#'  to the type. Preferably, a sequential (resp. qualitative) palette is chosen
#'  for a numeric/ordered (unordered) factor (\code{palette.brewer.seq},
#'  \code{palette.brewer.qual}); see also \link[RColorBrewer]{RColorBrewer}.
#'
#' @section Generating multiple plots at once: If \code{formula} contains a dot
#'  (\code{"."}) symbol, the function creates a number of 1D or 2D plots by calling
#'  \code{plotluck} repeatedly. As described above, this allows either single
#'  distribution, one-vs-all and all-vs-all variable plots. To save space,
#'  rendering is minimal without axis labels.
#'
#'  In the all-vs-all case, the diagonal contains 1D distribution plots, analogous
#'  to the behavior of the default plot method for data frames, see
#'  \code{\link[graphics]{plot.data.frame}}.
#'
#'  With setting \code{in.grid=FALSE}, plots are produced in a sequence, otherwise
#'  together on one or multiple pages, if necessary (default). Page size is
#'  controlled by \code{multi.max.rows} and  \code{multi.max.cols}.
#'
#'  With \code{entropy.order=TRUE}, plots are sorted by an estimate of
#'  empirical conditional entropy, with the goal of prioritizing the more
#'  predictive variables. Set \code{verbose=TRUE} if you want to see the actual
#'  values. For large data sets the calculation can be time consuming; entropy
#'  calculation can be suppressed by setting \code{multi.entropy.order=FALSE}.
#'
#'  @note The return value is an object of class \code{plotluck_multi}. This
#'   class does not have any functionality; its sole purpose is to make this
#'   function work in the same way as \code{ggplot} and \code{plotluck}, namely,
#'   do the actual drawing if and only if the return value is not assigned.
#'
#' @section Debugging: With the option \code{verbose=TRUE} turned on, the function
#'  will print out information about the chosen and applicable plot types, ordering,
#'  log scaling, etc.
#'
#' @section Column name matching: Variable names can be abbreviated if they match
#'  a column name uniquely by prefix.
#'
#' @section Remarks on supported plot types: By default, \code{plotluck}
#'  uses violin and density plots in place of the more traditional box-and-whisker
#'  plots and histograms; these modern graph types convey the shape of a
#'  distribution better. In the former case, summary statistics like mean and
#'  quantiles are less useful if the distribution is not unimodal; a wrong
#'  choice of the number of bins of a histogram can create misleading artifacts.
#'
#'  Following Cleveland's advice, factors are plotted on the y-axis to make labels
#'  most readable and compact at the same time. This direction can be controlled
#'  using option \code{prefer.factors.vert}.
#'
#'  Due to their well-documented problematic aspects, pie charts and stacked bar
#'  graphs are not supported.
#'
#'  With real-world data (as opposed to smooth mathematical functions),
#'  three-dimensional scatter, surface, or contour plots can often be hard to
#'  read if the shape of the distribution is not suitable, data coverage is
#'  uneven, or if the perspective is not carefully chosen depending on the data.
#'  Since they usually require manual tweaking, we have refrained from
#'  incorporating them.
#'
#' @section Remarks on the use of options: For completeness, we have included the
#'  description of option parameters in the current help page. However, the
#'  tenet of this function is to be usable "out-of-the-box", with no or very
#'  little manual tweaking required. If you find yourself needing to change
#'  option values repeatedly or find the presets to be suboptimal, please
#'  contact the author.
#
#' @section Limitations: \code{plotluck} is designed for generic out-of-the-box
#'  plotting, and not suitable to produce more specialized types of plots that
#'  arise in specific application domains (e.g., association, stem-and-leaf,
#'  star plots, geographic maps, etc). It is restricted to at most three variables.
#'  Parallel plots with variables on different scales (such as time
#'  series of multiple related signals) are not supported.
#'
#' @examples
#' # Single-variable density
#' data(diamonds, package='ggplot2')
#' plotluck(diamonds, price~1)
#' invisible(readline(prompt="Press [enter] to continue"))
#'
#' # Violin plot
#' data(iris)
#' plotluck(iris, Species~Petal.Length)
#' invisible(readline(prompt="Press [enter] to continue"))
#'
#' # Scatter plot
#' data(mpg, package='ggplot2')
#' plotluck(mpg, cty~model)
#' invisible(readline(prompt="Press [enter] to continue"))
#'
#' # Spine plot
#' data(Titanic)
#' plotluck(as.data.frame(Titanic), Survived~Class+Sex, weights=Freq)
#' invisible(readline(prompt="Press [enter] to continue"))
#'
#' # Facetting
#' data(msleep, package='ggplot2')
#' plotluck(msleep, sleep_total~bodywt|vore)
#' invisible(readline(prompt="Press [enter] to continue"))
#'
#' # Heat map
#' plotluck(diamonds, price~cut+color)
#'
#'\donttest{
#' # Multi plots
#
#' # All 1D distributions
#' plotluck(iris, .~1)
#'
#' # 2D dependencies with one fixed variable on vertical axis
#' plotluck(iris, Species~.)
#'}
#' # See also tests/testthat/test_plotluck.R for more examples!
#'
plotluck <- function(data, formula, weights,
                     opts=plotluck.options(),
                     ...) {
  parsed <- parse.formula(formula)
  response <- correct.varnames(parsed[[1]], data)
  indep <- correct.varnames(parsed[[2]], data)
  cond <- correct.varnames(parsed[[3]], data)

  vars <- c(response, indep, cond)
  if (!missing(weights)) {
    weights <- correct.varnames(deparse(substitute(weights)), data)
    vars <- c(vars, weights)
  } else {
    # note: we set missing aesthetics to 'NULL' instead of NULL, because
    # then ggplot interprets it correctly as a constant [e.g., aes(weights='NULL')]
    weights <- 'NULL'
  }
  vars <- unique(vars)

  # validation and preprocessing
  if (!('.' %in% vars)) {
     data <- data[,vars, drop=FALSE]
  }
  data <- sample.data(data, weights, opts$sample.max.rows)
  data <- process.weights(data, weights)
  data <- preprocess.factors(data, setdiff(vars, c(weights, '.')), opts$na.rm)

  info.threshold(opts$verbose && nrow(data) > opts$sample.max.rows,
                 'Data set has %s rows, sampling down', opts$sample.max.rows,
                 nrow(data))

  if ('.' %in% vars) {
    # trellis of plots
    return(plotluck.multi(response, indep, data, w=weights,
                          in.grid=opts$multi.in.grid,
                          max.rows=opts$multi.max.rows,
                          max.cols=opts$multi.max.cols,
                          opts=opts,
                          ...))
  }

  for (v in c(response, indep)) {
    if (is.numeric(data[[v]])) {
      data <- discretize.few.unique(data, v, opts$few.unique.as.factor, opts$verbose)
    }
  }

  # conditionals: discretize if numeric, and order
  # note: to have a single data frame containing all plot and facet variables,
  # we have to preprocess the conditionals beforehand.

  if (length(cond) > 0) {

    # if we have to discretize, we want that many facets
    intervals <- NULL

    if (length(cond) == 1) {
      intervals <- opts$facet.num.wrap
      info.threshold(opts$verbose && length(unique(data[[cond]])) > opts$facet.num.wrap, 'Discretizing %s into intervals', opts$facet.num.wrap, cond)
    } else {
      intervals <- opts$facet.num.grid
      info.threshold(opts$verbose && length(unique(data[[cond[1]]])) > opts$facet.num.grid, 'Discretizing %s into intervals', opts$facet.num.wrap, cond[1])
      info.threshold(opts$verbose && length(unique(data[[cond[2]]])) > opts$facet.num.grid, 'Discretizing %s into intervals', opts$facet.num.wrap, cond[2])
    }

    order.by <- NULL
    if (is.numeric(data[[response]]) || is.ordered(data[[response]])) {
      order.by <- response
    } else if (length(indep) == 1 &&
               (is.numeric(data[[indep]]) || is.ordered(data[[indep]]))) {
      order.by <- indep
    }

    for (i in seq(length(cond))) {
      if (is.numeric(data[[cond[i]]])) {
        data <- discretize(data, cond[i], w=weights, max.breaks=intervals,
                           estimate.breaks=FALSE, method='histogram')
        data[[cond[i]]] <- ordered(data[[cond[i]]])
      } else {
        data <- limit.factor.levels(data, cond[i], w=weights,
                                    max.levels=min(intervals, opts$max.factor.levels))
        if (!is.ordered(data[[cond[i]]])) {
          data <- order.factor.by.value(data, cond[i], order.by, weights, verbose=opts$verbose)
        }
      }
    }
  }

  vars.non.numeric <- cond
  vars.numeric <- NULL
  for (v in c(response, indep)) {
    if (!is.numeric(data[[v]])) {
      vars.non.numeric <- c(vars.non.numeric, v)
    } else {
      vars.numeric <- c(vars.numeric, v)
    }
  }

  # compute joint number of factor combinations, and
  # the maximum/median number of instances in them
  if (length(vars.non.numeric) > 0) {
    t <- table(data[,vars.non.numeric], useNA='ifany')
    num.groups   <- length(t)
    med.points.per.group <- median(t)
    max.points.per.group <- max(t)
  } else {
    num.groups <- 1
    med.points.per.group <- nrow(data)
    max.points.per.group <- nrow(data)
  }

  # precompute medians
  if (length(vars.numeric) == 1) {
     grp.med <- group.central(data, vars.numeric, vars.non.numeric, w=weights, method='median')
     if (length(vars.non.numeric) == 0) {
        # single value
        data$.center. <- grp.med$.center.
     } else {
      data <- merge(data, grp.med)
     }
  }

  # note: factor levels cannot be limited before ordering is known!

  # determine type of plot
  if (length(indep) == 0) {
    # distribution plot
    res <- determine.plot.type.0(data, response, weights,
                                 med.points.per.group, num.groups, opts, ...)
  } else if (length(indep) == 1) {
    res <- determine.plot.type.1(data, response, indep, cond, weights,
                                 med.points.per.group, max.points.per.group,
                                 num.groups, opts, ...)
  } else { # length(indep) == 2
    res <- determine.plot.type.2(data, response, indep[[1]], indep[[2]],
                                 weights, opts, ...)
  }

  p <- res$plot
  data <- res$data
  type.plot <- res$type.plot

  if (length(indep) < 2) {
     # data is possibly modified, we need to pass it in
    p <- add.conditional.layer(p, data, response, indep, cond, type.plot, opts)
  }
  p <- p + theme(panel.background=element_blank(), # get rid of gray background
                 strip.background=element_blank(), # no background for facet labels
                 axis.line.x=element_line(),
                 axis.line.y=element_line(),
                 panel.grid=element_line(color='black'))
  p
}

determine.plot.type.0 <- function(data, response, w, med.points.per.group,
                                  num.groups, opts,
                                  ...) {
  res <- NULL
  type.plot <- NULL
  opts.geom <- NULL
  choices.geom <- NULL
  if (is.numeric(data[[response]])) {
    choices.geom <- c('density', 'histogram', 'scatter', 'auto')
    opts.geom <- match.arg(opts$geom, choices.geom)
    if (opts.geom == 'auto') {
      # enough points for density plot?
      opts.geom <- ifelse(med.points.per.group > opts$min.points.density, 'density', 'scatter')
    }
    if (opts.geom == 'density') {
      type.plot <- 'density'
      res <- gplt.density(data, response, w=w,
                        trans.log.thresh=opts$trans.log.thresh,
                        verbose=opts$verbose,
                        ...)
    } else if (opts.geom == 'histogram') {
      type.plot <- 'histogram'
      res <- gplt.histogram(data, response, w=w,
                          n.breaks=opts$n.breaks.histogram,
                          trans.log.thresh=opts$trans.log.thresh,
                          verbose=opts$verbose,
                          ...)
    } else { # opts.geom == 'scatter'

      # to plot a distribtion as a line plots,
      # make up a constant y coordinate
      indep <- '.y_const'
      data[[indep]] <- 1
      data[[indep]] <- as.factor(data[[indep]])
      type.plot <- 'scatter.num.1'
      if (!opts$prefer.factors.vert) {
         res <- gplt.scatter(data, indep, response, w=w,
                             dedupe.scatter=opts$dedupe.scatter,
                             min.points.jitter=opts$min.points.jitter,
                             trans.log.thresh=opts$trans.log.thresh,
                             max.factor.levels=opts$max.factor.levels,
                             verbose=opts$verbose,
                             ...)
         res$plot <- res$plot +
            theme(axis.ticks.x=element_blank(),
                  axis.text.x=element_blank()) +
            xlab('')
         # known issue: the conditioning layer can add theme_slanted_text_x,
         # in which case spurious '1's appear on the x-axis.
      } else {
         res <- gplt.scatter(data, response, indep, w=w,
                             dedupe.scatter=opts$dedupe.scatter,
                             min.points.jitter=opts$min.points.jitter,
                             trans.log.thresh=opts$trans.log.thresh,
                             max.factor.levels=opts$max.factor.levels,
                             verbose=opts$verbose,
                             ...)
         res$plot <- res$plot +
            theme(axis.ticks.y=element_blank(),
                  axis.text.y=element_blank()) +
            xlab('')
      }
    }
  } else { # !is.numeric(data[[response]])
    choices.geom <- c('dot', 'bar', 'auto')
    opts.geom <- match.arg(opts$geom, choices.geom)
    if (opts.geom == 'auto') {
      opts.geom <- ifelse(num.groups < 6, 'bar', 'dot')
    }
    type.plot <- opts.geom
    res <- gplt.dot(data, response, w=w, vertical=opts$prefer.factors.vert,
                  max.factor.levels=opts$max.factor.levels, geom=opts.geom,
                  verbose=opts$verbose, ...)
  }
  info.options(opts.geom, choices.geom, opts$verbose)
  res$type.plot=type.plot
  res
}

determine.plot.type.1 <- function(data, response, indep, cond,
                                  w, med.points.per.group,
                                  max.points.per.group,
                                  num.groups, opts,
                                  ...) {
  res <- NULL
  type.plot <- NULL
  opts.geom <- NULL
  choices.geom <- NULL
  if (is.numeric(data[[indep]]) && is.numeric(data[[response]])) {
    choices.geom <- c('hex', 'scatter', 'auto')
    opts.geom <- match.arg(opts$geom, choices.geom)
    if (opts.geom == 'auto') {
      # if a lot of points, choose hex
      opts.geom <- ifelse(nrow(data) >= opts$min.points.hex, 'hex', 'scatter')
    }
    # HACK: the smoothing line can only be colored with a default color if it
    # wouldn't be used to distinguish multiple groups.
    fill.smooth <- ifelse(length(cond) == 0, opts$fill.default, 'NULL')
    if (opts.geom == 'hex') {
      type.plot <- 'hex'
      res <- gplt.hex(data, indep, response, w=w, trans.log.thresh=opts$trans.log.thresh,
                    # hex plot already has color, make the smoothing line neutral
                    fill.smooth='grey', ...)
    } else {
      type.plot <- 'scatter.num'
      res <- gplt.scatter(data, indep, response, w,
                        dedupe.scatter=opts$dedupe.scatter,
                        min.points.jitter=opts$min.points.jitter,
                        jitter.x=opts$jitter.x,
                        jitter.y=opts$jitter.y,
                        trans.log.thresh=opts$trans.log.thresh,
                        max.factor.levels=opts$max.factor.levels,
                        fill.smooth=fill.smooth,
                        verbose=opts$verbose,
                        ...)
    }
  } else if (!is.numeric(data[[indep]]) && !is.numeric(data[[response]])) {
    choices.geom <-  c('spine', 'heat', 'auto')
    opts.geom <- match.arg(opts$geom, choices.geom)
    if (opts.geom == 'auto') {
      opts.geom <- ifelse(length(cond) == 0 && # HACK: spine plot cannot be faceted, due to difficulty of implementation
                            length(levels(data[[response]])) <= opts$max.factor.levels.spine.y &&
                            length(levels(data[[indep]])) <= opts$max.factor.levels.spine.x, 'spine', 'heat')
    }

    if (opts.geom == 'spine') {
       type.plot <- 'spine'
       res <- gplt.spine(data, indep, response, w=w,
                       plot.margin.x=opts$spine.plot.margin.x,
                       max.factor.levels=opts$max.factor.levels.spine.x,
                       palette.brewer.seq=opts$palette.brewer.seq,
                       palette.brewer.qual=opts$palette.brewer.qual,
                       verbose=opts$verbose,
                       ...)
    } else {
       type.plot <- 'heat'
       res <- gplt.heat(data, indep, response, z='NULL', w=w,
                        trans.log.thresh=opts$trans.log.thresh,
                        max.factor.levels=opts$max.factor.levels,
                        resolution.heat=opts$resolution.heat,
                        palette.brewer.seq=opts$palette.brewer.seq,
                        palette.brewer.qual=opts$palette.brewer.qual,
                        verbose=opts$verbose,
                        ...)
    }
  } else {
     # mixed types: one factor, one numeric

     if (max.points.per.group <= 1) {
        # special case: single point each - use bar plot
        choices.geom <- c('dot', 'bar', 'auto')
        opts.geom <- match.arg(opts$geom, choices.geom)
        if (opts.geom == 'auto') {
           opts.geom <- ifelse(num.groups < 6, 'bar', 'dot')
        }
        type.plot <- opts.geom
        var.fac <- response
        var.num <- indep
        if (is.numeric(data[[response]])) {
           var.fac <- indep
           var.num <- response
        }
        res <- gplt.dot(data, var.fac, w=var.num, vertical=opts$prefer.factors.vert,
                      max.factor.levels=opts$max.factor.levels, geom=opts.geom,
                      verbose=opts$verbose, ...)
     } else { #

        choices.geom <- c('violin', 'box', 'scatter', 'auto')
        opts.geom <- match.arg(opts$geom, choices.geom)
        var.fac <- response
        var.num <- indep
        # assignment of variables to axes depends only on opts$prefer.factors.vert
        switch <- is.numeric(data[[response]]) != opts$prefer.factors.vert
        if (switch) {
           var.num <- response
           var.fac <- indep
        }
        if (opts.geom == 'auto') {
           # if there are too many violin plots in a horizontal row, they
           # just look like black lines
           opts.geom <- ifelse(med.points.per.group > opts$min.points.violin,
                               ifelse(length(levels(var.fac)) <= opts$max.factor.levels.violin, 'violin', 'box'),
                               'scatter')
        }

        if (opts.geom == 'violin') {
           type.plot <- 'violin'
           res <- gplt.violin(data, var.fac, var.num, w=w,
                            trans.log.thresh=opts$trans.log.thresh,
                            max.factor.levels=opts$max.factor.levels,
                            verbose=opts$verbose,
                            ...)
        } else if (opts.geom == 'box') {
           type.plot <- 'box'
           res <- gplt.box(data, var.fac, var.num, w=w,
                         trans.log.thresh=opts$trans.log.thresh,
                         max.factor.levels=opts$max.factor.levels,
                         verbose=opts$verbose,
                         ...)
        } else { # opts.geom == 'scatter'
           # not enough points for violin or box plot
           type.plot <- 'scatter.mixed'
           res <- gplt.scatter(data, var.fac, var.num, w=w,
                             dedupe.scatter=opts$dedupe.scatter,
                             min.points.jitter=opts$min.points.jitter,
                             trans.log.thresh=opts$trans.log.thresh,
                             max.factor.levels=opts$max.factor.levels,
                             verbose=opts$verbose,
                             ...)
        }
     }
  }
  info.options(opts.geom, choices.geom, opts$verbose)
  res$type.plot=type.plot
  res
}

determine.plot.type.2 <- function(data, response, indep1, indep2, w, opts,
                                  ...) {
  res <- NULL
  type.plot <- NULL
  choices.geom <- c('spine', 'heat', 'auto')
  opts.geom <- match.arg(opts$geom, choices.geom)
  if (opts.geom == 'auto') {

    opts.geom <- ifelse((!is.numeric(data[[response]])) && (!is.numeric(data[[indep1]])) &&
                          (!is.numeric(data[[indep2]])) &&
                          length(levels(data[[response]])) <= opts$max.factor.levels.spine.z &&
                          length(levels(data[[indep2]])) <= opts$max.factor.levels.spine.y &&
                          length(levels(data[[indep1]])) <= opts$max.factor.levels.spine.x, 'spine', 'heat')
  }

  if (opts.geom == 'spine') {
    type.plot <- 'spine3'
    res <- gplt.spine3(data, indep1, indep2, response, w=w,
                     plot.margin.x=opts$spine.plot.margin.x,
                     plot.margin.y=opts$spine.plot.margin.y,
                     max.factor.levels=opts$max.factor.levels.spine.x,
                     palette.brewer.seq=opts$palette.brewer.seq,
                     palette.brewer.qual=opts$palette.brewer.qual,
                     verbose=opts$verbose,
                     ...)
  } else {
    type.plot <- 'heat3'
    res <- gplt.heat(data, indep1, indep2, response, w=w,
                     trans.log.thresh=opts$trans.log.thresh,
                     max.factor.levels=opts$max.factor.levels,
                     resolution.heat=opts$resolution.heat,
                     palette.brewer.seq=opts$palette.brewer.seq,
                     palette.brewer.qual=opts$palette.brewer.qual,
                     verbose=opts$verbose,
                     ...)
  }
  info.options(opts.geom, choices.geom, opts$verbose)
  res$type.plot=type.plot
  res
}


# format factor levels with the first or last value replaced by <var>=<val>
# this helps save space on the grid display
format.facets <- function(data, x, show.var='first') {
  lev <- NULL
  if (is.numeric(data[[x]])) {
    lev <- as.character(sort(unique(data[[x]])))
  } else {
    lev <- levels(data[[x]])
  }
  names(lev) <- lev
  idx <- ifelse(show.var == 'first', 1, length(lev))
  lev[idx] <- sprintf('%s = %s', x, lev[idx])
  lev
}

add.facet.wrap <- function(p, data, cond, preferred.order, opts) {
  nrow <- NULL
  ncol <- NULL
  show.var <- 'first'
  if (preferred.order == 'row') {
    ncol <- opts$facet.max.cols
  } else if (preferred.order == 'col') {
    nrow <- opts$facet.max.rows
    show.var <- 'last'
  }
  facet.labels <- format.facets(data, cond, show.var)

  p <- p + theme_slanted_text_x +  # axis text might overlap in small diagrams
    facet_wrap(cond,
               labeller=as_labeller(facet.labels),
               nrow=nrow, ncol=ncol) +
    theme(panel.border=element_rect(fill=NA))
  # known issue 1: always slanting text is not ideal, but no quick fix
  # known issue 2: for it would be good to order vertical facets like axis
  # (i.e., the lowest value at the bottom), but doesn't seem to be easy to configure.
}


add.facet.grid <- function(p, data, cond) {
  facet.labels.1 <- format.facets(data, cond[1], 'first')
  facet.labels.2 <- format.facets(data, cond[2], 'first')

  p + theme_slanted_text_x + # axis text can easily overlap in small diagrams
    facet_grid(sprintf('%s~%s', cond[1], cond[2]),
               labeller=labeller(.rows=as_labeller(facet.labels.1),
                                 .cols=as_labeller(facet.labels.2))) +
    theme(panel.border=element_rect(fill=NA))
}


# when we get here, color will not be used for further distinction;
# no harm in using previous factor for (redundant) coloring
redundant.factor.color <- function(p, data, response, indep, type.plot, opts) {
  if (type.plot %in% c('spine', 'spine3', 'hex')) {
    # these types are already colored
    return(p)
  }
  fac <- NULL
  if (!is.numeric(data[[response]])) {
    fac <- response
  } else if  (length(indep) > 0 && !is.numeric(data[[indep]])) {
    fac <- indep
  }
  if (!is.null(fac)) {
    p <- add.color.fill(p, data, fac,
                        palette.brewer.seq=opts$palette.brewer.seq,
                        palette.brewer.qual=opts$palette.brewer.qual)  +
      guides(fill='none', color='none')

    return (p)
  } else if (type.plot %in% c('density', 'histogram')) {
    # color 1D density plots with default color
    p + aes(fill='something') +
      aes(color='something') +
      scale_fill_manual(values=opts$fill.default) +
      scale_color_manual(values=opts$fill.default) +
      guides(fill='none', color='none')
  } else {
    p
  }
}

# represent conditional variables either by color or facets
add.conditional.layer <- function(p, data, response, indep, cond, type.plot, opts) {

  if (length(cond) == 0) {
    return(redundant.factor.color(p, data, response, indep, type.plot, opts))
  }

  # heuristic for facet layout
  preferred.order <- 'none'
  if (type.plot %in% c('density', 'histogram')) {
    preferred.order <- 'col'
  } else if (!(type.plot %in% c('hex', 'scatter.num', 'heat'))) { # pure numeric -> no preference
    # mixed num/factor -> opposite to graph layout
    preferred.order <- ifelse(opts$prefer.factors.vert, 'row', 'col')
  }

  if (length(cond) == 1) {
    u <- length(unique(data[[cond]]))

    if (u <= opts$max.factor.levels.color && !(type.plot %in% c('spine', 'heat', 'hex'))) {
      p <- add.color.fill(p, data, cond,
                          palette.brewer.seq=opts$palette.brewer.seq,
                          palette.brewer.qual=opts$palette.brewer.qual)

      aesth.legend <- ifelse(type.plot %in% c('density', 'histogram', 'bar', 'violin'), 'fill', 'color')
      p <- add.color.legend(p, data, cond, aesth.legend)

    } else {
      p <- redundant.factor.color(p, data, response, indep, type.plot, opts)
      add.facet.wrap(p, data, cond, preferred.order, opts)
    }
  } else { #length(cond) == 2
    #u1 <- length(unique(data[[cond[1]]]))
    #u2 <- length(unique(data[[cond[2]]]))
    # if (min(u1,u2) <= opts$max.factor.levels.color) {
    #    if (u1 < u2) {
    #       p + aes_string(fill=cond[1]) + aes_string(color=cond[1]) +
    #          add.facet.wrap(p, data, cond[2], preferred.order, opts)
    #    } else {
    #       p + aes_string(fill=cond[2]) + aes_string(color=cond[2]) +
    #          add.facet.wrap(p, data, cond[1], preferred.order, opts)
    #    }
    # } else {
    # example: plotluck(diamonds, price~1|cut+clarity)
    p <- redundant.factor.color(p, data, response, indep, type.plot, opts)
    add.facet.grid(p, data, cond)
    #}
  }
}


# plot multiple graphs in a grid layout, possibly over multiple pages
# suppress.*lab == 'all': no axis labels for this dimension
# suppress.*lab == 'margin' means:
# no x-axis labels except at the bottom;
# no y-axis labels except in the leftmost plots
mplot <- function(plots, rows=ceiling(sqrt(length(plots))),
                  cols=ceiling(sqrt(length(plots))),
                  suppress.xlab=FALSE, suppress.ylab=FALSE) {

  num.plots <- length(plots)
  if (cols > num.plots) {
    cols <- num.plots
    rows <- 1
  }

  size.page <- rows * cols

  layout <- matrix(seq( size.page),
                   ncol=cols, nrow=rows, byrow=TRUE)

  for (p in 1:ceiling(num.plots/size.page)) {

    grid::grid.newpage()
    grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))

    # make each plot, in the correct location
    for (i in 1:min(size.page, num.plots-(p-1)*size.page)) {
      # get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout==i, arr.ind=TRUE))
      plot.idx <- (p-1)*size.page+i
      plot.current <- plots[[plot.idx]]

      if (suppress.xlab == 'all' ||
          (suppress.xlab == 'margin' && matchidx$row != rows
           && plot.idx + cols <= num.plots)) { # last complete row

        # note: the following doesn't work if a coord_flip() was applied:
        # plot.current <- plot.current + xlab('')
        # axis labels move with the coordinates, themes don't!
        plot.current <- plot.current + theme(axis.title.x=element_blank())
      }
      if (suppress.ylab == 'all' ||
          (suppress.ylab == 'margin' && matchidx$col != 1)) {
        plot.current <- plot.current + theme(axis.title.y=element_blank())
      }

      # do not fail if a single subplot isn't well-defined
      tryCatch(
        print(plot.current, vp = grid::viewport(layout.pos.row = matchidx$row,
                                                layout.pos.col = matchidx$col)),
        error = function(e) print(gplt.blank(e), vp = grid::viewport(layout.pos.row = matchidx$row,
                                                                     layout.pos.col = matchidx$col)))
    }
  }
}

plotluck.multi <- function(response, indep, data, w='NULL',
                           in.grid=TRUE,
                           max.rows=10, max.cols=10,
                           opts=plotluck.options(),
                           ...) {

  main <- deparse(substitute(data))

  vars.x <- NULL
  vrs.y <- NULL
  if (response == '.') {
    vars.y <- names(data)
    if (length(indep) == 0) {
      vars.x <- '1'
      if (opts$verbose) {
        cat('Plotting distributions for all variables\n')
      }
    } else if (indep == '.') {
      vars.x <- names(data)
      if (opts$verbose) {
        cat('Plotting each variable against each other\n')
      }
    }  else {
      vars.x <- indep
      if (opts$verbose) {
        cat(sprintf('Plotting each variable against %s\n', vars.x))
      }
    }
  } else { # response != '.'
    vars.y <- response
    if(indep != '.') {
      stop('Error: invalid formula')
    }
    if (opts$verbose) {
      cat(sprintf('Plotting %s against each variable\n', vars.y))
    }
    vars.x <- names(data)
  }

  # order variables by conditional entropy
  cond.ent <- NULL
  if (opts$multi.entropy.order) {
    if (length(vars.y) > 1 && length(vars.x) == 1 && vars.x != '1') {
      cond.ent <- suppressWarnings(cond.entropy.data(data, given=vars.x, w=w))
      cond.ent <- cond.ent[,vars.x]
      vars.y <- vars.y[order(cond.ent)]
    } else if (length(vars.y) == 1 && length(vars.x) > 1) {
      cond.ent <- suppressWarnings(cond.entropy.data(data, target=vars.y, w=w))
      cond.ent <- cond.ent[vars.y,]
      vars.x <- vars.x[order(cond.ent)]
    } else if (length(vars.y) > 1 && length(vars.x) > 1) {
      cond.ent <- suppressWarnings(cond.entropy.data(data, w=w))
      cond.ent <- (apply(cond.ent, 1, mean) + apply(cond.ent, 1, mean))/2
      vars.x <- vars.x[order(cond.ent)]
      vars.y <- vars.x
    }
    if (opts$verbose && !is.null(cond.ent)) {
      cat('Ordering variables according to conditional entropy:\n')
      cond.ent<-sort(cond.ent)
      out<-data.frame(var=names(cond.ent),cond.ent=cond.ent)
      print(out, row.names=FALSE)
    }
  }

  combi <- expand.grid(seq(length(vars.x)),
                       seq(length(vars.y)), stringsAsFactors=FALSE)
  names(combi) = c('grid.x', 'grid.y')
  combi$x <- vars.x[combi$grid.x]
  combi$y <- vars.y[combi$grid.y]
  combi$fac.x <- sapply(combi$x, function(v) !is.numeric(data[[v]]))
  combi$fac.y <- sapply(combi$y, function(v) !is.numeric(data[[v]]))

  # try to make a square layout
  cols <- ceiling(sqrt(nrow(combi)))
  rows <- ceiling(nrow(combi)/cols)

  # figure out which plots are not at the margins, and
  # don't show axis labels for them if redundant
  suppress.xlab <- FALSE
  suppress.ylab <- FALSE

  theme.multi <- ''

  is.square <- TRUE

  if (!in.grid && opts$verbose) {
    cat('Not plotting all graphs on one page because multi.in.grid=FALSE\n')
  }

  if (in.grid) {

    # grid of small 'sparklines':
    # remove all annotation text
    theme.multi <- " + theme_minimal() +
         theme(legend.position='none',
         axis.ticks.x=element_blank(),
         axis.text.x=element_blank(),
         axis.ticks.y=element_blank(),
         axis.text.y=element_blank(),
         strip.background=element_blank(),
         strip.text=element_blank(),
         panel.grid.major=element_blank(),
         panel.grid.minor=element_blank(),
         axis.title.x=element_text(size=rel(0.7)),
         axis.title.y=element_text(size=rel(0.7)))"

    theme.multi <- gsub('\\s','', theme.multi)

    # area scale slow and hardly visible
    opts$dedupe.scatter <- 'jitter'

    # use the limited space to make subgraphs
    # relatively rectangular
    opts$facet.max.rows <- NULL
    opts$facet.max.cols <- NULL

    # do all plots fit on a single page?
    if (cols > max.cols || cols > max.rows) {
      is.square <- FALSE
      cols <- max.cols
      rows <- min(max.rows, ceiling(nrow(combi)/cols))
      if (opts$verbose) {
        cat(sprintf('Cannot fit all %d variables on one page; multi.max.rows = %d, multi.max.cols= %d\n',
                    max.rows, max.cols))
      }
    }

    if (length(vars.x) > 1 && length(vars.y) > 1 && is.square) {
      # if the full cross product fits on one page,
      # write the axis labels only on the margins
      suppress.xlab <- 'margin'
      suppress.ylab <- 'margin'
    }
    if (length(vars.x) == 1) {
      # do not repeat the axis label for the constant dimension
      if (vars.x != '1') {
        main <- vars.x
        suppress.xlab <- 'margin'
      }
      else {
        # distribution/density plot
        suppress.ylab <- 'all'
      }
    }
    if (length(vars.y) == 1) {
      if (vars.y != '1') {
        main <- vars.y
        suppress.ylab <- 'margin'
      } else {
        suppress.ylab <- 'all'
      }
    }

    # For mixed factor/num combinations, plotluck() assigns the axes only
    # depending on opts$prefer.factors.vert. In the output lattice for
    # plotluck.multi, we need to control the direction (and show opposites
    # at mirror position)
    combi$opts <- 'prefer.factors.vert=TRUE'
    combi$labs <- ''
    if (length(vars.x) > 1 && length(vars.y) == 1) {
       if (is.numeric(data[[vars.y]])) {
          combi$opts <-
             'prefer.factors.vert=FALSE'
       }
    } else  if (length(vars.x) == 1 && length(vars.y) > 1) {
       if (!is.numeric(data[[vars.x]])) {
          combi$opts <-
             'prefer.factors.vert=FALSE'
       }
    } else {  # length(vars.x) > 1 && length(vars.y) > 1
       below.diag <- combi$grid.x > combi$grid.y
       combi$opts[below.diag & combi$fac.x] <-
          'prefer.factors.vert=FALSE'
       combi$opts[(!below.diag) & (!combi$fac.y)] <-
          'prefer.factors.vert=FALSE'

       # label the corner elements with the variable name, instead
       # of 'density' or 'count' (diagonal contains distribution plots)
       idx.id <- combi$x == combi$y
       combi$labs[idx.id] <-
          sprintf('+xlab("%s")+ylab("%s")', combi$x[idx.id],
                  combi$y[idx.id])
    }
  }

  combi$opts <- sprintf('prefer.factors.vert=%s', opts$prefer.factors.vert)
  combi$labs <- ''
  if (length(vars.x) > 1 && length(vars.y) > 1 &&
      ((!in.grid) || (!is.square))) {
    # plots with the axis transposed are redundant;
    # if not part of grid, no need to show them
    combi <- combi[combi$grid.x >= combi$grid.y,]

  } else {

     # For mixed factor/num combinations, plotluck() assigns the axes only
     # depending on opts$prefer.factors.vert. In the output lattice for
     # plotluck.multi, we need to control the direction (and show opposites
     # at mirror position)
     combi$opts <- 'prefer.factors.vert=TRUE'
     if (length(vars.x) > 1 && length(vars.y) == 1) {
        if (is.numeric(data[[vars.y]])) {
           combi$opts <-
              'prefer.factors.vert=FALSE'
        }
     } else  if (length(vars.x) == 1 && length(vars.y) > 1) {
        if (!is.numeric(data[[vars.x]])) {
           combi$opts <-
              'prefer.factors.vert=FALSE'
        }
     } else {  # length(vars.x) > 1 && length(vars.y) > 1
        below.diag <- combi$grid.x > combi$grid.y
        combi$opts[below.diag & combi$fac.x] <-
           'prefer.factors.vert=FALSE'
        combi$opts[(!below.diag) & (!combi$fac.y)] <-
           'prefer.factors.vert=FALSE'

        # label the corner elements with the variable name, instead
        # of 'density' or 'count' (diagonal contains distribution plots)
        idx.id <- combi$x == combi$y
        combi$labs[idx.id] <-
           sprintf('+xlab("%s")+ylab("%s")', combi$x[idx.id],
                   combi$y[idx.id])
     }
  }

  # plot single-variable distributions for x~x
  combi$x[combi$x == combi$y] <- '1'

  if (w != 'NULL')
  {
    call.strs <- sprintf('plotluck(data, %s~%s, w=%s, opts=plotluck.options(opts,%s), ...)%s%s',
                         combi$y, combi$x, w, combi$opts, combi$labs, theme.multi)
  } else {
    call.strs <- sprintf('plotluck(data, %s~%s, opts=plotluck.options(opts,%s), ...)%s%s',
                         combi$y, combi$x, combi$opts, combi$labs, theme.multi)
  }

  call.str <- sprintf('list(%s)', paste(call.strs, collapse=','))

  # don't show verbose messages for all individual plots
  opts$verbose <- FALSE

  structure(list(plots=eval(parse(text=call.str)),
                 in.grid=in.grid,
                 cols=cols,
                 rows=rows,
                 suppress.xlab=suppress.xlab,
                 suppress.ylab=suppress.ylab,
                 main=main),
            class='plotluck_multi')
}


# auxiliary class to achieve consistent behavior of plotluck.multi with ggplot
# and plotluck: draw the plot if an only if the return value is not assigned.
#' @export
print.plotluck_multi <- function(x, ...) {
  if (x$in.grid) {
    mplot(x$plots, rows=x$rows, cols=x$cols,
          suppress.xlab=x$suppress.xlab, suppress.ylab=x$suppress.ylab)
  } else {
    lapply(x$plots, print)
  }
}


#' Run \code{plotluck} for a randomly generated formula.
#'
#' \code{sample.plotluck} samples a formula as follows:
#' \itemize{
#' \item Uniformly draw the number of variables (1-3).
#' \item For each variable, uniformly choose one of the existing variable types from the data set (numeric, ordered or unordered factor).
#' \item Uniformly select one of the data frame columns of that type.
#'}
#'
#' @param data a data frame
#' @param ... additional parameters to be passed to \code{plotluck}, such as
#'       \code{weights} and \code{opts}.
#' @return a ggplot2 object.
#'
#' @seealso \code{\link{plotluck}}
#' @export
#' @examples
#'set.seed(42)
#' data(iris)
#' sample.plotluck(iris)
sample.plotluck <- function(data, ...) {
  idx.ord <- which(sapply(data, is.ordered))
  idx.fac <- which(sapply(data, function(x) {is.factor(x) && (!is.ordered(x))}))
  idx.num <- setdiff(setdiff(seq(length(data)),idx.fac), idx.ord)
  w <- match.call()$weights
  if (!is.null(w)) {
    idx.w <- which(names(data) == w)
    idx.num <- setdiff(idx.num, idx.w)
  }

  types <- list()
  if (length(idx.fac) > 0) {
    types[[length(types)+1]] <- idx.fac
  }
  if (length(idx.ord) > 0) {
    types[[length(types)+1]] <- idx.ord
  }
  if (length(idx.num) > 0) {
    types[[length(types)+1]] <- idx.num
  }
  form.length <- sample(c(1,2,3),1)
  cond.pos <- sample(seq(form.length), 1) + 0.5
  form <- NULL
  op <- NULL
  for (pos in seq(form.length)) {
    form <- c(form, op)
    if (pos > cond.pos) {
      cond.pos <- 1000
      op <- '|'
      if (pos == 1) {
        form <- c(form, '~', '1')
      }
    } else if (pos == 1) {
      op <- '~'
    } else {
      op <- '+'
    }
    while (TRUE) {
      var.type <- sample(seq(length(types)), 1)
      var <- sample(seq(length(types[[var.type]])),1)
      var <- names(data)[(types[[var.type]][var])]
      if (!(var %in% form)) {
        # avoid duplication of variables
        break
      }
    }
    form <- c(form, var)
  }
  if (length(form) == 1) {
    form <- c(form, '~', '1')
  }
  form <-do.call(paste, as.list(form))

  # collect extra arguments
  arg <- sapply(match.call()[seq(2,length(match.call()))], deparse)
  if (length(arg) == 1) {
     s <- sprintf('plotluck(%s, %s)\n', as.character(match.call()[2]), form)
  } else {
     s <- list()
     for (i in seq(2, length(arg))) {
        s[[length(s)+1]] <- sprintf('%s = %s', names(arg)[i], arg[i])
     }
     s <- do.call(paste, c(s, sep=', '))
     s <- sprintf('plotluck(%s, %s, %s)\n', as.character(match.call()[2]), form, s)
  }
  cat(s)
  plotluck(data, as.formula(form), ...) + labs(title=s) + theme(plot.title=element_text(size=10))
}

# same as sample.plotluck, but can be called with a list of options
# only used for testing/debugging!

# e.g.
# opts.list<-list()
# opts.list[[1]]<-plotluck.options(verbose=T)
# opts.list[[2]]<-plotluck.options(verbose=T,prefer.factors.vert=F)
# opts.list[[3]]<-plotluck.options(verbose=T,prefer.factors.vert=F,max.factor.levels.color=100)
# opts.list[[4]]<-plotluck.options(verbose=T,prefer.factors.vert=T,max.factor.levels.color=100,dedupe.scatter='jitter',min.points.hex=10000,min.points.density=1E20,min.points.violin=1E20)
# opts.list[[5]]<-plotluck.options(verbose=T,prefer.factors.vert=F,max.factor.levels=3)


sample.plotluck.testopts <- function(data, opts.list, ...) {
   idx.ord <- which(sapply(data, is.ordered))
   idx.fac <- which(sapply(data, function(x) {is.factor(x) && (!is.ordered(x))}))
   idx.num <- setdiff(setdiff(seq(length(data)),idx.fac), idx.ord)
   w <- match.call()$weights
   if (!is.null(w)) {
      idx.w <- which(names(data) == w)
      idx.num <- setdiff(idx.num, idx.w)
   }

   types <- list()
   if (length(idx.fac) > 0) {
      types[[length(types)+1]] <- idx.fac
   }
   if (length(idx.ord) > 0) {
      types[[length(types)+1]] <- idx.ord
   }
   if (length(idx.num) > 0) {
      types[[length(types)+1]] <- idx.num
   }
   form.length <- sample(c(1,2,3),1)
   cond.pos <- sample(seq(form.length), 1) + 0.5
   form <- NULL
   op <- NULL
   for (pos in seq(form.length)) {
      form <- c(form, op)
      if (pos > cond.pos) {
         cond.pos <- 1000
         op <- '|'
         if (pos == 1) {
            form <- c(form, '~', '1')
         }
      } else if (pos == 1) {
         op <- '~'
      } else {
         op <- '+'
      }
      while (TRUE) {
         var.type <- sample(seq(length(types)), 1)
         var <- sample(seq(length(types[[var.type]])),1)
         var <- names(data)[(types[[var.type]][var])]
         if (!(var %in% form)) {
            # avoid duplication of variables
            break
         }
      }
      form <- c(form, var)
   }
   if (length(form) == 1) {
      form <- c(form, '~', '1')
   }
   form <-do.call(paste, as.list(form))

   # collect extra arguments
   arg <- sapply(match.call()[seq(2,length(match.call()))], deparse)
   if (length(arg) <= 2) {
      s <- sprintf('plotluck(%s, %s)', as.character(match.call()[2]), form)
   } else {
      s <- list()
      for (i in seq(3, length(arg))) {
         s[[length(s)+1]] <- sprintf('%s = %s', names(arg)[i], arg[i])
      }
      s <- do.call(paste, c(s, sep=', '))
      s <- sprintf('plotluck(%s, %s, %s)', as.character(match.call()[2]), form, s)
   }

   for (i in seq(length(opts.list))) {
      s.opt <- paste(s, sprintf(' [OPTION %d]', i), sep='')
      cat(s.opt)
      print(plotluck(data, as.formula(form), opts=opts.list[[i]],...) + labs(title=s.opt) + theme(plot.title=element_text(size=10)))
   }
}
stefan-schroedl/plotluck documentation built on Jan. 27, 2023, 3:40 a.m.