R/dispatch.R
In AssocBin: Measuring Association with Recursive Binning

Documented in inDep plot.inDep summary.inDep

##' @title Test pairwise variable independence
##' @description This is a high-level function which accepts a data
##' set, stop criteria, and split functions for continuous variables
##' and then applies a chi-square test for independence to bins
##' generated by recursively binning the ranks of continuous variables
##' or implied by the combinations of levels of categorical variables.
##' @details The output of `inDep` is a list, the first element of
##' which is a list of lists, each of which records the details of the
##' binning of a particular pair of variables
##' @param data `data.frame` or object coercible to a `data.frame`
##' @param stopCriteria output of `makeCriteria` providing criteria
##' used to stop binning to be passed to binning functions
##' @param catCon splitting function to apply to pairs of one
##' cateogorical and one continuous variable
##' @param conCon splitting function to apply to pairs of continuous
##' variables
##' @param ptype one of 'simple', 'conservative', 'gamma', or
##' 'fitted'; the type of p-values to compute for continuous pairs
##' and pairs of mixed type. 'Conservative' assumes a chi-square
##' distribution to the statistic with highly conservative degrees
##' of freedom that are based on continuous uniform margins and so
##' do not account for the constraints introduced by the ranks.
##' 'Simple' assumes a chi-square distribution but uses
##' contingency-table inspired degrees of freedom which can be
##' slightly anti-conservative in the case of continuous pairs but
##' work well for continuous/categorical comparisons. 'Gamma'
##' assumes a gamma distribution on the resulting statistics with
##' parameters fit from the same empirical investigation. 'Fitted'
##' mixes the gamma approach and the chi-squared approach these by
##' applying 'gamma' to continuous-categorical comparisons and a
##' least squares fitted version of the simple approximation to
##' continuous-continuous comparisons. For all
##' categorical-categorical comparisons the contingency table
##' degrees of freedom are use in a chi-squared distribution. More
##' details can be found in the associated paper.
##' @param dropPoints logical; should returned bins contain points?
##' @return An `inDep` object, with slots `data`, `types`, `pairs`,
##' `binnings`, `residuals`, `statistics`, `K`, `logps`, and
##' `pvalues` that stores the results of using recursive binning with
##' the specified splitting logic to test independence on a data set.
##' `data` gives the name of the data object in the global environment
##' which was split, `types` is a character vector giving the data
##' types of each pair, `pairs` is a character vector of the variable
##' names of each pair, `binnings` is a list of lists where each list
##' is the binning fir to the corresponding pair by the recursive
##' binning algorithm, `residuals` is list of numeric vectors giving
##' the residual for each bin of each pairwise binning, `statistics`
##' is a numeric vector giving the chi-squared statistic for each
##' binning, `K` is a numeric vector giving the number of bins in each
##' binning, `logps` gives the natural logarithm of the statistic's
##' p-value, and finally `pvalues` is a numeric vector of p-values
##' for `statistics` based on the specified p-value computation, which
##' defaults to 'simple'.  Internally, the
##' p-values are computed on the log scale to better distinguish
##' between strongly dependent pairs and the `pvalues` returned are
##' computed by calling `exp(logps)`. The order of all returned values
##' is by increasing `logps`.
##' @author Chris Salahub
inDep <- function(data, stopCriteria,
                  catCon = uniRIntSplit,
                  conCon = rIntSplit,
                  ptype = c('simple',
                            'conservative',
                            'gamma',
                            'best'),
                  dropPoints = FALSE) {
    ## argument checking
    datName <- deparse1(substitute(data))
    if (!is.data.frame(data)) stop("`data` must be a data frame")
    if (missing(stopCriteria)) {
        stopCriteria <- "depth >= 6 | n < 1 | expn <= 10 | stopped"
    }

    ## function definition
    stopFn <- function(bns) stopper(bns, stopCriteria)
    
    ## pre-processing
    vars <- names(data) # get all variable names
    types <- sapply(data, class) # get all classes
    chars <- types == "character"
    ints <- types == "integer"
    logs <- types == "logical"
    if (any(ints)) { # regularize type names
        data[ints] <- lapply(data[ints], as.numeric)
        types[ints] <- "numeric"
    }
    if (any(chars)) {
        data[chars] <- lapply(data[chars], as.factor)
        types[chars] <- "factor"
    }
    if (any(logs)) {
        data[logs] <- lapply(data[logs],
                             function(x) as.factor(as.numeric(x)))
        types[logs] <- "factor"
    }
    combs <- combn(ncol(data), 2) # get all pairs
    scndRwInds <- types[combs[2, ]] == "factor"
    scndRwFs <- combs[2, scndRwInds]
    combs[2, scndRwInds] <- combs[1, scndRwInds]
    combs[1, scndRwInds] <- scndRwFs # factors always come first
    typecomb <- apply(combs, 2,
                      function(x) paste(types[x], collapse = ":"))
    nlev <- sapply(data, function(var) length(levels(var)))

    ## get ranks
    ranks <- data
    ranks[, types=="numeric"] <- apply(data[types=="numeric"],
                                       2, rank,
                                       ties.method = "random")
    
    ## pairwise functions: random squarified splitting
    bns <- vector(mode = "list", length(typecomb))
    names(bns) <- apply(combs, 2,
                        function(x) paste(vars[x],
                                          collapse = ":"))
    facFac <- which(typecomb == "factor:factor")
    facNum <- which(typecomb == "factor:numeric")
    numNum <- which(typecomb == "numeric:numeric")
    for (ii in facFac) {
        bns[[ii]] <- catBinner(x = ranks[, combs[1, ii]],
                               y = ranks[, combs[2, ii]],
                               dropPoints = dropPoints)
    }
    for (ii in facNum) {
        bns[[ii]] <- uniBinner(x = ranks[, combs[1, ii]],
                               y = ranks[, combs[2, ii]],
                               stopper = stopFn,
                               splitter = catCon,
                               dropPoints = dropPoints)
    }
    for (ii in numNum) {
        bns[[ii]] <- binner(x = ranks[, combs[1, ii]],
                            y = ranks[, combs[2, ii]],
                            stopper = stopFn,
                            splitter = conCon,
                            dropPoints = dropPoints)
    }

    ## compute statistic values and p-values
    binStats <- lapply(bns, binChi)
    obStats <- sapply(binStats, function(x) x$stat)
    K <- sapply(binStats, function(x) x$nbins)
    ptype <- match.arg(ptype)
    pvals <- simpleDfs <- numeric(length(typecomb))
    ## compute the simple approximate dfs
    simpleDfs[facFac] <- K[facFac] - nlev[combs[1, facFac]] -
        nlev[combs[2, facFac]] + 1
    simpleDfs[facNum] <- facNumSimpleDf(K[facNum],
                                        nlev[combs[1, facNum]])
    simpleDfs[numNum] <- numNumSimpleDf(K[numNum])
    ## compute the pvalues
    pvals[facFac] <- pchisq(obStats[facFac],
                            df = K[facFac] - nlev[combs[1, facFac]] -
                                nlev[combs[2, facFac]] + 1,
                            lower.tail = FALSE, log.p = TRUE)
    if (ptype == 'simple') {
        pvals[facNum] <- pchisq(obStats[facNum],
                                df = facNumSimpleDf(K[facNum],
                                                    nlev[combs[1, facNum]]),
                                lower.tail = FALSE, log.p = TRUE)
        pvals[numNum] <- pchisq(obStats[numNum],
                                df = numNumSimpleDf(K[numNum]),
                                lower.tail = FALSE, log.p = TRUE)
    } else if (ptype == 'conservative') {
        pvals[facNum] <- pchisq(obStats[facNum],
                                df = K[facNum] - nlev[combs[1, facNum]],
                                lower.tail = FALSE, log.p = TRUE)
        pvals[numNum] <- pchisq(obStats[facNum],
                                df = K[facNum] - 1,
                                lower.tail = FALSE, log.p = TRUE)
    } else if (ptype == 'gamma') {
        pvals[facNum] <- pgamma(obStats[facNum],
                                shape = facNumGammaShape(K[facNum],
                                                         nlev[combs[1, facNum]]),
                                scale = facNumGammaScale(K[facNum],
                                                         nlev[combs[1, facNum]]),
                                lower.tail = FALSE, log.p = TRUE)
        pvals[numNum] <- pgamma(obStats[numNum],
                                shape = numNumGammaShape(K[numNum]),
                                scale = numNumGammaScale(K[numNum]),
                                lower.tail = FALSE, log.p = TRUE)
    } else if (ptype == 'best') {
        pvals[facNum] <- pgamma(obStats[facNum],
                                shape = facNumGammaShape(K[facNum],
                                                         nlev[combs[1, facNum]]),
                                scale = facNumGammaScale(K[facNum],
                                                         nlev[combs[1, facNum]]),
                                lower.tail = FALSE, log.p = TRUE)
        pvals[numNum] <- pchisq(obStats[numNum],
                                df = numNumFittedDf(K[numNum]),
                                lower.tail = FALSE, log.p = TRUE)
    } else {
        stop("ptype must be one of ('simple', 'conservative', 'gamma', or 'best')")
    }
    pord <- order(pvals)
    
    ## return everything
    inDep <- list(data = datName,
                  types = typecomb[pord],
                  pairs = names(bns)[pord],
                  binnings = bns[pord],
                  simpleDfs = simpleDfs[pord],
                  Ks = K[pord],
                  residuals = sapply(binStats[pord],
                                     function(x) x$residuals),
                  statistics = sapply(binStats[pord],
                                      function(x) x$stat),
                  logps = pvals[pord],
                  pvalues = exp(pvals)[pord])
    class(inDep) <- "inDep"
    inDep
}

##' Methods
##' @title S3 methods for `inDep`
##' @description The `summary` and `plot` methods outlined here
##' support the quick description of an `inDep` object.
##' @details For each index in `which`, this function produces a row
##' of three plots. The first plot is the raw data, the second plot
##' is the ranks of the data, and the final plot is the binning
##' contained in the `inDep` object.
##' @param object `inDep` object to summarize
##' @param x object with class `inDep`
##' @param ... additional arguments to pass on to the method
##' @param which indices of binnings to display from `x`, where
##' binnings are ordered by increasing p-value
##' @param border colour of borders to be drawn on the binnings
##' @param buffer relative width of empty space separating categories
##' @param dropPoints logical: should points be dropped for the plot
##' of the binnings?
##' @param colrng colour range to be passed to `residualFill` for
##' plotting
##' @param nbr number of breaks to be passed to `residualFill` for
##' plotting
##' @param pch point type passed to plot
##' @return Nothing for the plot method, while summary quietly returns
##' a summary of `inDep`
##' @author Chris Salahub
##' @describeIn methods Summary method for `inDep`
summary.inDep <- function(object, ...) {
    dat <- object$data
    nprs <- length(object$pairs)
    typtab <- table(object$types)
    pvals <- quantile(object$pvalues, probs = seq(0, 1, 0.1))
    sig5 <- sum(object$pvalues < 0.05)
    sig1 <- sum(object$pvalues < 0.01)
    cat(paste0("All ", nprs, " pairs in ", dat,
          " recursively binned with type distribution: \n"))
    print(typtab)
    cat(paste0(sig5, " pairs are significant at 5% and ",
               sig1, " pairs are significant at 1%\n"))
    invisible(list(data = dat, npair = nprs,
                   typeTable = typtab,
                   pDeciles = pvals))
}
##' @describeIn methods Plot method for `inDep`
plot.inDep <- function(x, ..., which = 1:5, border = "black",
                       buffer = 0.01, dropPoints = FALSE,
                       colrng = c("steelblue", "white", "firebrick"),
                       nbr = NA, pch = ".") {
    dat <- get(x$data)
    prs <- strsplit(x$pairs[which], split = "\\:")
    typs <- strsplit(x$types[which], split = "\\:")
    oldPar <- par(mfrow = c(length(which), 3),
                  mar = c(0.5, 1.1, 2.1, 0.1))
    #maxRes <- max(sapply(x$residuals, max))
    for (ii in seq_along(prs)) {
        x1 <- dat[, prs[[ii]][1]] # get pair
        y <- dat[, prs[[ii]][2]]
        scl <- length(x1)*buffer
        if (typs[[ii]][1] == "factor") { # jitter factors
            x1 <- as.factor(x1) # ensure type
            xtbl <- table(x1)
            xbr <- c(0, xtbl)
            xa <- cumsum(xbr[-length(xbr)])/2 + cumsum(xbr[-1])/2
            ## handle small categories thinner than scl
            xmns <- pmin(-xtbl[as.numeric(x1)]/2 + scl, 0)
            xmxs <- pmax(xtbl[as.numeric(x1)]/2 - scl, 0)
            pltx <- xa[as.numeric(x1)] +
                runif(length(x1), min = xmns, max = xmxs)
        } else {
            pltx <- x1
            xbr <- NA
        }
        if (typs[[ii]][2] == "factor") {
            y <- as.factor(y)
            ytbl <- table(y)
            ybr <- c(0, ytbl)
            ya <- cumsum(ybr[-length(ybr)])/2 + cumsum(ybr[-1])/2
            ymns <- pmin(-ytbl[as.numeric(y)]/2 + scl, 0)
            ymxs <- pmax(ytbl[as.numeric(y)]/2 - scl, 0)
            plty <- ya[as.numeric(y)] +
                runif(length(y), min = ymns, max = ymxs)
        } else {
            plty <- y
            ybr <- NA
        }
        ## create three plot areas
        plot(x = pltx, y = plty, xaxt = "n", yaxt = "n", pch = pch,
             ...)
        abline(h = cumsum(ybr), v = cumsum(xbr), lty = 2)
        mtext("Raw", side = 3, line = 0, cex = 0.6)
        plot(x = rank(pltx, ties.method = "random"),
             y = rank(plty, ties.method = "random"),
             xaxt = "n", yaxt = "n", pch = pch, ...)
        abline(h = cumsum(ybr), v = cumsum(xbr), lty = 2)
        mtext("Ranks", side = 3, line = 0, cex = 0.6)
        mtext(side = 3, line = 1, cex = 0.8,
              text = bquote("Pair:"~.(paste(prs[[ii]],
                                            collapse = "|"))*
                                ","~log[10]*p~"="~.(round(x$logps[which[ii]]/
                                                         log(10),
                                                         1))))
        if (dropPoints) thirdPch = NA else thirdPch = pch
        plotBinning(x$binnings[[which[ii]]], factor = 0.9,
                    xlab = "", ylab = "", border = border,
                    fill = importanceFill(x$binnings[[which[ii]]],
                                          colrng = colrng, nbr = nbr),
                    suppressLabs = TRUE, pch = thirdPch, ...)
        mtext("Bins", side = 3, line = 0, cex = 0.6)
    }
    par(oldPar)
}