Nothing
##' @title Test pairwise variable independence
##' @description This is a high-level function which accepts a data
##' set, stop criteria, and split functions for continuous variables
##' and then applies a chi-square test for independence to bins
##' generated by recursively binning the ranks of continuous variables
##' or implied by the combinations of levels of categorical variables.
##' @details The output of `inDep` is a list, the first element of
##' which is a list of lists, each of which records the details of the
##' binning of a particular pair of variables
##' @param data `data.frame` or object coercible to a `data.frame`
##' @param stopCriteria output of `makeCriteria` providing criteria
##' used to stop binning to be passed to binning functions
##' @param catCon splitting function to apply to pairs of one
##' cateogorical and one continuous variable
##' @param conCon splitting function to apply to pairs of continuous
##' variables
##' @param ptype one of 'simple', 'conservative', 'gamma', or
##' 'fitted'; the type of p-values to compute for continuous pairs
##' and pairs of mixed type. 'Conservative' assumes a chi-square
##' distribution to the statistic with highly conservative degrees
##' of freedom that are based on continuous uniform margins and so
##' do not account for the constraints introduced by the ranks.
##' 'Simple' assumes a chi-square distribution but uses
##' contingency-table inspired degrees of freedom which can be
##' slightly anti-conservative in the case of continuous pairs but
##' work well for continuous/categorical comparisons. 'Gamma'
##' assumes a gamma distribution on the resulting statistics with
##' parameters fit from the same empirical investigation. 'Fitted'
##' mixes the gamma approach and the chi-squared approach these by
##' applying 'gamma' to continuous-categorical comparisons and a
##' least squares fitted version of the simple approximation to
##' continuous-continuous comparisons. For all
##' categorical-categorical comparisons the contingency table
##' degrees of freedom are use in a chi-squared distribution. More
##' details can be found in the associated paper.
##' @param dropPoints logical; should returned bins contain points?
##' @return An `inDep` object, with slots `data`, `types`, `pairs`,
##' `binnings`, `residuals`, `statistics`, `K`, `logps`, and
##' `pvalues` that stores the results of using recursive binning with
##' the specified splitting logic to test independence on a data set.
##' `data` gives the name of the data object in the global environment
##' which was split, `types` is a character vector giving the data
##' types of each pair, `pairs` is a character vector of the variable
##' names of each pair, `binnings` is a list of lists where each list
##' is the binning fir to the corresponding pair by the recursive
##' binning algorithm, `residuals` is list of numeric vectors giving
##' the residual for each bin of each pairwise binning, `statistics`
##' is a numeric vector giving the chi-squared statistic for each
##' binning, `K` is a numeric vector giving the number of bins in each
##' binning, `logps` gives the natural logarithm of the statistic's
##' p-value, and finally `pvalues` is a numeric vector of p-values
##' for `statistics` based on the specified p-value computation, which
##' defaults to 'simple'. Internally, the
##' p-values are computed on the log scale to better distinguish
##' between strongly dependent pairs and the `pvalues` returned are
##' computed by calling `exp(logps)`. The order of all returned values
##' is by increasing `logps`.
##' @author Chris Salahub
inDep <- function(data, stopCriteria,
catCon = uniRIntSplit,
conCon = rIntSplit,
ptype = c('simple',
'conservative',
'gamma',
'best'),
dropPoints = FALSE) {
## argument checking
datName <- deparse1(substitute(data))
if (!is.data.frame(data)) stop("`data` must be a data frame")
if (missing(stopCriteria)) {
stopCriteria <- "depth >= 6 | n < 1 | expn <= 10 | stopped"
}
## function definition
stopFn <- function(bns) stopper(bns, stopCriteria)
## pre-processing
vars <- names(data) # get all variable names
types <- sapply(data, class) # get all classes
chars <- types == "character"
ints <- types == "integer"
logs <- types == "logical"
if (any(ints)) { # regularize type names
data[ints] <- lapply(data[ints], as.numeric)
types[ints] <- "numeric"
}
if (any(chars)) {
data[chars] <- lapply(data[chars], as.factor)
types[chars] <- "factor"
}
if (any(logs)) {
data[logs] <- lapply(data[logs],
function(x) as.factor(as.numeric(x)))
types[logs] <- "factor"
}
combs <- combn(ncol(data), 2) # get all pairs
scndRwInds <- types[combs[2, ]] == "factor"
scndRwFs <- combs[2, scndRwInds]
combs[2, scndRwInds] <- combs[1, scndRwInds]
combs[1, scndRwInds] <- scndRwFs # factors always come first
typecomb <- apply(combs, 2,
function(x) paste(types[x], collapse = ":"))
nlev <- sapply(data, function(var) length(levels(var)))
## get ranks
ranks <- data
ranks[, types=="numeric"] <- apply(data[types=="numeric"],
2, rank,
ties.method = "random")
## pairwise functions: random squarified splitting
bns <- vector(mode = "list", length(typecomb))
names(bns) <- apply(combs, 2,
function(x) paste(vars[x],
collapse = ":"))
facFac <- which(typecomb == "factor:factor")
facNum <- which(typecomb == "factor:numeric")
numNum <- which(typecomb == "numeric:numeric")
for (ii in facFac) {
bns[[ii]] <- catBinner(x = ranks[, combs[1, ii]],
y = ranks[, combs[2, ii]],
dropPoints = dropPoints)
}
for (ii in facNum) {
bns[[ii]] <- uniBinner(x = ranks[, combs[1, ii]],
y = ranks[, combs[2, ii]],
stopper = stopFn,
splitter = catCon,
dropPoints = dropPoints)
}
for (ii in numNum) {
bns[[ii]] <- binner(x = ranks[, combs[1, ii]],
y = ranks[, combs[2, ii]],
stopper = stopFn,
splitter = conCon,
dropPoints = dropPoints)
}
## compute statistic values and p-values
binStats <- lapply(bns, binChi)
obStats <- sapply(binStats, function(x) x$stat)
K <- sapply(binStats, function(x) x$nbins)
ptype <- match.arg(ptype)
pvals <- simpleDfs <- numeric(length(typecomb))
## compute the simple approximate dfs
simpleDfs[facFac] <- K[facFac] - nlev[combs[1, facFac]] -
nlev[combs[2, facFac]] + 1
simpleDfs[facNum] <- facNumSimpleDf(K[facNum],
nlev[combs[1, facNum]])
simpleDfs[numNum] <- numNumSimpleDf(K[numNum])
## compute the pvalues
pvals[facFac] <- pchisq(obStats[facFac],
df = K[facFac] - nlev[combs[1, facFac]] -
nlev[combs[2, facFac]] + 1,
lower.tail = FALSE, log.p = TRUE)
if (ptype == 'simple') {
pvals[facNum] <- pchisq(obStats[facNum],
df = facNumSimpleDf(K[facNum],
nlev[combs[1, facNum]]),
lower.tail = FALSE, log.p = TRUE)
pvals[numNum] <- pchisq(obStats[numNum],
df = numNumSimpleDf(K[numNum]),
lower.tail = FALSE, log.p = TRUE)
} else if (ptype == 'conservative') {
pvals[facNum] <- pchisq(obStats[facNum],
df = K[facNum] - nlev[combs[1, facNum]],
lower.tail = FALSE, log.p = TRUE)
pvals[numNum] <- pchisq(obStats[facNum],
df = K[facNum] - 1,
lower.tail = FALSE, log.p = TRUE)
} else if (ptype == 'gamma') {
pvals[facNum] <- pgamma(obStats[facNum],
shape = facNumGammaShape(K[facNum],
nlev[combs[1, facNum]]),
scale = facNumGammaScale(K[facNum],
nlev[combs[1, facNum]]),
lower.tail = FALSE, log.p = TRUE)
pvals[numNum] <- pgamma(obStats[numNum],
shape = numNumGammaShape(K[numNum]),
scale = numNumGammaScale(K[numNum]),
lower.tail = FALSE, log.p = TRUE)
} else if (ptype == 'best') {
pvals[facNum] <- pgamma(obStats[facNum],
shape = facNumGammaShape(K[facNum],
nlev[combs[1, facNum]]),
scale = facNumGammaScale(K[facNum],
nlev[combs[1, facNum]]),
lower.tail = FALSE, log.p = TRUE)
pvals[numNum] <- pchisq(obStats[numNum],
df = numNumFittedDf(K[numNum]),
lower.tail = FALSE, log.p = TRUE)
} else {
stop("ptype must be one of ('simple', 'conservative', 'gamma', or 'best')")
}
pord <- order(pvals)
## return everything
inDep <- list(data = datName,
types = typecomb[pord],
pairs = names(bns)[pord],
binnings = bns[pord],
simpleDfs = simpleDfs[pord],
Ks = K[pord],
residuals = sapply(binStats[pord],
function(x) x$residuals),
statistics = sapply(binStats[pord],
function(x) x$stat),
logps = pvals[pord],
pvalues = exp(pvals)[pord])
class(inDep) <- "inDep"
inDep
}
##' Methods
##' @title S3 methods for `inDep`
##' @description The `summary` and `plot` methods outlined here
##' support the quick description of an `inDep` object.
##' @details For each index in `which`, this function produces a row
##' of three plots. The first plot is the raw data, the second plot
##' is the ranks of the data, and the final plot is the binning
##' contained in the `inDep` object.
##' @param object `inDep` object to summarize
##' @param x object with class `inDep`
##' @param ... additional arguments to pass on to the method
##' @param which indices of binnings to display from `x`, where
##' binnings are ordered by increasing p-value
##' @param border colour of borders to be drawn on the binnings
##' @param buffer relative width of empty space separating categories
##' @param dropPoints logical: should points be dropped for the plot
##' of the binnings?
##' @param colrng colour range to be passed to `residualFill` for
##' plotting
##' @param nbr number of breaks to be passed to `residualFill` for
##' plotting
##' @param pch point type passed to plot
##' @return Nothing for the plot method, while summary quietly returns
##' a summary of `inDep`
##' @author Chris Salahub
##' @describeIn methods Summary method for `inDep`
summary.inDep <- function(object, ...) {
dat <- object$data
nprs <- length(object$pairs)
typtab <- table(object$types)
pvals <- quantile(object$pvalues, probs = seq(0, 1, 0.1))
sig5 <- sum(object$pvalues < 0.05)
sig1 <- sum(object$pvalues < 0.01)
cat(paste0("All ", nprs, " pairs in ", dat,
" recursively binned with type distribution: \n"))
print(typtab)
cat(paste0(sig5, " pairs are significant at 5% and ",
sig1, " pairs are significant at 1%\n"))
invisible(list(data = dat, npair = nprs,
typeTable = typtab,
pDeciles = pvals))
}
##' @describeIn methods Plot method for `inDep`
plot.inDep <- function(x, ..., which = 1:5, border = "black",
buffer = 0.01, dropPoints = FALSE,
colrng = c("steelblue", "white", "firebrick"),
nbr = NA, pch = ".") {
dat <- get(x$data)
prs <- strsplit(x$pairs[which], split = "\\:")
typs <- strsplit(x$types[which], split = "\\:")
oldPar <- par(mfrow = c(length(which), 3),
mar = c(0.5, 1.1, 2.1, 0.1))
#maxRes <- max(sapply(x$residuals, max))
for (ii in seq_along(prs)) {
x1 <- dat[, prs[[ii]][1]] # get pair
y <- dat[, prs[[ii]][2]]
scl <- length(x1)*buffer
if (typs[[ii]][1] == "factor") { # jitter factors
x1 <- as.factor(x1) # ensure type
xtbl <- table(x1)
xbr <- c(0, xtbl)
xa <- cumsum(xbr[-length(xbr)])/2 + cumsum(xbr[-1])/2
## handle small categories thinner than scl
xmns <- pmin(-xtbl[as.numeric(x1)]/2 + scl, 0)
xmxs <- pmax(xtbl[as.numeric(x1)]/2 - scl, 0)
pltx <- xa[as.numeric(x1)] +
runif(length(x1), min = xmns, max = xmxs)
} else {
pltx <- x1
xbr <- NA
}
if (typs[[ii]][2] == "factor") {
y <- as.factor(y)
ytbl <- table(y)
ybr <- c(0, ytbl)
ya <- cumsum(ybr[-length(ybr)])/2 + cumsum(ybr[-1])/2
ymns <- pmin(-ytbl[as.numeric(y)]/2 + scl, 0)
ymxs <- pmax(ytbl[as.numeric(y)]/2 - scl, 0)
plty <- ya[as.numeric(y)] +
runif(length(y), min = ymns, max = ymxs)
} else {
plty <- y
ybr <- NA
}
## create three plot areas
plot(x = pltx, y = plty, xaxt = "n", yaxt = "n", pch = pch,
...)
abline(h = cumsum(ybr), v = cumsum(xbr), lty = 2)
mtext("Raw", side = 3, line = 0, cex = 0.6)
plot(x = rank(pltx, ties.method = "random"),
y = rank(plty, ties.method = "random"),
xaxt = "n", yaxt = "n", pch = pch, ...)
abline(h = cumsum(ybr), v = cumsum(xbr), lty = 2)
mtext("Ranks", side = 3, line = 0, cex = 0.6)
mtext(side = 3, line = 1, cex = 0.8,
text = bquote("Pair:"~.(paste(prs[[ii]],
collapse = "|"))*
","~log[10]*p~"="~.(round(x$logps[which[ii]]/
log(10),
1))))
if (dropPoints) thirdPch = NA else thirdPch = pch
plotBinning(x$binnings[[which[ii]]], factor = 0.9,
xlab = "", ylab = "", border = border,
fill = importanceFill(x$binnings[[which[ii]]],
colrng = colrng, nbr = nbr),
suppressLabs = TRUE, pch = thirdPch, ...)
mtext("Bins", side = 3, line = 0, cex = 0.6)
}
par(oldPar)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.