library(Matrix)

# Read a tab-delimited data file with columns OTU, Site_Species and Reads,
# in the long format expected by convert.dat()
load.dat <- function(file = "dat.dat") read.delim(file)

# Convert the long-format data frame from load.dat() into a
# Site_Species x OTU matrix of read counts.  Each Site_Species must list
# the same OTUs in the same order, or the rbind() will recycle values.
convert.dat <- function(d) {
    r <- setNames(d$Reads, d$OTU)
    l <- split(r, d$Site_Species)
    sstab <- do.call("rbind", l)
    names(dimnames(sstab)) <- c("Site_Species", "OTU")
    sstab
}
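
# A usage sketch; "dat.dat" and the Site_Species label below are
# hypothetical:
#
#> d <- load.dat("dat.dat")
#> sstab <- convert.dat(d)
#> sstab["siteA_speciesX", ]   # read counts across OTUs for one Site_Species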
# Maximum absolute deviation tolerated in the row and column sums of the
# Gower-centred matrix (they should be exactly zero)
alt.diversityTest.epsilon <- 1.0e-12
# Order of the terms in the log-likelihood ratio; TRUE (pooled term minus
# group term) matches Bartlett's test statistic, see
# alt.diversityTest.CalcTerms().  Still not entirely verified.
alt.diversityTest.ReverseTerms <- TRUE
#' @rdname alphaDiversityTest
#'
#' @export
#'
alt.alphaDiversityTest.divtable <- function(tab, zero.var.adjust = TRUE,
    n.resample = 10000, method = c("bootstrap", "permute"),
    test.quantiles = c(0.001, 0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99, 0.999),
    ...)
{
    method <- match.arg(method)
    ans <- list(method = "Alpha diversity test, contrast among sites in a single data set")
    ans$data.name <- deparse(substitute(tab))
    g.distmat <- alt.diversityTest.distmat(tab)
    #g.vardist <- lapply(g.distmat, function(x) diag(alt.diversityTest.gower(x)))
    g.vardist <- lapply(g.distmat, alt.diversityTest.gowerDiag)
    n.g <- sapply(g.vardist, length)
    N <- sum(n.g)
    G <- length(n.g)
    ans$N.samples <- N
    ans$N.groups <- G
    terms <- alt.diversityTest.CalcTerms(n.g, g.vardist, zero.var.adjust)
    ans$observed.ln.LR <- terms$ln.LR
    # Analytic distribution: upper-tail chi-squared P-value (lower.tail
    # must be passed by name; the third positional argument of pchisq()
    # is ncp)
    ans$df.X2 <- terms$DF
    ans$P.analytic <- pchisq(terms$ln.LR, df = terms$DF, lower.tail = FALSE)
    # Empirical distribution
    nulldist <- alt.diversityTest.NullDist(obs = terms$ln.LR, n.g = n.g,
        g.vardist = g.vardist, zero.var.adjust = zero.var.adjust,
        method = method, n.resample = n.resample)
    ans$n.resample <- n.resample
    ans$resample.method <- method
    ans$quantiles <- quantile(nulldist, test.quantiles)
    ans$P.empirical <- sum(terms$ln.LR <= nulldist) / n.resample
    ans$empdist <- nulldist
    structure(ans, class = c('diversity_test', 'list'))
}
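
# A usage sketch with a made-up 2-site x 3-OTU count table; any site x
# group count matrix (e.g. the output of convert.dat()) works the same way:
#
#> tab <- matrix(c(5, 1, 0,
#>                 2, 2, 2), nrow = 2, byrow = TRUE,
#>               dimnames = list(Site = c("A", "B"), OTU = c("o1", "o2", "o3")))
#> ans <- alt.alphaDiversityTest.divtable(tab, method = "permute",
#>                                        n.resample = 1000)
#> ans$observed.ln.LR; ans$P.analytic; ans$P.empirical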
# Construct a 0-1 distance matrix from the site x group table: each
# entry is 0 if the two individuals belong to the same group and 1 if
# they do not.  The matrix is built as all 1s, then the within-group
# blocks are zeroed one slice per group.
#
alt.diversityTest.distmat <- function(tab, group = dimnames(tab)[[1]],
                                      drop = TRUE)
{
    if (is.null(dim(tab))) {
        dim(tab) <- c(1, length(tab))
        dimnames(tab) <- list(Site = "onedim", Group = names(tab))
    }
    if (dim(tab)[1] > 1 && is.null(group))
        stop("must supply group(s), all groups not supported")
    else if (missing(group) && dim(tab)[1] == 1)
        group <- 1
    G <- dim(tab)[1]
    K <- dim(tab)[2]
    N <- sum(tab)
    N.G <- rowSums(tab)
    D <- list()
    for (g in group) {
        Dmat <- matrix(1, N.G[g], N.G[g])
        n.K <- tab[g, ][tab[g, ] > 0]
        cum.n.K <- cumsum(n.K)
        for (src in seq_along(n.K)) {
            # which rows/cols to set to 0
            slice <- (cum.n.K[src] - n.K[src] + 1):cum.n.K[src]
            Dmat[slice, slice] <- 0
        }
        D[[as.character(g)]] <- Dmat
    }
    if (length(D) == 1 && drop)
        D[[1]]
    else D
}
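
# For example, a single site with group counts c(A = 2, B = 1) gives a
# 3 x 3 matrix with distance 0 within groups (including the diagonal)
# and 1 between groups:
#
#> alt.diversityTest.distmat(c(A = 2, B = 1))
#      [,1] [,2] [,3]
# [1,]    0    0    1
# [2,]    0    0    1
# [3,]    1    1    0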
# Return the diagonal of the matrix of centroid distances for variances,
# based on Gower (1966)
#
# Gower JC. 1966. Some distance properties of latent root and vector
# methods used in multivariate analysis. Biometrika 53:325-338.
#
alt.diversityTest.gowerDiag <- function(dmat)
{
    if (is.null(dim(dmat))) {
        dim(dmat) <- c(1, length(dmat))
        dimnames(dmat) <- list(Site = "onedim", Group = names(dmat))
    }
    if (! all(dmat == t(dmat)))
        stop("dmat not symmetric")
    d <- -0.5 * dmat
    rd <- rowMeans(d)
    # the diagonal of the full Gower matrix is diag(d) + (-rd + -rd) + mean(d),
    # and diag(d) is always 0, so:
    -2 * rd + mean(d)
}
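
# For the 3 x 3 example matrix above, the squared centroid distances are
# 1/9 for each member of the pair and 4/9 for the singleton:
#
#> alt.diversityTest.gowerDiag(alt.diversityTest.distmat(c(A = 2, B = 1)))
# [1] 0.1111111 0.1111111 0.4444444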
#> microbenchmark(alt.gower1(m), alt.gower2(m), alt.gowerDiag(m), times = 100000)
#Unit: microseconds
# expr min lq mean median uq max neval
# alt.gower1(m) 464.887 496.0135 538.8139 510.5865 530.2535 148723.3 1e+05
# alt.gower2(m) 245.627 264.0980 288.4119 271.9580 283.1540 157597.7 1e+05
# alt.gowerDiag(m) 85.680 95.5160 105.2933 98.9100 103.8760 132955.0 1e+05
# Wrappers for the two implementations timed above; alt.gowerDiag below
# is the closed-form third
alt.gower1 <- function(tab)
{
    g.distmat <- alt.diversityTest.distmat(tab)
    lapply(g.distmat, function(x) diag(alt.diversityTest.gower(x)))
}

alt.gower2 <- function(tab)
{
    g.distmat <- alt.diversityTest.distmat(tab)
    lapply(g.distmat, alt.diversityTest.gowerDiag)
}
alt.gowerDiag <- function(tab, group = dimnames(tab)[[1]], drop = TRUE)
{
    if (is.null(dim(tab))) {
        dim(tab) <- c(1, length(tab))
        dimnames(tab) <- list(Site = "onedim", Group = names(tab))
    }
    if (dim(tab)[1] > 1 && is.null(group))
        stop("must supply group(s), all groups not supported")
    else if (missing(group) && dim(tab)[1] == 1)
        group <- 1
    N.G <- rowSums(tab)
    D <- list()
    for (g in group) {
        this.N.G <- N.G[g]                          # total N for site
        this.n.K <- unname(tab[g, ][tab[g, ] > 0])  # nonzero sources for site
        # avoid integer overflow in the products below
        storage.mode(this.N.G) <- storage.mode(this.n.K) <- "double"
        # total 1s set in full distance matrix:
        #     this.1.G <- (this.N.G * this.N.G) - sum(this.n.K * this.n.K)
        # mean distance value of full distance matrix:
        #     this.mean.d <- -0.5 * this.1.G / (this.N.G * this.N.G)
        # which simplifies to:
        this.mean.d <- -0.5 + (sum(this.n.K * this.n.K) / (2 * this.N.G * this.N.G))
        # number of 0s in each row = size of that row's group:
        #     this.0.g <- rep(this.n.K[k], this.n.K[k])
        # number of 1s in each row = this.N.G - number of 0s:
        #     this.1.g <- this.N.G - this.0.g
        #     this.row.mean <- -0.5 * this.1.g / this.N.G
        this.row.means <- -0.5 * (this.N.G - rep(this.n.K, times = this.n.K)) / this.N.G
        D[[as.character(g)]] <- -2 * this.row.means + this.mean.d
    }
    if (length(D) == 1 && drop)
        D[[1]]
    else D
}
# Create the matrix of centroid distances for variances, based on Gower (1966)
#
# Gower JC. 1966. Some distance properties of latent root and vector
# methods used in multivariate analysis. Biometrika 53:325-338.
#
alt.diversityTest.gower <- function(dmat)
{
    if (is.null(dim(dmat))) {
        dim(dmat) <- c(1, length(dmat))
        dimnames(dmat) <- list(Site = "onedim", Group = names(dmat))
    }
    if (! all(dmat == t(dmat)))
        stop("dmat not symmetric")
    d <- -0.5 * dmat
    rd <- rowMeans(d)
    gower.mat <- d + outer(-rd, -rd, "+") + mean(d)
    # row and column sums of the Gower-centred matrix should be zero
    if (! all(abs(rowSums(gower.mat)) <= alt.diversityTest.epsilon))
        stop("abs(rowSums(gower.mat)) > alt.diversityTest.epsilon")
    if (! all(abs(colSums(gower.mat)) <= alt.diversityTest.epsilon))
        stop("abs(colSums(gower.mat)) > alt.diversityTest.epsilon")
    gower.mat
}
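
# The three implementations should agree; a quick check with a made-up
# two-site table m:
#
#> m <- matrix(c(3, 2, 0,
#>               1, 1, 4), nrow = 2, byrow = TRUE,
#>             dimnames = list(Site = c("s1", "s2"), Group = c("a", "b", "c")))
#> all.equal(alt.gower1(m), alt.gower2(m))
#> all.equal(alt.gower2(m), alt.gowerDiag(m))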
# Calculate the terms of the variance test: per-group and pooled variances,
# the log-likelihood ratio and its degrees of freedom.  With
# alt.diversityTest.ReverseTerms = TRUE, ln.LR is the classical Bartlett
# test statistic for homogeneity of variances, with Bartlett's correction
# factor in the denominator.
alt.diversityTest.CalcTerms <- function(n.g, g.vardist, zero.var.adjust = TRUE)
{
    N <- sum(n.g)
    G <- length(n.g)
    V.g <- sapply(g.vardist, sum) / (n.g - 1)
    if (zero.var.adjust)
        V.g <- alt.diversityTest.ZeroVarAdjust(V.g, n.g)
    # pooled variance
    V.p <- sum((n.g - 1) * V.g) / (N - G)
    term.V.g <- sum((n.g - 1) * log(V.g))
    term.V.p <- (N - G) * log(V.p)
    term.denom <- 1 + ((1 / (3 * (G - 1))) *
                       (sum(1 / (n.g - 1)) - (1 / (N - G))))
    ln.LR <- if (alt.diversityTest.ReverseTerms)
        (term.V.p - term.V.g) / term.denom
    else
        (term.V.g - term.V.p) / term.denom
    DF <- G - 1
    list(V.g = V.g, V.p = V.p, ln.LR = ln.LR, DF = DF)
}
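
# Because ReverseTerms = TRUE yields the classical Bartlett statistic,
# CalcTerms() can be checked against stats::bartlett.test() by using
# squared deviations from the group means in place of the centroid
# distances (a sketch with made-up data):
#
#> set.seed(1)
#> x <- rnorm(30); g <- rep(c("a", "b", "c"), each = 10)
#> vardist <- lapply(split(x, g), function(v) (v - mean(v))^2)
#> n.g <- sapply(vardist, length)
#> terms <- alt.diversityTest.CalcTerms(n.g, vardist, zero.var.adjust = FALSE)
#> all.equal(unname(terms$ln.LR), unname(bartlett.test(x, g)$statistic))
# [1] TRUE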
# Construct the null distribution of the test statistic by resampling the
# pooled centroid distances across groups
alt.diversityTest.NullDist <- function(obs, n.g, g.vardist,
    zero.var.adjust = TRUE, method = c("bootstrap", "permute"),
    n.resample = 10000)
{
    method <- match.arg(method)
    N <- sum(n.g)
    G <- length(n.g)
    cum.n.g <- cumsum(n.g)
    all.g.vardist <- unlist(g.vardist, use.names = FALSE)
    nulldist <- numeric(n.resample)
    nulldist[1] <- obs  # the observed value is the first entry
    for (i in seq_len(n.resample)[-1]) {
        p <- switch(method,
            "permute" = sample(all.g.vardist),
            "bootstrap" = sample(all.g.vardist, replace = TRUE))
        for (n in names(g.vardist)) {
            # peel off a slice of the resampled distances for each group
            slice <- (cum.n.g[n] - n.g[n] + 1):cum.n.g[n]
            g.vardist[[n]] <- p[slice]
        }
        terms <- alt.diversityTest.CalcTerms(n.g, g.vardist, zero.var.adjust)
        nulldist[i] <- terms$ln.LR
    }
    sort(nulldist)
}
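
# Continuing the bartlett.test sketch above, the empirical P-value is the
# fraction of the null distribution at or above the observed statistic:
#
#> nd <- alt.diversityTest.NullDist(obs = terms$ln.LR, n.g = n.g,
#>     g.vardist = vardist, zero.var.adjust = FALSE,
#>     method = "permute", n.resample = 1000)
#> sum(terms$ln.LR <= nd) / 1000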
# If there is no diversity within a group (ss.g == 0), assign a
# minimum diversity.  The minimum nonzero distance prior to dividing by
# (n.g - 1) is 1 / (2 * n.g * n.g), so replace 0 diversity with
# half this quantity, divided by (n.g - 1):
#
#     1 / ((4 * n.g * n.g) * (n.g - 1))
#
alt.diversityTest.ZeroVarAdjust <- function(ss.g, n.g)
{
    nn <- names(ss.g[ss.g == 0])  # names of the ss.g == 0 elements
    if (length(nn))               # index by names
        ss.g[nn] <- 1 / (4 * n.g[nn] * n.g[nn] * (n.g[nn] - 1))
    ss.g
}
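
# A quick numeric check: with n.g = c(a = 5, b = 5), a zero sum of squares
# is replaced by 1 / (4 * 5 * 5 * (5 - 1)) = 0.0025:
#
#> alt.diversityTest.ZeroVarAdjust(c(a = 0.5, b = 0), c(a = 5, b = 5))
#      a      b
# 0.5000 0.0025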