R/amova.R

## amova.R (2010-07-15)

##   Analysis of Molecular Variance

## Copyright 2010 Emmanuel Paradis

## This file is part of the R-package `pegas'.
## See the file ../COPYING for licensing issues.

amova <- function(formula, data = NULL, nperm = 1000, is.squared = FALSE)
{
    y.nms <- as.character(as.expression(formula[[2]]))
    rhs <- formula[[3]]
    gr.nms <- as.character(as.expression(rhs))
    ## keep the highest level first:
    if (length(rhs) > 1) gr.nms <- unlist(strsplit(gr.nms, "/"))

    y <- get(y.nms)
    if (!is.squared) y <- y^2 # square the distances
    if (class(y) == "dist") y <- as.matrix(y)
    if (!is.matrix(y))
        stop("the lhs of the formula must be either a matrix or an object of class 'dist'.")
    n <- dim(y)[1] # number of individuals

    gr <-
        if (is.null(data)) as.data.frame(sapply(gr.nms, get, envir = .GlobalEnv))
        else data[gr.nms]
    Nlv <- length(gr) # number of levels

### 5 local functions
    ## a simplified version of tapply(X, INDEX, FUN = sum):
    foo <- function(x, index) unlist(lapply(split(x, index), sum))

    getSSD <- function(y, gr, Nlv, N, n) {
        SSD <- numeric(Nlv + 2)
        SSD[Nlv + 2] <- sum(y/(2 * n)) # total SSD
        ## calculate SSD *within* each level:
        for (i in 1:Nlv) {
            p <- gr[, i] # extract the grouping at this level
            SSD[i + 1] <- sum((y/(2 * N[[i]])[p])[outer(p, p, "==")])
        }
        ## now differentiate to get the SSDs:
        if (Nlv > 1)
            for (i in Nlv:2) SSD[i] <- SSD[i] - SSD[i + 1]
        SSD[1] <- SSD[Nlv + 2] - sum(SSD[-(Nlv + 2)])
        SSD
    }

    getDF <- function(gr, Nlv, N, n) {
        df <- numeric(Nlv + 2)
        df[1:Nlv] <- unlist(lapply(N, length)) # number of groups
        ## get the df from the lowest level to the highest one:
        df[Nlv + 1] <- n
        for (i in (Nlv + 1):2) df[i] <-  df[i] - df[i - 1]
        df[1] <- df[1] - 1
        df[Nlv + 2] <- n - 1 # total df
        df
    }

    getNcoefficient <- function(gr, Nlv, N, n) {
        Nig <- N[[Nlv]] # numbers from the lowest level (ie, pop)
        Nig2 <- Nig^2
        npop <- length(Nig)
        if (Nlv == 1) # do classic variance components estimation
            ncoef <- (n - sum(Nig2)/n)/(npop - 1)
        else {
            if (Nlv == 2) { # use eq.9a-c in Excoffier et al. 1992
                ncoef <- numeric(3) # n, n', and n'', respectively
                G <- nlevels(gr[, 1]) # the number of groups
                ## use the fact that match() returns the 1st match of the
                ## 1st argument into the 2nd one, so this returns a factor
                ## indicating to which group each pop belongs to
                g <- gr[, 1][match(1:npop, as.integer(gr[, 2]))]
                npopBYgr <- tabulate(g) # number of pops in each group
                A <- sum(foo(Nig2, g)/foo(Nig, g))
                ncoef[1] <- (n - A)/sum(npopBYgr - 1) # corrects eq.9a in Excoffier et al. 1992
                ncoef[2] <- (A - sum(Nig2)/n)/(G - 1)
                ncoef[3] <- (n - sum(foo(Nig, g)^2/n))/(G - 1)
            } else { # use approximate formulae
                ncoef <- numeric(Nlv + 1)
                ## the coefficients are indexed from the highest to the lowest
                ## level to ease computation of the variance components
                ncoef[Nlv] <- (n - sum(Nig2)/n)/(npop - 1) # same than above
                ncoef[Nlv + 1] <- 1 # for the within pop level
                for (i in 1:(Nlv - 1)) {
                    group <- gr[, i]
                    g <- group[match(1:npop, as.integer(gr[, i + 1]))]
                    A <- sum(foo(Nig, g)^2)/sum(foo(Nig, g))
                    ncoef[i] <- (n - A)/(nlevels(group) - 1)
                }
            }
        }
        names(ncoef) <- letters[1:length(ncoef)] # suggestion by Rodney Dyer
        ncoef
    }

    getVarComp <- function(MSD, Nlv, ncoef) {
        ## get the variance components bottom-up
        if (Nlv == 1)
            sigma2 <- c((MSD[1] - MSD[2])/ncoef, MSD[2]) # 'n' changed to 'ncoef' (fix by Qixin He, 2010-07-15)
        else {
            sigma2 <- numeric(Nlv + 1)
            if (Nlv == 2) {
                sigma2[3] <- MSD[3]
                sigma2[2] <- (MSD[2] - sigma2[3])/ncoef[1]
                sigma2[1] <- (MSD[1] - MSD[3] - ncoef[2]*sigma2[2])/ncoef[3]
            } else {
                sigma2[Nlv + 1] <- MSD[Nlv + 1]
                for (i in Nlv:1) {
                    sel <- i:(Nlv + 1)
                    sigma2[i] <- (MSD[i] - sum(ncoef[sel]*sigma2[sel]))/ncoef[i]
                }
            }
        }
        names(sigma2) <- c(names(gr), "Error")
        sigma2
    }

    ## fit the AMOVA model to the data:
    N <- lapply(gr, tabulate) # number of individuals in each group
    SSD <- getSSD(y, gr, Nlv, N, n)
    df <- getDF(gr, Nlv, N, n)
    MSD <- SSD/df
    ncoef <- getNcoefficient(gr, Nlv, N, n)
    sigma2 <- getVarComp(MSD, Nlv, ncoef)

    ## output the results:
    res <- list(tab = data.frame(SSD = SSD, MSD = MSD, df = df,
                row.names = c(names(gr), "Error", "Total")),
                varcoef = ncoef, varcomp = sigma2, call = match.call())
    class(res) <- "amova"

    if (nperm) {
        rSigma2 <- matrix(0, nperm, length(sigma2))
        ## First shuffle all individuals among all pops to test
        ## the within pop var comp; if there is only one level,
        ## this is used to test both var components.
        j <- if (Nlv == 1) 1:2 else Nlv + 1
        for (i in 1:nperm) {
            rY <- perm.rowscols(y, n)
            ## the hierarchical structure is not changed so just
            ## need to recalculate the SSD and var comp
            rSSD <- getSSD(rY, gr, Nlv, N, n)
            rSigma2[i, j] <- getVarComp(rSSD/df, Nlv, ncoef)[j]
        }
        if (Nlv > 1) {
            ## for the lowest level, we just permute individuals within the
            ## level just above, so the hierarchical structure is unchanged
            j <- Nlv
            ## 'L' contains the indices of the individuals for each level just above
            L <- lapply(levels(gr[, j - 1]), function(x) which(gr[, j - 1] == x))
            for (i in 1:nperm) {
                rind <- unlist(lapply(L, sample))
                rY <- y[rind, rind]
                rSSD <- getSSD(rY, gr, Nlv, N, n)
                rSigma2[i, j] <- getVarComp(rSSD/df, Nlv, ncoef)[j]
            }
            if (Nlv > 2) {
                for (j in (Nlv - 1):2) {
                    above <- gr[, j - 1]
                    L <- lapply(levels(above), function(x) which(above == x))
                    for (i in 1:nperm) {
                        rind <- integer(0)
                        for (k in L) rind <- c(rind, sample(k))
                        rind <- unlist(lapply(L, sample))
                        rY <- y[rind, rind]
                        rGR <- gr[rind, ]
                        rN <- lapply(rGR, tabulate)
                        rSSD <- getSSD(rY, rGR, Nlv, rN, n)
                        rDF <- getDF(rGR, Nlv, rN, n)
                        rNcoef <- getNcoefficient(rGR, Nlv, rN, n)
                        rSigma2[i, j] <- getVarComp(rSSD/rDF, Nlv, rNcoef)[j]
                    }
                }
            }
            ## for the highest level permute the level below
            L <- lapply(levels(gr[, 1]), function(x) which(gr[, 1] == x))
            for (i in 1:nperm) {
                rind <- unlist(sample(L))
                rY <- y[rind, rind]
                rGR <- gr[rind, ]
                rN <- lapply(rGR, tabulate)
                rSSD <- getSSD(rY, rGR, Nlv, rN, n)
                rDF <- getDF(rGR, Nlv, rN, n)
                rNcoef <- getNcoefficient(rGR, Nlv, rN, n)
                rSigma2[i, 1] <- getVarComp(rSSD/rDF, Nlv, rNcoef)[1]
            }
        }
        P <- numeric(Nlv + 1)
        for (j in 1:(Nlv + 1))
            P[j] <- sum(rSigma2[, j] >= sigma2[j])/nperm
        P[Nlv + 1] <- NA
        res$varcomp <- data.frame(sigma2 = res$varcomp, P.value = P)
    }
    res
}

print.amova <- function(x, ...)
{
    cat("\n\tAnalysis of Molecular Variance\n\nCall: ")
    print(x$call)
    cat("\n")
    print(x$tab)
    cat("\nVariance components:\n")
    if (is.data.frame(x$varcomp)) {
        x$varcomp["Error", "P.value"] <- NA
        printCoefmat(x$varcomp, na.print = "")
    } else print(x$varcomp)
    cat("\nVariance coefficients:\n")
    print(x$varcoef)
    cat("\n")
}
dwinter/Pegas documentation built on May 15, 2019, 6:21 p.m.