ecodist: Dissimilarity-Based Functions for Ecological Analysis

Documented in MRM

MRM <- function(formula = formula(data), data, nperm = 1000, method="linear", mrank = FALSE)
{
# MRM: Multiple Regression on distance Matrices
# Sarah Goslee 2008-07-18
# tests R^2 and regression coefficients using a
# permutation test


# added method argument
# linear is the default, standard option
# logistic adds capability to do logistic regression
# - this will be much slower
# mrank is ignored if method is not "linear"

# Stuff R needs to be able to use a formula
     m <- match.call(expand.dots = FALSE)
     m2 <- match(c("formula", "data"), names(m), nomatch=0)
     m <- m[c(1, m2)]
     m[[1]] <- as.name("model.frame")
     m <- eval(m, parent.frame())
     m <- as.matrix(m)


# End of R stuff. m is now the data for the MRM test as
# columns y, x, n1, n2, n3, ...
# Determine the size of the matrices & do some error checking.
     n <- (1 + sqrt(1 + 8 * nrow(m)))/2
     if(abs(n - round(n)) > 0.0000001)
        stop("Matrix not square.\n")
     n <- round(n)
     if(ncol(m) < 2) stop("Not enough data. \n")


     if(method == "linear") {
        if(mrank) {
            m <- apply(m, 2, rank)
        }

        # convert matrices to column order to ensure compatibility with C
        for(thiscol in seq_len(ncol(m))) {
            tempmat <- full(m[,thiscol])
            m[,thiscol] <- tempmat[col(tempmat) > row(tempmat)]
        }

        # use matrix form to speed up calculations
        X <- m[ ,2:ncol(m), drop=FALSE]
        X <- cbind(rep(1, nrow(X)), X)
        Y <- m[ ,1, drop=FALSE]

        nd <- nrow(X)

        # only need to calculate (X'X)^-1 once
        XX <- crossprod(X)
        XX <- solve(XX)

        # will need to calculate Xy for each permutation
        XY <- crossprod(X, Y)
        YY <- crossprod(Y)

        # regression coefficients
        b <- XX %*% XY
        rownames(b) <- c("Int", colnames(X)[2:ncol(X)])

        bXY <- crossprod(b, XY)
        SSE <- YY - bXY

        SSTO <- YY - sum(Y)^2/nd
        SSR = SSTO - SSE

        # R2 = 1 - SSE/SSTO
        R2 <- 1 - SSE/SSTO
        R2 <- as.vector(R2)

        # F* = MSR / MSE
        # MSR = SSR / (p - 1) 
        # MSE = SSE / (n - p)
        p <- ncol(X) # number of parameters estimated
        F <- (SSR / (p - 1)) / (SSE / (nd - p))

        R2.pval <- NA
        b.pval <- rep(NA, ncol(X))
        F.pval <- NA

        if(nperm > 0) {

            R2.all <- numeric(nperm)

            # for regression coefficients, use pseudo-t of Legendre et al. 1994
            b.all <- numeric(nperm*p)

            # try out an overall F-test for lack of fit
            F.all <- numeric(nperm)

            cresults <- .C("mrmperm", as.double(as.vector(X)), as.double(as.vector(Y)), 
                 as.integer(p), as.integer(nd), as.integer(n), as.integer(nperm), 
                 R2.all = as.double(R2.all), b.all = as.double(b.all), 
                 F.all = as.double(F.all), as.double(numeric(n*n)), 
                 as.integer(numeric(n)), as.double(as.vector(XX)), as.double(numeric(p)),
                 as.double(0), as.double(numeric(p)), PACKAGE = "ecodist")

            R2.all <- cresults$R2.all
            R2.pval <- length(R2.all[R2.all >= R2.all[1]])/nperm

            F.all <- cresults$F.all
            F.pval <- length(F.all[F.all >= F.all[1]])/nperm

            # b.all contains pseudo-t of Legendre et al. 1994
            b.all <- matrix(cresults$b.all, nrow=nperm, ncol=p, byrow=TRUE)
            b.pval <- apply(b.all, 2, function(x)length(x[abs(x) >= abs(x[1])])/nperm)
        }
        results <- list(coef=cbind(b, pval=b.pval), r.squared=c(R2=R2, pval = R2.pval),F.test=c(F=F, F.pval = F.pval))
    } else {
        if(method == "logistic") {
             # extract data from formula object
             X <- m[ ,2:ncol(m), drop=FALSE]
             Y <- m[ ,1, drop=FALSE]
             
             colnames(Y) <- "Y"
             newdata <- data.frame(Y=Y, X)
             fit1 <- glm(Y ~ ., data=newdata, family=binomial(link = "logit"))

             # want to save coefficients, deviance & df
             b <- coefficients(fit1)
             dev <- summary(fit1)$deviance
             dev.df <- summary(fit1)$df.residual

             b.pval <- NA
             dev.pval <- NA

             if(nperm > 0) {
                b.all <- matrix(NA, nrow=nperm, ncol=length(b))
                b.all[1,] <- b
                dev.all <- rep(NA, nperm)
                dev.all[1] <- dev

                for(i in 2:nperm) {
                    newSample <- sample(n)
                    newY <- full(Y)
                    newY <- newY[newSample, newSample]
                    newY <- lower(newY)
                    newdata <- data.frame(Y=newY, X=X)
                    newfit <- glm(Y ~ ., data=newdata, family=binomial(link = "logit"))
                    b.all[i,] <- coefficients(newfit)
                    dev.all[i] <- summary(newfit)$deviance
                }
                b.pval <- apply(b.all, 2, function(x) length(x[abs(x) >= abs(x[1])])/nperm)
                dev.pval <- length(dev.all[dev.all >= dev.all[1]])/nperm
            }
            results <- list(coef = cbind(b, pval = b.pval), dev = c(resid.dev = dev, resid.df = dev.df, dev.pval = dev.pval))
        } else {
            stop("method must be 'linear' or 'logistic'\n")
        }
    }
    results
}