R/associate.R
In microbiome: Microbiome Analytics

Documented in associate cmat2table

#' @title Cross Correlation Wrapper
#' @description Cross-correlate columns of the input matrices.
#' @param x matrix (samples x features if annotation matrix)
#' @param y matrix (samples x features if cross-correlated with annotations)
#' @param method association method ('pearson', or 'spearman'
#' for continuous; categorical for discrete)
#' @param p.adj.threshold q-value threshold to include features 
#' @param cth correlation threshold to include features 
#' @param order order the results
#' @param n.signif mininum number of significant correlations for each 
#' element
#' @param mode Specify output format ('table' or 'matrix')
#' @param p.adj.method p-value multiple testing correction method. 
#' One of the methods in p.adjust 
#' function ('BH' and others; see help(p.adjust)). Default: 'fdr'
#' @param verbose verbose
#' @param filter.self.correlations Filter out correlations between 
#' identical items.
#' @return List with cor, pval, pval.adjusted
#' @examples 
#' data(peerj32)
#' d1 <- peerj32$microbes[1:20, 1:10]
#' d2 <- peerj32$lipids[1:20,1:10]
#' cc <- associate(d1, d2, method='pearson')
#' @export
#' @details As the method=categorical (discrete) association measure
#' for nominal (no order for levels) variables we use Goodman and
#' Kruskal tau based on
#' r-bloggers.com/measuring-associations-between-non-numeric-variables/
#'
#' The p-values in the output table depend on the method. For the spearman
#' and pearson correlation values, the p-values are provided by the default
#' method in the cor.test function. For the categorical method, the p-value is
#' estimated with the microbiome::gktau function (see the separate help page).
#' 
#' @references See citation('microbiome') 
#' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com}
#' @aliases cross_correlate
#' @keywords utilities
associate <- function(x, y=NULL, method="spearman", p.adj.threshold=Inf, cth=NULL, order=FALSE, n.signif=0, mode="table", p.adj.method="fdr", verbose=FALSE, filter.self.correlations=FALSE) {

    if (is.null(y)) {
        message("Cross-correlating the data with itself")
        y <- x
        
        if (filter.self.correlations) {
            # Ignore self-correlations in filtering
            n.signif <- n.signif + 1
        }
    }
    
    x <- as.data.frame(x)  # numeric or discrete
    y <- y  # numeric
    
    if (is.null(colnames(y))) {
        colnames(y) <- paste("column-", seq_len(ncol(y)), sep="")
    }
    
    xnames <- colnames(x)
    ynames <- colnames(y)
    qv <- NULL
    numeric.methods <- c("spearman", "pearson")
    categorical.methods <- c("categorical")

    # Rows paired.
    if (method %in% numeric.methods) {

        inds <- vapply(x, is.numeric, TRUE)

        if (any(!inds)) {
            warning("Considering only numeric annotations for \n       
                    pearson/spearman")
        }
        inds <- names(which(inds))
    } else if (method %in% categorical.methods) {
        inds <- vapply(x, is.factor, TRUE)
        if (any(!inds)) {
            warning("Considering only categorical annotations for factors")
        }
        inds <- names(which(inds))
    }
    
    xnames <- inds

    if (!is.vector(x)) {
        x <- suppressWarnings(as.matrix(x[, inds], ncol=length(inds)))
    } else {
        x <- as.matrix(x[inds], ncol=length(inds))
    }
    
    colnames(x) <- xnames
    
    Pc <- matrix(NA, ncol(x), ncol(y))
    Cc <- matrix(NA, ncol(x), ncol(y))
    rownames(Cc) <- colnames(x)
    colnames(Cc) <- colnames(y)
    rownames(Pc) <- colnames(x)
    colnames(Pc) <- colnames(y)
    
    if (verbose) {
        message(method)
    }

    if (method %in% c("pearson", "spearman")) {
        
        minobs <- 8
        
        for (j in seq_len(ncol(y))) {
            
            jc <- apply(x, 2, function(xi) {
                
                if (sum(!is.na(xi)) >= minobs) {
                    
                    res <- suppressWarnings(
                        cor.test(xi, unlist(y[, j], use.names=FALSE), 
                    method=method, use="pairwise.complete.obs"))
                    
                    res <- c(res$estimate, res$p.value)
                    
                } else {
                
                    warning(paste("Not enough observations (",
                minobs, "required); \n   
                        (", 
                    sum(!is.na(xi)), ") \n \n 
                        - skipping correlation estimation"))
                    res <- c(NA, NA)
                
                }
                res
                
            })

            Cc[, j] <- jc[1, ]
            Pc[, j] <- jc[2, ]
            
        }
        
    } else if (method == "categorical") {
        
        if (verbose) {
            message(method)
        }
        
        Cc <- matrix(NA, nrow=ncol(x), ncol=ncol(y))
        rownames(Cc) <- colnames(x)
        colnames(Cc) <- colnames(y)
        
        for (varname in colnames(x)) {
            
            for (lev in colnames(y)) {
                
                xvec <- x[, varname]
                yvec <- y[, lev]
                keep <- rowSums(is.na(cbind(xvec, yvec))) == 0
                xvec <- xvec[keep]
                yvec <- yvec[keep]
                
                # Number of data-annotation samples for
                # calculating the correlations
                n <- sum(keep)
                Cc[varname, lev] <- gktau(xvec, yvec) 
                
            }
        }
    }

    if (!all(is.na(Pc))) {

        rownames(Pc) <- xnames
        colnames(Pc) <- ynames
        
        rownames(Cc) <- xnames
        colnames(Cc) <- ynames

        # Corrected p-values
        qv <- array(NA, dim=dim(Pc))
        qv <- matrix(p.adjust(Pc, method=p.adj.method), nrow=nrow(Pc))
        dimnames(qv) <- dimnames(Pc)

    }

    # Filter
    if (!is.null(p.adj.threshold) || !is.null(cth)) {

        # Replace NAs with extreme values for filtering purposes
        qv[is.na(qv)] <- 1
        Pc[is.na(qv)] <- 1
        Cc[is.na(Cc)] <- 0
        
        # Filter by adjusted pvalues and correlations
        inds1.q <- inds2.q <- inds1.c <- inds2.c <- NULL
        
        if (!is.null(p.adj.threshold)) {
            
            inds1.q <- apply(qv, 1, function(x) {
                sum(x < p.adj.threshold) >= n.signif
            })
            
            inds2.q <- apply(qv, 2, function(x) {
                sum(x < p.adj.threshold) >= n.signif
            })
        }
        
        if (!is.null(cth)) {
            inds1.c <- apply(abs(Cc), 1, function(x) {
                sum(x > cth) >= n.signif
            })
            inds2.c <- apply(abs(Cc), 2, function(x) {
                sum(x > cth) >= n.signif
            })
        }
        
        if (!is.null(p.adj.threshold) && !is.null(cth)) {
            
            inds1 <- inds1.q & inds1.c
            inds2 <- inds2.q & inds2.c
            
        } else if (is.null(p.adj.threshold) && !is.null(cth)) {
            inds1 <- inds1.c
            inds2 <- inds2.c
        } else if (!is.null(p.adj.threshold) && is.null(cth)) {
            inds1 <- inds1.q
            inds2 <- inds2.q
        }
        
        Cmat <- as.matrix(0)

    # TODO: add also correlation filter, not only significance
    # Require each has at least n.signif. correlations
        
        if (sum(inds1) >= n.signif && sum(inds2) >= n.signif) {
            
            rnams <- rownames(Cc)[inds1]
            cnams <- colnames(Cc)[inds2]
            
            Cc <- matrix(Cc[inds1, inds2, drop=FALSE], nrow=sum(inds1))
            Pc <- matrix(Pc[inds1, inds2, drop=FALSE], nrow=sum(inds1))
            qv <- matrix(qv[inds1, inds2, drop=FALSE], nrow=sum(inds1))
            
            rownames(qv) <- rownames(Pc) <- rownames(Cc) <- rnams
            colnames(qv) <- colnames(Pc) <- colnames(Cc) <- cnams
            
            if (order && sum(inds1) >= 2 && sum(inds2) >= 2) {
                
                # Order in visually appealing order
                tmp <- Cc
                rownames(tmp) <- NULL
                colnames(tmp) <- NULL
                
                rind <- hclust(as.dist(1 - cor(t(tmp),
            use="pairwise.complete.obs")))$order
                cind <- hclust(as.dist(1 - cor(tmp,
            use="pairwise.complete.obs")))$order
                
                rnams <- rownames(Cc)[rind]
                cnams <- colnames(Cc)[cind]
                
                Cc <- Cc[rind, cind]
                Pc <- Pc[rind, cind]
                qv <- qv[rind, cind]
                
                rownames(qv) <- rownames(Pc) <- rownames(Cc) <- rnams
                colnames(qv) <- colnames(Pc) <- colnames(Cc) <- cnams
                
            }
            
        } else {
            message("No significant correlations with the given criteria\n")
            Cc <- Pc <- qv <- NULL
        }
    }

    res <- list(cor=Cc, pval=Pc, p.adj=qv)
    
    if (nrow(x) == nrow(y) && ncol(x) == ncol(y) && filter.self.correlations) {
        diag(res$cor) <- diag(res$pval) <- diag(res$p.adj) <- NA
    }

    if (mode == "table") {
        res <- cmat2table(res)
    }
    
    res
    
}


#' @title Convert Correlation Matrix into a Table
#' @description Arrange correlation matrices from associate into a table format.
#' @param res Output from associate
#' @param verbose verbose
#' @return Correlation table
#' @examples 
#' data(peerj32)
#' d1 <- peerj32$microbes[1:20, 1:10]
#' d2 <- peerj32$lipids[1:20,1:10]
#' cc <- associate(d1, d2, mode='matrix', method='pearson')
#' cmat <- associate(d1, d2, mode='table', method='spearman')
#' @references See citation('microbiome') 
#' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com}
#' @keywords utilities
cmat2table <- function(res, verbose=FALSE) {
    
    ctab <- ID <- NULL

    if (!is.null(res$cor)) {
        ctab <- as.data.frame(res$cor)
        ctab$ID <- rownames(res$cor)
        ctab <- melt(ctab, "ID")

        colnames(ctab) <- c("X1", "X2", "Correlation")
        ctab$Correlation <- as.numeric(as.character(ctab$Correlation))
    }
    
    correlation <- NULL  # circumwent warning on globabl vars

    if (!is.null(res$p.adj)) {
        
        if (verbose) {
            message("Arranging the table")
        }

        ctab2 <- as.data.frame(res$p.adj)
        ctab2$ID <- rownames(res$p.adj)
        ctab2 <- melt(ctab2, "ID")
        colnames(ctab2) <- c("X1", "X2", "p.adj")
        ctab2$p.adj <- as.numeric(as.character(ctab2$p.adj))

        ctab <- cbind(ctab, ctab2$p.adj)
        colnames(ctab) <- c("X1", "X2", "Correlation", "p.adj")
        ctab <- ctab[order(ctab$p.adj), ]
        colnames(ctab) <- c("X1", "X2", "Correlation", "p.adj")
        
    } else {
        message("No significant adjusted p-values")
        if (!is.null(ctab)) {
            
            ctab2 <- as.data.frame(res$pval)
            ctab2$ID <- rownames(res$pval)
            ctab2 <- melt(ctab2, "ID")
            colnames(ctab2) <- c("X1", "X2", "value")
            ctab2$value <- as.numeric(as.character(ctab2$value))
            
            ctab <- cbind(ctab, ctab2$value)
            ctab <- ctab[order(-abs(ctab$Correlation)), ]
            colnames(ctab) <- c("X1", "X2", "Correlation", "pvalue")
        }
    }

    ctab$X1 <- as.character(ctab$X1)
    ctab$X2 <- as.character(ctab$X2)
    
    # Keep the original order of factor levels
    ctab$X1 <- factor(as.character(ctab$X1), levels=rownames(res$cor))
    ctab$X2 <- factor(as.character(ctab$X2), levels=colnames(res$cor))
    
    # Remove NAs
    ctab <- ctab[!is.na(ctab$Correlation), ]
    
    # Order the table by p-value
    if ("p.adj" %in% colnames(ctab)) {
        ctab <- ctab[order(ctab$p.adj), ]
    } else if ("pvalue" %in% colnames(ctab)) {
        ctab <- ctab[order(ctab$pvalue), ]
    }
    
    ctab
    
}