In alexgenin/fastcooc: Fast frequentist co-occurrence analysis

fastcooc: Fast pairwise computation of species associations

library(devtools)
clean_dll()
load_all()

library(future)
library(future.apply)
plan(multicore)

This small package provides an implementation of the procedure presented by Veech (2013) to build association networks from presence/absence data (also named co-occurrence networks). It is aimed to be much faster than the implementation in the package cooccur for situations with a high number of species, and low number of sites.

The speed of the implementation relies on pre-computing P-values and storing them in a lookup table. A fast compiled function then loops through all pairs of species, computes their co-occurrence and total abundances, then looks up the P-value in the precomputed table.

Depending on your number of sites and number of species, pre-computing this table can take a while, but this needs only to be done once as long as the number of sites in the analysis does not change.

A typical workflow looks like this:

# Load a community matrix from the vegan package
library(vegan)
data(dune)
dune_pa <- dune > 0 # Transform to presence/absence data

# Compute associations. The community matrix should have sites as rows and 
# species as columns. 
Nsites <- nrow(dune_pa)
pval_table <- precompute_pvalues(Nsites, ntries = 9999)
coocs <- fastcooc(dune_pa, pval_array = pval_table)
# fastcooc returns a data.frame with three columns: the two species, and 
# the P-value (P-values close to zero mean a negative link, those close 
# to one mean a positive link)

# Display pairwise species associations
library(ggplot2)
ggplot(coocs) + 
  geom_raster(aes(x = sp1, y = sp2, 
                  fill = ifelse(pval < 0.5, "negative assoc.", 
                                "positive assoc."))) + 
  scale_fill_manual(values = c('blue', 'red'), name = "Associations") + 
  coord_fixed() + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1))

Here is the speedup brought by fastcooc compared to the coccur package:

naive_cooc <- function(sites, trim_threshold = 0.05) { 
  pmat <- matrix(NA_real_, ncol(sites), ncol(sites))
  for (spi in seq.int(nrow(pmat))) { 
    for (spj in seq.int(nrow(pmat))) { 
      if ( sd(sites[ ,spi]) != 0 && 
           sd(sites[ ,spj]) != 0 ) { 
        pmat[spi, spj] <- coocvec(sites[ ,spi], sites[ ,spj], ntries = 999)[4]
      } else { 
        pmat[spi, spj] <- 0.5
      }
    }
  }

  spnames <- seq.int(ncol(pmat))
  cooc_results <- data.frame(expand.grid(sp1 = spnames, 
                                         sp2 = spnames), 
                             pval = as.vector(pmat))
  cooc_results <- subset(cooc_results, 
                          pmin(pval, 1 - pval) < (trim_threshold/2))

  cooc_results
}

make_one_benchmark <- function(nsp, nsites) { 
  nsp <- round(nsp)
  nsites <- round(nsites)
  bigmat <- matrix(rnorm(nsites*nsp), nrow = nsites, ncol = nsp) > 0
  a <- system.time({ 
      result <- fastcooc(bigmat, long_form = TRUE, trim_threshold = 0.05, 
                         ntries = 999) %>% 
      subset(sp1 < sp2)
  })

  b <- system.time({ 
    cooccurdf <- print(cooccur::cooccur(t(bigmat), prob = "hyper")) %>% 
                   (function(df) df[ c('sp1', 'sp2')]) %>% 
                   as.matrix() 
  })

  all_inters_fc <- sort(paste(result[ ,'sp1'], result[ ,'sp2'], sep = '-'))
  all_inters_co <- sort(paste(cooccurdf[ ,'sp1'], cooccurdf[ ,'sp2'], sep = "-"))

  n_in_common <- length(intersect(all_inters_co, all_inters_fc))
  n_diff <- max(length(all_inters_co), length(all_inters_fc)) - n_in_common
  ntot <- nsp*(nsp-1)/2

  data.frame(n_in_common = n_in_common, 
             n_diff = n_diff, 
             ntot = ntot, 
             nsp = nsp, 
             nsites = nsites, 
             fastcooc = a['elapsed'], 
             cooccur  = b['elapsed'])
}

benchmarks <- expand.grid(nsp    = seq(5, 250, l = 10), 
                          nsites = seq(5, 60,  l = 10)) %>% 
                plyr::dlply(~ nsp + nsites)

bench_results <- future_lapply(benchmarks, 
                               function(df) { 
                                 make_one_benchmark(df$nsp, 
                                                    df$nsites)
                                 }) %>% 
                   dplyr::bind_rows()

bench_results_fmt <- bench_results %>% 
  plyr::mutate(fastcooc_rel_cooccur = fastcooc / cooccur) %>% 
  plyr::mutate(fastcooc_rel_cooccur = ifelse(fastcooc_rel_cooccur > 1, 
                                             NA, log10(fastcooc_rel_cooccur))) 

# ggplot(bench_results_fmt, aes(x = nsites, y = nsp, fill = fastcooc_relative) ) + 
#   geom_raster() + 
#   theme_minimal() + 
#   scale_fill_viridis_c(breaks = - c(0, seq.int(5)), 
#                         labels = function(b) paste0(10^(-b), "x"), 
#                         name = "fastcooc speedup", 
#                         na.value = "grey80") + 
#   labs(x = "Number of sites", 
#         y = "Number of species", 
#         title = "Speedup: fastcooc vs. naive pairwise approach (without lookup table)")
# 
ggplot(bench_results_fmt, 
       aes(x = nsites, y = nsp, fill = fastcooc_rel_cooccur) ) + 
  geom_raster() + 
  theme_minimal() + 
  scale_fill_viridis_c(breaks = - c(0, seq.int(5)), 
                        labels = function(b) paste0(10^(-b), "x"), 
                        name = "fastcooc speedup", 
                        na.value = "grey80") + 
  labs(x = "Number of sites", 
        y = "Number of species", 
        title = "Speedup: fastcooc vs. cooccur package")

Installation

Install using the package devtools:

devtools::install_github('alexgenin/fastcooc')

Notes

This package has been tested againts the coocurr package to check that it produces good results. However, results can still be slightly different because it uses a randomization-based procedure to compute P-values instead of an analytical approach. Typically, expect 0.01-1% of all possible links to be different with default parameters.
This package does not handle NAs in data, but does handle species that are completely absent or always present (those for which P-values cannot be computed), by assuming they cannot form significant associations.
This package does not compute any form of association strength (odd ratio, variance ratio, standardized effect size, etc.), and just reports P-values, which is the only information needed to build unweighted association networks.
There is no documentation yet (apart from this page).

alexgenin/fastcooc
Fast frequentist co-occurrence analysis

In alexgenin/fastcooc: Fast frequentist co-occurrence analysis

fastcooc: Fast pairwise computation of species associations

Installation

Notes

Further readings and references

R Package Documentation

Browse R Packages

We want your feedback!

alexgenin/fastcooc Fast frequentist co-occurrence analysis

In alexgenin/fastcooc: Fast frequentist co-occurrence analysis

fastcooc: Fast pairwise computation of species associations

Installation

Notes

Further readings and references

R Package Documentation

Browse R Packages

We want your feedback!

alexgenin/fastcooc
Fast frequentist co-occurrence analysis