fastcooc: Fast pairwise computation of species associations

library(devtools)
clean_dll()
load_all()

library(future)
library(future.apply)
plan(multicore)

This small package provides an implementation of the procedure presented by Veech (2013) to build association networks from presence/absence data (also named co-occurrence networks). It is aimed to be much faster than the implementation in the package cooccur for situations with a high number of species, and low number of sites.

The speed of the implementation relies on pre-computing P-values and storing them in a lookup table. A fast compiled function then loops through all pairs of species, computes their co-occurrence and total abundances, then looks up the P-value in the precomputed table.

Depending on your number of sites and number of species, pre-computing this table can take a while, but this needs only to be done once as long as the number of sites in the analysis does not change.

A typical workflow looks like this:

# Load a community matrix from the vegan package
library(vegan)
data(dune)
dune_pa <- dune > 0 # Transform to presence/absence data

# Compute associations. The community matrix should have sites as rows and 
# species as columns. 
Nsites <- nrow(dune_pa)
pval_table <- precompute_pvalues(Nsites, ntries = 9999)
coocs <- fastcooc(dune_pa, pval_array = pval_table)
# fastcooc returns a data.frame with three columns: the two species, and 
# the P-value (P-values close to zero mean a negative link, those close 
# to one mean a positive link)

# Display pairwise species associations
library(ggplot2)
ggplot(coocs) + 
  geom_raster(aes(x = sp1, y = sp2, 
                  fill = ifelse(pval < 0.5, "negative assoc.", 
                                "positive assoc."))) + 
  scale_fill_manual(values = c('blue', 'red'), name = "Associations") + 
  coord_fixed() + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1))

Here is the speedup brought by fastcooc compared to the coccur package:

naive_cooc <- function(sites, trim_threshold = 0.05) { 
  pmat <- matrix(NA_real_, ncol(sites), ncol(sites))
  for (spi in seq.int(nrow(pmat))) { 
    for (spj in seq.int(nrow(pmat))) { 
      if ( sd(sites[ ,spi]) != 0 && 
           sd(sites[ ,spj]) != 0 ) { 
        pmat[spi, spj] <- coocvec(sites[ ,spi], sites[ ,spj], ntries = 999)[4]
      } else { 
        pmat[spi, spj] <- 0.5
      }
    }
  }

  spnames <- seq.int(ncol(pmat))
  cooc_results <- data.frame(expand.grid(sp1 = spnames, 
                                         sp2 = spnames), 
                             pval = as.vector(pmat))
  cooc_results <- subset(cooc_results, 
                          pmin(pval, 1 - pval) < (trim_threshold/2))

  cooc_results
}
make_one_benchmark <- function(nsp, nsites) { 
  nsp <- round(nsp)
  nsites <- round(nsites)
  bigmat <- matrix(rnorm(nsites*nsp), nrow = nsites, ncol = nsp) > 0
  a <- system.time({ 
      result <- fastcooc(bigmat, long_form = TRUE, trim_threshold = 0.05, 
                         ntries = 999) %>% 
      subset(sp1 < sp2)
  })

  b <- system.time({ 
    cooccurdf <- print(cooccur::cooccur(t(bigmat), prob = "hyper")) %>% 
                   (function(df) df[ c('sp1', 'sp2')]) %>% 
                   as.matrix() 
  })

  all_inters_fc <- sort(paste(result[ ,'sp1'], result[ ,'sp2'], sep = '-'))
  all_inters_co <- sort(paste(cooccurdf[ ,'sp1'], cooccurdf[ ,'sp2'], sep = "-"))

  n_in_common <- length(intersect(all_inters_co, all_inters_fc))
  n_diff <- max(length(all_inters_co), length(all_inters_fc)) - n_in_common
  ntot <- nsp*(nsp-1)/2

  data.frame(n_in_common = n_in_common, 
             n_diff = n_diff, 
             ntot = ntot, 
             nsp = nsp, 
             nsites = nsites, 
             fastcooc = a['elapsed'], 
             cooccur  = b['elapsed'])
}

benchmarks <- expand.grid(nsp    = seq(5, 250, l = 10), 
                          nsites = seq(5, 60,  l = 10)) %>% 
                plyr::dlply(~ nsp + nsites)

bench_results <- future_lapply(benchmarks, 
                               function(df) { 
                                 make_one_benchmark(df$nsp, 
                                                    df$nsites)
                                 }) %>% 
                   dplyr::bind_rows()

bench_results_fmt <- bench_results %>% 
  plyr::mutate(fastcooc_rel_cooccur = fastcooc / cooccur) %>% 
  plyr::mutate(fastcooc_rel_cooccur = ifelse(fastcooc_rel_cooccur > 1, 
                                             NA, log10(fastcooc_rel_cooccur))) 

# ggplot(bench_results_fmt, aes(x = nsites, y = nsp, fill = fastcooc_relative) ) + 
#   geom_raster() + 
#   theme_minimal() + 
#   scale_fill_viridis_c(breaks = - c(0, seq.int(5)), 
#                         labels = function(b) paste0(10^(-b), "x"), 
#                         name = "fastcooc speedup", 
#                         na.value = "grey80") + 
#   labs(x = "Number of sites", 
#         y = "Number of species", 
#         title = "Speedup: fastcooc vs. naive pairwise approach (without lookup table)")
# 
ggplot(bench_results_fmt, 
       aes(x = nsites, y = nsp, fill = fastcooc_rel_cooccur) ) + 
  geom_raster() + 
  theme_minimal() + 
  scale_fill_viridis_c(breaks = - c(0, seq.int(5)), 
                        labels = function(b) paste0(10^(-b), "x"), 
                        name = "fastcooc speedup", 
                        na.value = "grey80") + 
  labs(x = "Number of sites", 
        y = "Number of species", 
        title = "Speedup: fastcooc vs. cooccur package")

Installation

Install using the package devtools:

devtools::install_github('alexgenin/fastcooc')

Notes

Further readings and references

This package implements a procedure equivalent to what is presented in Veech (2013), but this is in fact a much older and classic approach (see Arita, 2016). A recommended read for the ecologist interested in frequentist co-ocurrence approaches is Sanderson and Pimm (2015).

Arita, H. T. 2016. Species co-occurrence analysis: pairwise versus matrix-level approaches. Global Ecology and Biogeography.

Veech, J. A. 2013. A probabilistic model for analysing species co-occurrence: Probabilistic model. Global Ecology and Biogeography 22:252–260.

Sanderson, J. G., and S. L. Pimm. 2015. Patterns in nature: the analysis of species co-occurrences. The University of Chicago Press, Chicago London.



alexgenin/fastcooc documentation built on June 4, 2019, 7:48 a.m.