# Global knitr defaults for every chunk in this document: do not echo chunk
# source (echo = FALSE), print output without a comment prefix (comment = ""),
# hide warnings, and cache chunk results with automatic dependency detection
# between cached chunks (cache = TRUE, autodep = TRUE).
knitr::opts_chunk$set(echo = FALSE, comment = "", warning = FALSE, cache = TRUE, autodep = TRUE)
# options(crayon.enabled = TRUE)
# Wrap printed output at 80 characters.
options(width = 80)
library(taxa)
library(metacoder)
library(cowplot)

The challenges of taxonomic data

\begin{columns}

\begin{column}{0.3\textwidth} \includegraphics[height=8cm]{images/taxonomy.png} \end{column}

\begin{column}{0.8\textwidth} %%<--- here
\begin{itemize}
\setlength\itemsep{1em}
\item Taxonomic data is hierarchical
\item Associated with tabular data
\item Can be names, classifications, or IDs
\item Many different taxonomic systems
\item Many different data formats
\item Hierarchical visualization is difficult
\end{itemize}
\end{column}

\end{columns}

Sources of taxonomic data

DNA sequence databases

\begin{center}
\resizebox{.99\textwidth}{!}{%
\includegraphics[height=1cm]{images/ncbi_icon.png}%
\quad
\includegraphics[height=1cm]{images/unite_icon.png}%
\quad
\includegraphics[height=1cm]{images/greengenes_icon.png}%
}
\end{center}

Species occurrence databases

\begin{center}
\resizebox{.99\textwidth}{!}{%
\includegraphics[height=1cm]{images/gbif_icon.png}%
\quad
\includegraphics[height=1cm]{images/inaturalist_icon.png}%
\quad
\includegraphics[height=1cm]{images/atlas_icon.jpg}%
}
\end{center}

Museum records

\begin{center}
\resizebox{.99\textwidth}{!}{%
\includegraphics[height=1cm]{images/smith_icon.jpg}%
\quad
\includegraphics[height=1cm]{images/bmnh_icon.jpg}%
\quad
\includegraphics[height=1cm]{images/nhm_icon.jpg}%
}
\end{center}

Sources of taxonomic data: DNA sequences {.fragile}

\scriptsize

\textbf{NCBI GenBank}

\begin{verbatim}

AC073210.8 Homo sapiens BAC clone RP11-460N20 from 7, complete sequence AACGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGAGACCTTCGGGTC... \end{verbatim}

\textbf{UNITE}

\begin{verbatim}

SH099456.05FU_FJ357315_refs k__Fungi;p__Ascomycota;c__Dothideomycetes ;o__Pleosporales;f__Pleosporaceae;g__Embellisia;s__Embellisia_planifunda
GCTGGCGGCGTGCCTAACACATGTAAGTCGAACGGGACTGGGGGCAACTCCAGT... \end{verbatim}

\textbf{RDP}

\begin{verbatim}

S000448483 Sparassis crispa; MBUH-PIRJO&ILKKA94-1587/ss5 Lineage=Root;rootrank;Fungi;domain;Basidiomycota;phylum;Agaricomycetes; class;Polyporales;order;Sparassidaceae;family;Sparassis;genus AGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGAATGCTTAACACATGAAAC... \end{verbatim}

\textbf{SILVA}

\begin{verbatim}

GCVF01000431.1.2369 Bacteria;Proteobacteria;Gammaproteobacteria; Oceanospirillales;Alcanivoraceae;Alcanivorax;Thalassiosira rotula AGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGTATGCTTAACACATGCAAG... \end{verbatim}

Sources of taxonomic data: DNA sequences {.fragile}

\scriptsize

\textbf{NCBI GenBank}

\begin{Verbatim}[commandchars=\\\{\}]

\underline{AC073210.8} \underline{Homo sapiens} BAC clone RP11-460N20 from 7, complete sequence AACGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGAGACCTTCGGGTC... \end{Verbatim}

\textbf{UNITE}

\begin{Verbatim}[commandchars=\\\{\}]

SH099456.05FU_FJ357315_refs \underline{k__Fungi;p__Ascomycota;c__Dothideomycetes} \underline{;o__Pleosporales;f__Pleosporaceae;g__Embellisia;s__Embellisia_planifunda}
GCTGGCGGCGTGCCTAACACATGTAAGTCGAACGGGACTGGGGGCAACTCCAGT... \end{Verbatim}

\textbf{RDP}

\begin{Verbatim}[commandchars=\\\{\}]

S000448483 \underline{Sparassis crispa}; MBUH-PIRJO&ILKKA94-1587/ss5 Lineage=\underline{Root;rootrank;Fungi;domain;Basidiomycota;phylum;Agaricomycetes;} \underline{class;Polyporales;order;Sparassidaceae;family;Sparassis;genus} AGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGAATGCTTAACACATGAAAC... \end{Verbatim}

\textbf{SILVA}

\begin{Verbatim}[commandchars=\\\{\}]

GCVF01000431.1.2369 \underline{Bacteria;Proteobacteria;Gammaproteobacteria;} \underline{Oceanospirillales;Alcanivoraceae;Alcanivorax;Thalassiosira rotula} AGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGTATGCTTAACACATGCAAG... \end{Verbatim}

Sources of taxonomic data: Occurrence records

Global Biodiversity Information Facility: Archaea database

\scriptsize

readr::read_tsv("datasets/gbif_archea.csv")[4:8]

Sources of taxonomic data: Museum records

Smithsonian Museum of Natural History: Mammal database

\scriptsize

readr::read_csv("datasets/SNMNH.csv")[9]

The taxa package

\vfill

The taxa package is designed to be a solid foundation for using taxonomic data in R.

\vfill

\begin{itemize} \setlength\itemsep{1em} \item R6 classes to hold taxa, taxonomies, and associated data \item Flexible parsers to convert raw data to these classes \item Dplyr-inspired functions to manipulate these classes \item Functions to get data associated with each taxon in a taxonomy \end{itemize}

The metacoder package: visualization of taxon data

set.seed(6)

\scriptsize

# Read the GBIF Archaea table, treat columns 4 to 8 as the classification of
# each row, drop taxa whose name is an empty string, and draw the result as a
# heat tree in which both node color and node size encode `n_obs`.
readr::read_tsv("datasets/gbif_archea.csv") %>%
  parse_tax_data(class_cols = 4:8) %>%
  filter_taxa(taxon_names != "") %>%
  heat_tree(node_label = taxon_names, node_color = n_obs,
            node_size = n_obs, layout = "da")

Classes

\begin{center} \Huge Classes defined by taxa \end{center}

Classes defined by taxa: Relationships

\begin{center} \resizebox{.8\textwidth}{!}{% \includegraphics[height=1cm]{images/class_diagram.png}% } \end{center}

Classes defined by taxa: Relationships

\begin{center} \resizebox{.8\textwidth}{!}{% \includegraphics[height=1cm]{images/class_diagram_selected.png}% } \end{center}

Classes defined by taxa: The taxmap class

\begin{center} \hspace*{-0.6cm} \resizebox{1.1\textwidth}{!}{% \includegraphics{images/taxmap_printed.png}% } \end{center}

Parsing

\begin{center} \Huge Parsing \end{center}

Parsing

\begin{center} \vspace{-1cm}\hspace{-1cm} \resizebox{1.18\textwidth}{!}{% \includegraphics{images/parsing_guide.png}% } \end{center}

Parsing: vectors of classifications

\scriptsize

x <- c("Mammalia;Theria;Metatheria;Diprotodontia;Macropodiformes",
       "Mammalia;Theria;Eutheria;Primates;Haplorrhini;Simiiformes") 

parse_tax_data(x, class_sep = ";")

Parsing: vectors of names

\scriptsize

x <- c("Homo sapiens", "Macropus", "Chordata")

lookup_tax_data(x, type = "taxon_name", database = "ncbi")

Parsing: vectors of taxon or sequence IDs

\footnotesize

Taxon IDs

x <- c("9606", "207598", "7711") # NCBI taxon IDs
lookup_tax_data(x, type = "taxon_id", database = "ncbi")

Sequence IDs

x <- c("AC073210", "MG014608", "AE006468") # NCBI sequence IDs
lookup_tax_data(x, type = "seq_id", database = "ncbi")

Parsing: tables

Global Biodiversity Information Facility: Archaea database

\scriptsize

readr::read_tsv("datasets/gbif_archea.csv")[4:8]

Parsing: tables

\scriptsize

# Load the GBIF Archaea table (the file is read with the tab-separated reader).
x <- readr::read_tsv("datasets/gbif_archea.csv")

# Columns 4 to 8 are used as the classification of each row.
parse_tax_data(x, class_cols = 4:8)

Parsing: complex strings (NCBI GenBank)

\scriptsize

# Example sequence headers: an accession followed by a free-text description.
x <- c(
  "AC073210.8 Homo sapiens BAC clone RP11-460N20 from 7, complete sequence",
  "AE006468.2 Salmonella enterica subsp. enterica serovar Typhimurium",
  "MG014608.1 Macropus fuliginosus Csf1r gene, enhancer"
)

# The regex has two capture groups, described in order by `key`: the first
# (the accession) is a sequence ID, the second (the description) is kept as
# extra information.
extract_tax_data(x, database = "ncbi", regex = "([A-Z0-9.]+) (.+)",
                 key = c(my_ncbi_id = "seq_id", my_desc = "info"))

Taxon attributes

\begin{center} \Huge Taxon attributes \end{center}

Taxon attributes: Taxonomy terminology

# Rename the leading "r__Root" entry of every lineage to "r__Bacteria" and then
# prepend a new "r__Life" entry above it.
hmp_otus$lineage <- sub(pattern = "^r__Root;",
                        replacement = "r__Bacteria;",
                        x = hmp_otus$lineage)
hmp_otus$lineage <- paste0("r__Life;", hmp_otus$lineage)

# Lineage entries are separated by ";" and each has the form "<rank>__<name>";
# the regex splits an entry into the rank (kept as "info") and the taxon name.
x <- parse_tax_data(hmp_otus, class_cols = "lineage", class_sep = ";",
                    class_key = c(tax_rank = "info", tax_name = "taxon_name"),
                    class_regex = "^(.+)__(.+)$")

# Restrict the taxonomy to Proteobacteria plus its subtaxa and supertaxa.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
x <- filter_taxa(x, taxon_names == "Proteobacteria",
                 subtaxa = TRUE, supertaxa = TRUE, reassign_obs = FALSE)

# Draw the example taxonomy as a heat tree with a fixed set of display options.
#
# The taxonomy is NOT an argument: `x` is looked up outside the function when
# it is called, so `x` must already exist at that point.
#
# Arguments:
#   ...  Extra arguments passed on to heat_tree() unchanged (the calls in this
#        file supply `title` and `node_color`).
#
# Returns whatever heat_tree() returns.
plot_one <- function(...) {
  # Side effect: resets the global RNG state on every call -- presumably so all
  # panels share the same node layout (TODO confirm).
  set.seed(2)
  x %>%
    heat_tree(
      # Label only the root taxa; every other node gets an empty label.
      node_label = ifelse(is_root, "Root", ""),
      node_size = n_obs,
      # Identical lower and upper bound, i.e. one fixed label size.
      node_label_size_range = c(0.05, 0.05),
      edge_color = "grey",
      layout = "da",
      initial_layout = "re",
      make_node_legend = FALSE,
      title_size = 0.09,
      ...)

}

# Colors used in the example trees: highlighted taxa, all other taxa, and the
# taxon the query is made for.
selected <- "red"
unselected <- "grey"
target <- "black"

# Name of the taxon whose subtaxa/supertaxa are highlighted below. The literal
# is repeated inside each `subset =` expression on purpose, because those
# expressions are written in terms of `taxon_names`.

# Subtaxa of Betaproteobacteria, using the default `recursive` setting.
my_color <- ifelse(
  x$taxon_indexes() %in% subtaxa(x,
                                 subset = taxon_names == "Betaproteobacteria",
                                 simplify = TRUE),
  selected, unselected
)
my_color[x$taxon_names() == "Betaproteobacteria"] <- target
subtaxa_recursive <- plot_one(title = "Subtaxa (recursive = T)",
                              node_color = my_color)

# Subtaxa of Betaproteobacteria with recursive = FALSE.
my_color <- ifelse(
  x$taxon_indexes() %in% subtaxa(x,
                                 subset = taxon_names == "Betaproteobacteria",
                                 simplify = TRUE, recursive = FALSE),
  selected, unselected
)
my_color[x$taxon_names() == "Betaproteobacteria"] <- target
subtaxa_immediate <- plot_one(title = "Subtaxa (recursive = F)",
                              node_color = my_color)

# Supertaxa of Betaproteobacteria, using the default `recursive` setting.
my_color <- ifelse(
  x$taxon_indexes() %in% supertaxa(x,
                                   subset = taxon_names == "Betaproteobacteria",
                                   simplify = TRUE),
  selected, unselected
)
my_color[x$taxon_names() == "Betaproteobacteria"] <- target
supertaxa_recursive <- plot_one(title = "Supertaxa (recursive = T)",
                                node_color = my_color)

# Supertaxa of Betaproteobacteria with recursive = FALSE.
my_color <- ifelse(
  x$taxon_indexes() %in% supertaxa(x,
                                   subset = taxon_names == "Betaproteobacteria",
                                   simplify = TRUE, recursive = FALSE),
  selected, unselected
)
my_color[x$taxon_names() == "Betaproteobacteria"] <- target
supertaxa_immediate <- plot_one(title = "Supertaxa (recursive = F)",
                                node_color = my_color)

# One tree per structural category, highlighting the taxa in that category.
leaf_plot <- plot_one(title = "Leaves",
                      node_color = ifelse(is_leaf, selected, unselected))

root_plot <- plot_one(title = "Roots",
                      node_color = ifelse(is_root, selected, unselected))

stem_plot <- plot_one(title = "Stems",
                      node_color = ifelse(is_stem, selected, unselected))

internode_plot <- plot_one(title = "Internodes",
                           node_color = ifelse(is_internode, selected, unselected))

# NOTE(review): branch_plot is built here, but neither plot grid visible in
# this file includes it -- confirm whether it is still needed.
branch_plot <- plot_one(title = "Branches",
                        node_color = ifelse(is_branch, selected, unselected))

\centering

# Reference tree for the terminology slides: every taxon is labelled with its
# name and drawn in grey. The seed and the layout arguments are the same ones
# plot_one() uses.
set.seed(2)
x %>%
    heat_tree(
      node_label = taxon_names,
      node_size = n_obs,
      edge_color = "grey",
      node_color = "grey",
      layout = "da",
      initial_layout = "re",
      make_node_legend = FALSE)

Taxon attributes: subtaxa and supertaxa

\centering

# Combine the four subtaxa/supertaxa example trees into one figure.
cowplot::plot_grid(
  plotlist = list(
    subtaxa_recursive,
    subtaxa_immediate,
    supertaxa_recursive,
    supertaxa_immediate
  )
)

Taxon attributes: parts of a tree

\centering

# Combine the leaf, root, stem and internode example trees into one figure.
cowplot::plot_grid(
  plotlist = list(
    leaf_plot,
    root_plot,
    stem_plot,
    internode_plot
  )
)

Taxon attributes: functions

Ranks, names, and IDs

taxon_names, taxon_ranks, taxon_ids

Parts of the tree

branches, internodes, leaves, roots, stems, supertaxa, subtaxa

Filtering helpers

is_branch, is_internode, is_leaf, is_root, is_stem

Numbers of supertaxa/subtaxa/data

n_supertaxa, n_subtaxa, n_obs, n_supertaxa_1, n_subtaxa_1, n_obs_1

Taxon attributes: Ranks, names, and IDs

These are derived from the list of taxon objects.

\scriptsize

# Show the first few names, ranks and IDs of the taxa in the example object.
head(taxon_names(ex_taxmap))
head(taxon_ranks(ex_taxmap))
head(taxon_ids(ex_taxmap))

Taxon attributes: subtaxa

These return a list of vectors named by taxon IDs.

\scriptsize

subtaxa(ex_taxmap, value = "taxon_names")[1:3]

Taxon attributes: subtaxa

These return a list of vectors named by taxon IDs.

\scriptsize

subtaxa(ex_taxmap, value = "taxon_names", recursive = FALSE)[1:3]

Taxon attributes: counts

These return counts of things per taxon.

\scriptsize

n_subtaxa(ex_taxmap)
n_supertaxa(ex_taxmap)
n_obs(ex_taxmap, "info")
n_obs(ex_taxmap, "abund")

Manipulating

\begin{center} \Huge Manipulating \end{center}

Manipulating: example data

Here is the example object that will be used:

# Work on a deep copy so that `ex_taxmap` itself is left untouched, then
# remove the "abund" dataset from the copy.
# NOTE(review): `ex_taxmap` is not defined in this file -- presumably the
# example object shipped with the taxa package; confirm.
obj <- ex_taxmap$clone(deep = TRUE)
obj$data$abund <- NULL

\scriptsize

print(obj)

Manipulating: example data

Here is the example object that will be used:

set.seed(1)

\scriptsize

heat_tree(obj, node_label = taxon_names, layout = "da")

Manipulating: Subsetting the taxonomy

Subset taxonomy and user-defined data to one taxon:

\scriptsize

filter_taxa(obj, taxon_names == "Plantae", subtaxa = TRUE)

Manipulating: Subsetting the taxonomy

Subset taxonomy and user-defined data to one taxon:

set.seed(1)

\scriptsize

filter_taxa(obj, taxon_names == "Plantae", subtaxa = TRUE) %>%
  heat_tree(node_label = taxon_names, layout = "da")

Manipulating: Subsetting the taxonomy

Subset taxonomy to one rank:

\scriptsize

filter_taxa(obj, taxon_ranks == "family", supertaxa = TRUE)

Manipulating: Subsetting the taxonomy

Subset taxonomy to one rank:

set.seed(8)

\scriptsize

filter_taxa(obj, taxon_ranks == "family", supertaxa = TRUE) %>%
  heat_tree(node_label = taxon_names, layout = "da")

Manipulating: Subsetting user data

Subset user-defined data and remove any taxa not in subset:

\scriptsize

filter_obs(obj, "info", n_legs == 4, drop_taxa = TRUE)

Manipulating: Subsetting user data

Subset data and remove any taxa not in subset:

set.seed(7)

\scriptsize

filter_obs(obj, "info", n_legs == 4, drop_taxa = TRUE) %>%
  heat_tree(node_label = taxon_names, layout = "da")

Manipulating: Adding user data

Add a column to a dataset:

\scriptsize

mutate_obs(obj, "info", bipedal = n_legs == 2)

Acknowledgements

\begin{center} \resizebox{.99\textwidth}{!}{% \includegraphics[height=0.5cm]{images/ropensci_icon.png}% \quad \includegraphics[height=0.5cm]{images/r_icon.jpg}% \quad \includegraphics[height=0.5cm]{images/user_icon.png}% } \end{center}

\begin{center} \resizebox{.99\textwidth}{!}{% \includegraphics[height=0.5cm]{images/osu_icon.png}% \quad \includegraphics[height=0.5cm]{images/ars_icon.png}% } \end{center}

\begin{center}
\includegraphics[height=7cm]{images/grunwaldlab.jpg}%
\end{center}

Manipulating: values accessible to NSE

The following can be used in manipulation functions as if they were independent variables using Non-Standard Evaluation (NSE):

\begin{itemize} \setlength\itemsep{1em} \item Functions that return per-taxon information \item User-defined table columns \item User-defined vectors and lists \item User-defined functions \end{itemize}

\scriptsize

unname(all_names(obj))

Metacoder: visualization of taxonomic data

\begin{center} \vspace{-0.3cm}\hspace{-1cm} \resizebox{0.9\textwidth}{!}{% \includegraphics{images/heat_tree_matrix.png}% } \end{center}

Metacoder: visualization of taxonomic data

\begin{center} \vspace{-0.7cm}\hspace{-0.6cm} \resizebox{1.15\textwidth}{!}{% \includegraphics{images/metacoder_multiroot.png}% } \end{center}

Thanks for listening!

\begin{center} \Huge Questions? \end{center}



ropenscilabs/taxa documentation built on Feb. 23, 2024, 6:31 p.m.