library("taxizedb")
library("dplyr")

Download DBs

ITIS

db_download_itis()

The Plant List (TPL)

db_download_tpl()

Catalogue of Life (COL)

db_download_col()

connect to the DBs

By default src_* functions use a path to the cached database file. You can alternatively pass in your own path if you've put it somewhere else.

ITIS

src_itis <- src_itis()

TPL

src_tpl <- src_tpl()

COL

src_col <- src_col()

query with SQL syntax

sql_collect(src_itis, "select * from hierarchy limit 5")
#> # A tibble: 5 x 5
#>                     hierarchy_string    tsn parent_tsn level childrencount
#> *                              <chr>  <int>      <int> <int>         <int>
#> 1                             202422 202422          0     0        154282
#> 2                      202422-846491 846491     202422     1          2666
#> 3               202422-846491-660046 660046     846491     2          2654
#> 4        202422-846491-660046-846497 846497     660046     3             7
#> 5 202422-846491-660046-846497-846508 846508     846497     4             6
# or pipe the src to sql_collect
src_itis %>% sql_collect("select * from hierarchy limit 5")

use dplyr verbs

get a tbl

hiers <- src_itis %>% tbl("hierarchy")
#> # Source:   table<hierarchy> [?? x 5]
#> # Database: postgres 9.6.0 [sacmac@localhost:5432/ITIS]
#>                                              hierarchy_string    tsn parent_tsn level childrencount
#>                                                         <chr>  <int>      <int> <int>         <int>
#>  1                                                     202422 202422          0     0        154282
#>  2                                              202422-846491 846491     202422     1          2666
#>  3                                       202422-846491-660046 660046     846491     2          2654
#>  4                                202422-846491-660046-846497 846497     660046     3             7
#>  5                         202422-846491-660046-846497-846508 846508     846497     4             6
#>  6                  202422-846491-660046-846497-846508-846553 846553     846508     5             5
#>  7           202422-846491-660046-846497-846508-846553-954935 954935     846553     6             3
#>  8      202422-846491-660046-846497-846508-846553-954935-5549   5549     954935     7             2
#>  9 202422-846491-660046-846497-846508-846553-954935-5549-5550   5550       5549     8             0
#> 10           202422-846491-660046-846497-846508-846553-954936 954936     846553     6             0
#> # ... with more rows

select certain fields

hiers %>% select(tsn, level)
#> # Source:   lazy query [?? x 2]
#> # Database: postgres 9.6.0 [sacmac@localhost:5432/ITIS]
#>       tsn level
#>     <int> <int>
#>  1 202422     0
#>  2 846491     1
#>  3 660046     2
#>  4 846497     3
#>  5 846508     4
#>  6 846553     5
#>  7 954935     6
#>  8   5549     7
#>  9   5550     8
#> 10 954936     6
#> # ... with more rows

Local versions of taxize functions

A few of the key functions from taxize have been ported to taxizedb. Support is currently limited to the NCBI taxonomy database.

children accesses the nodes immediately descending from a given taxon

children(3701, db='ncbi')
#> $`3701`
#>    childtaxa_id                                                     childtaxa_name childtaxa_rank
#> 1       1837063                         Arabidopsis thaliana x Arabidopsis halleri        species
#> 2       1547872                                              Arabidopsis umezawana        species
#> 3       1328956 (Arabidopsis thaliana x Arabidopsis arenosa) x Arabidopsis suecica        species
#> 4       1240361                         Arabidopsis thaliana x Arabidopsis arenosa        species
#> 5        869750                          Arabidopsis thaliana x Arabidopsis lyrata        species
#> 6        412662                                            Arabidopsis pedemontana        species
#> 7        378006                         Arabidopsis arenosa x Arabidopsis thaliana        species
#> 8        347883                                              Arabidopsis arenicola        species
#> 9        302551                                              Arabidopsis petrogena        species
#> 10        97980                                               Arabidopsis croatica        species
#> 11        97979                                            Arabidopsis cebennensis        species
#> 12        81970                                                Arabidopsis halleri        species
#> 13        59690                                             Arabidopsis kamchatica        species
#> 14        59689                                                 Arabidopsis lyrata        species
#> 15        45251                                               Arabidopsis neglecta        species
#> 16        45249                                                Arabidopsis suecica        species
#> 17        38785                                                Arabidopsis arenosa        species
#> 18         3702                                               Arabidopsis thaliana        species
#> 
#> attr(,"class")
#> [1] "children"
#> attr(,"db")
#> [1] "ncbi"

classification finds the lineage of a taxon

classification(3702, db='ncbi')
#> $`3702`
#>                    name         rank      id
#> 1    cellular organisms      no rank  131567
#> 2             Eukaryota superkingdom    2759
#> 3         Viridiplantae      kingdom   33090
#> 4          Streptophyta       phylum   35493
#> 5        Streptophytina    subphylum  131221
#> 6           Embryophyta      no rank    3193
#> 7          Tracheophyta      no rank   58023
#> 8         Euphyllophyta      no rank   78536
#> 9         Spermatophyta      no rank   58024
#> 10        Magnoliophyta      no rank    3398
#> 11      Mesangiospermae      no rank 1437183
#> 12       eudicotyledons      no rank   71240
#> 13           Gunneridae      no rank   91827
#> 14         Pentapetalae      no rank 1437201
#> 15               rosids     subclass   71275
#> 16              malvids      no rank   91836
#> 17          Brassicales        order    3699
#> 18         Brassicaceae       family    3700
#> 19           Camelineae        tribe  980083
#> 20          Arabidopsis        genus    3701
#> 21 Arabidopsis thaliana      species    3702
#> 
#> attr(,"class")
#> [1] "classification"
#> attr(,"db")
#> [1] "ncbi"

downstream finds all taxa descending from a taxon

downstream(3700, db='ncbi')
#> $`3700`
#>      childtaxa_id                                                     childtaxa_name     rank
#> 1         2071891                                                     Draba taylorii  species
#> 2         2071524                                                  Rorippa tenerrima  species
#> 3         2071523                                                Rorippa crystallina  species
#> 4         2071509                                                   Physaria calderi  species
#> 5         2071468                                                 Erysimum arenicola  species
#> 6         2071452                                                   Draba yukonensis  species
#> 7         2071451                                                   Draba thompsonii  species
#> ...
#> 326       1492251                                                 Erysimum lilacinum  species
#> 327       1492250                                              Erysimum leucanthemum  species
#> 328       1492249                                               Erysimum leptostylum  species
#> 329       1492248                                              Erysimum leptophyllum  species
#> 330       1492247                                               Erysimum leptocarpum  species
#> 331       1492246                                                Erysimum ledebourii  species
#> 332       1492245                                                Erysimum laxiflorum  species
#> 333       1492244                                                  Erysimum kurdicum  species
#>  [ reached getOption("max.print") -- omitted 2880 rows ]
#>
#> attr(,"class")
#> [1] "downstream"
#> attr(,"db")
#> [1] "ncbi"

All of these functions run very fast. It only takes a few seconds to find all bacterial taxa and count them:

downstream(2, db='ncbi')[[1]] %>%
    dplyr::group_by(rank) %>%
    dplyr::count()
#> [1] 138695
#> # A tibble: 18 x 2
#> # Groups:   rank [18]
#>    rank                 n
#>    <chr>            <int>
#>  1 class               83
#>  2 family             483
#>  3 forma                4
#>  4 genus             3497
#>  5 no rank          37140
#>  6 order              198
#>  7 phylum             134
#>  8 species          97031
#>  9 species group       68
#> 10 species subgroup    10
#> 11 subclass             3
#> 12 subfamily            1
#> 13 subgenus             1
#> 14 suborder             8
#> 15 subphylum            1
#> 16 subspecies          10
#> 17 tribe                2
#> 18 varietas            21

Mapping functions

Several mapping functions are available for the NCBI taxonomy database:

# Map scientific or common names to taxonomy IDs
name2taxid("pig")
#> [1] "9823"

# Map taxonomy IDs to scientific names
taxid2name(9823)
#> [1] "Sus scrofa"

# Map taxonomy IDs to rank
taxid2rank(2)
#> [1] "superkingdom"


ropenscilabs/taxizedb documentation built on May 4, 2023, 7:05 a.m.