Introduction

Taxonomic databases archive taxonomic classifications of discovered biological organisms. An ideal taxonomic database unfalteringly reflects the richness of biodiversity of a given taxa of a region. It also harbours comprehensive hierarchical taxonomic classifications from the highest (i.e. kingdom) to the lowest (i.e. species, subspecies, form or variety) ranks.

Nevertheless, organismal collections, identifications and documentations rarely, if not never, reach the ultimate goal of taxonomic collections and archival. A two-way challenge may contribute to the incompleteness of taxonomy. First, from time to time, an identifier fails to identify an organism to the finest taxonomic level in laboratory, which can be due to the lack of knowledge or appropriate identification guildes. Secondly, discovering all biodiversity o the earth by itself is just an impossible mission. The reality is that most biodiversity on the earth remains and will likely remain undiscovered.

As gathering all taxonomic diversity is extremely difficult, scientists resort to estimate global and regional taxonomic diversity. It has revealed that taxonomic diversity is patterning. At both global and local scales, higher taxonomic group can be used to predict the richness of lower taxonomic group [ref1, ref2]. Likewise, common taxonomic groups can also serve as basis to predict diversity of rare taxonomic groups, whose lack of documentations may arise from the insufficiency of investigation [ref1].

Here we harvest the property of taxonomic patterning, creating an open R package biotaxa with a purpose to help biodiversity discovery and conservation.

biotaxa visualizes the the accumulation of discovered taxonomic diversity at the temporal scale. It also computes the percentage of identification precision of a given taxa, and provides the rank of completeness. It also allows users to conduct logistic and Michaelis-Menten model fittings of taxanomic accumulations.

knitr::opts_chunk$set(echo = TRUE)

Use the package

devtools::install_github("hhsieh/biotaxa_Rpackage")

library(biotaxa)

Imported data should be formatted with the following columns: Kingdoms, Phyla, Classes, Orders, Families, Genera and AphiaIDs. And, currently, the imported dataset needs to be given a name data_m.

Below is an example

#load the dataset of interest
#check the dimension of the dataset
library(biotaxa)
dim(data_m)
#The column names of the dataset should include year and all required taxonomic hierarchical levels. Nevertheless, they do not need to be in the same order as that of the example dataset.
colnames(data_m)

Functions in biotaxa

  1. taxaaccum()
library(data.table)
library(dplyr)
library(ggplot2)
taxaaccum <- function(taxa, rank) {
  df <- subset(data_m, Kingdoms == taxa | Phyla == taxa | Classes == taxa | Orders == taxa | Families == taxa | Genera == taxa)
  dt = as.data.table(unique(df))
  setkey(dt, "year")
  if (rank == "Phylum") {
    dt[, id := as.numeric(factor(Phyla, levels = unique(Phyla)))]
    ranklabel = "phyla"
  } else if (rank == "Class") {
    dt[, id := as.numeric(factor(Classes, levels = unique(Classes)))]
    ranklabel = "classes"
  } else if (rank == "Order") {
    dt[, id := as.numeric(factor(Orders, levels = unique(Orders)))]
    ranklabel = "orders"
  } else if (rank == "Family") {
    dt[, id := as.numeric(factor(Families, levels = unique(Families)))]
    ranklabel = "families"
  } else if (rank == "Genus") {
    dt[, id := as.numeric(factor(Genera, levels = unique(Genera)))]
    ranklabel = "genera"
  } else if (rank == "Species") {
    dt[, id := as.numeric(factor(AphiaIDs, levels = unique(AphiaIDs)))]
    ranklabel = "species"
  }
  setkey(dt, "year", "id")
  dt.out <- dt[J(unique(year)), mult = "last"]#[, Phylum := NULL]
  dt.out[, id := cummax(id)]
  numtaxa <- cummax(as.numeric(factor(dt$id)))
  taxa_dt <- aggregate(numtaxa, list(year = dt$year), max )
  colnames(taxa_dt) <- c("year", "taxacount")
  minx <- min(as.vector(taxa_dt$year))
  maxx <- max(as.vector(taxa_dt$year))
  ylab = paste("Number of", ranklabel, sep = " ")
  p <- ggplot(taxa_dt, aes(x = year, y = taxacount, colour = "#FF9999")) + geom_point(colour = "cornflowerblue")
  p <- p + labs(x = "Year", y = ylab) + ggtitle(taxa) + scale_x_discrete(breaks = c(seq(minx, maxx, 25))) + theme(legend.position = "none", axis.text.x = element_text(angle = 60, hjust = 1), axis.text.y = element_text(angle = 60, hjust = 1), axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0)))

  p

}

taxaaccum() generates the accumulation curve of taxa in a lower rank belong to the same higher taxa. For instance, the following example generates the accumulation curve of discovered phyla belong to Animalia. Note that here taxa needs to be in higher order than rank.

#Example
taxaaccum(taxa = "Animalia", rank = "Phylum")
  1. taxamodel()
library(data.table)
library(drc)
taxamodel <- function(taxa, rank, method) {
  tryCatch({
    df <- subset(data_m, Kingdoms == taxa | Phyla == taxa | Classes == taxa | Orders == taxa | Families == taxa | Genera == taxa)
    dt = as.data.table(unique(df))
    setkey(dt, "year")
    if(rank == "Phylum") {
      dt[, id := as.numeric(factor(Phyla, levels = unique(Phyla)))]
      ranklabel = "phyla"
    } else if(rank == "Class") {
      dt[, id := as.numeric(factor(Classes, levels = unique(Classes)))]
      ranklabel = "classes"
    } else if(rank == "Order") {
      dt[, id := as.numeric(factor(Orders, levels = unique(Orders)))]
      ranklabel = "orders"
    } else if(rank == "Family") {
      dt[, id := as.numeric(factor(Families, levels = unique(Families)))]
      ranklabel = "families"
    } else if(rank == "Genus") {
      dt[, id := as.numeric(factor(Genera, levels = unique(Genera)))]
      ranklabel = "genera"
    } else if(rank == "Species") {
      dt[, id := as.numeric(factor(AphiaIDs, levels = unique(AphiaIDs)))]
      ranklabel = "species"
    }
    setkey(dt, "year", "id")
    dt.out <- dt[J(unique(year)), mult = "last"]#[, Phylum := NULL]
    dt.out[, id := cummax(id)]
    numtaxa <- cummax(as.numeric(factor(dt$id)))
    taxa_dt <- aggregate(numtaxa, list(year = dt$year), max )
    colnames(taxa_dt) <- c("year", "taxacount")

    minx <- min(as.vector(taxa_dt$year))
    maxx <- max(as.vector(taxa_dt$year))
    ylab = paste("Number of", ranklabel, sep = " ")
    p <- ggplot(taxa_dt, aes(x = year, y = taxacount, colour = "#FF9999", group = 1
    )) + geom_point(colour = "cornflowerblue")
    p <- p + labs(x = "Year", y = ylab) + ggtitle(taxa) + scale_x_discrete(breaks = c(seq(minx, maxx, 25))) + theme(legend.position = "none", axis.text.x = element_text(angle = 60, hjust = 1), axis.text.y = element_text(angle = 60, hjust = 1), axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0)))

    if(method == "Michaelis-Menten") {

      # refer to this page https://stackoverflow.com/questions/27547548/solving-error-message-step-halving-factor-reduced-below-minimum-in-nls-step-a

      N_obs <- taxa_dt$'taxacount'
      times <- c(taxa_dt$year)

      model.drm <- drm(N_obs ~ times, data = data.frame(N_obs = N_obs, times = times), fct = MM.2())

      newtimes <- times
      preds <- suppressWarnings(predict(model.drm, times = newtimes, interval = "prediction", level = 0.95))

      LW = preds[,2]
      UP = preds[,3]
      corr_coef <- cor(N_obs, predict(model.drm))
      p <- p + geom_line(data = data.frame(preds, taxa_dt$year), aes(taxa_dt$year, Prediction), colour = "#FF9999")
      p <- p + geom_ribbon(aes(ymin = LW, ymax = UP), linetype = 2, alpha = 0.1)
      p
    } else if (method == "logistic") {

      N_obs <- taxa_dt$'taxacount'
      times <- c(taxa_dt$year)

      ryegrass.m1 <- drm(N_obs ~ times, data = data.frame(N_obs = N_obs, times = times), fct = L.4())

      pred <- suppressWarnings(as.data.frame(predict(
        ryegrass.m1,
        newdata = data.frame(N_obs = N_obs, times = times),
        interval = "prediction", level = 0.95)));
      pred$times <- times;

      LW = pred[,2]
      UP = pred[,3]

      p <- p + geom_line(data = data.frame(pred, taxa_dt$year), aes(taxa_dt$year, Prediction), colour = "#FF9999")
      p <- p + geom_ribbon(aes(ymin = LW, ymax = UP), linetype = 2, alpha = 0.1)
      p
    }
  }#, error = function(e) {list(taxa = taxa, rank = rank, method = method, corr_coef = cat("model fails to converge", "\n"))}
  )
}

taxamodel() generates explanation curve of the taxa accumulation of a lower taxon belong to the same higher taxon based on one of the user-defined models: logistic or Michaelis-Menten models.

#example
taxamodel("Animalia", "Phylum", "logistic")
  1. taxamodel_corr()

taxamodel_corr() yields the correlation coefficient of the selected model. Although a model may present a high correlation coefficient between the real data and the predictions, a high correlation coefficient does not necessarily mean a goodness of fit. Visualisation can help the selection of a good model.

taxamodel_cor <- function(taxa, rank, method) {
  tryCatch({
    df <- subset(data_m, Kingdoms == taxa | Phyla == taxa | Classes == taxa | Orders == taxa | Families == taxa | Genera == taxa)
    dt = as.data.table(unique(df))
    setkey(dt, "year")
    if(rank == "Phylum") {
      dt[, id := as.numeric(factor(Phyla, levels = unique(Phyla)))]
      ranklabel = "phyla"
    } else if(rank == "Class") {
      dt[, id := as.numeric(factor(Classes, levels = unique(Classes)))]
      ranklabel = "classes"
    } else if(rank == "Order") {
      dt[, id := as.numeric(factor(Orders, levels = unique(Orders)))]
      ranklabel = "orders"
    } else if(rank == "Family") {
      dt[, id := as.numeric(factor(Families, levels = unique(Families)))]
      ranklabel = "families"
    } else if(rank == "Genus") {
      dt[, id := as.numeric(factor(Genera, levels = unique(Genera)))]
      ranklabel = "genera"
    } else if(rank == "Species") {
      dt[, id := as.numeric(factor(AphiaIDs, levels = unique(AphiaIDs)))]
      ranklabel = "species"
    }
    setkey(dt, "year", "id")
    dt.out <- dt[J(unique(year)), mult = "last"]#[, Phylum := NULL]
    dt.out[, id := cummax(id)]
    numtaxa <- cummax(as.numeric(factor(dt$id)))
    taxa_dt <- aggregate(numtaxa, list(year = dt$year), max )
    colnames(taxa_dt) <- c("year", "taxacount")
    N_obs <- taxa_dt$'taxacount'
    times <- as.numeric(taxa_dt$year)
    if(method == "logistic") {

      ryegrass.m1 <- suppressWarnings(drm(N_obs ~ times, data = data.frame(N_obs = N_obs, times = times), fct = L.4()))
      corr_coef <- cor(N_obs, predict(ryegrass.m1))
      res <- list(taxa=taxa, rank=rank, method=method, corr_coef=corr_coef)
      return(res)

    } else if(method == "Michaelis-Menten") {
      model.drm <- suppressWarnings(drm(N_obs ~ times, data = data.frame(N_obs = N_obs, times = times), fct = MM.2()))

      corr_coef <- cor(N_obs, predict(model.drm))
      res <- list(taxa=taxa, rank=rank, method=method, corr_coef=corr_coef)
      return(res)
    } else if(method == "Asymtopic_Regression_Model") {
      model.drm <- suppressWarnings(drm(N_obs ~ times, data = data.frame(N_obs = N_obs, times = times), fct = AR.3()))
      corr_coef <- cor(N_obs, predict(model.drm))
      res <- list(taxa=taxa, rank=rank, method=method, corr_coef=corr_coef)
      return(res)
    }
  }#, error = function(e) {list(taxa = taxa, rank = rank, method = method, corr_coef = cat("model fails to converge", "\n"))}
    )
}
#example
taxamodel_corr(taxa = "Animalia", rank = "Genus", method = "logistic")
  1. taxaprecision()

taxaprecision()computes the percentage of records of a taxa being identified to the species level.

taxaprecision <- function(taxa) {
  taxa <- subset(data_m, Kingdoms == taxa | Phyla == taxa | Classes == taxa | Orders == taxa | Families == taxa | Genera == taxa)
  species_complete <- which(taxa$AphiaIDs != "")
  all_species <- dim(taxa)[[1]][1]
  species_precision = length(species_complete) / all_species
  return(species_precision)
}
#example: compute the percentage of organisms of Isodictya identified to species
taxaprecision("Isodictya")
  1. frequencyrank()

frequencyrank() reports the frequencies of a lower taxa belong to the same higher taxa

frequencyrank <- function(taxa, rank) {
  library(dplyr, warn.conflicts = FALSE)
  df <- as.data.frame(subset(data_m, Kingdoms == taxa | Phyla == taxa | Classes == taxa | Orders == taxa | Families == taxa | Genera == taxa))
  if(rank == "Phylum") {
    df_mid <- count(df, df$Phyla)
  } else if(rank == "Class") {
    df_mid <- count(df, df$Classes)
  } else if(rank == "Order") {
    df_mid <- count(df, df$Order)
  } else if(rank == "Family") {
    df_mid <- count(df, df$Families)
  } else if(rank == "Genus") {
    df_mid <- count(df, df$Genera)
  }
  df_mid <- as.data.frame(df_mid)
  colnames(df_mid) <- c(taxa, "freq")
  df_end <- df_mid[with(df_mid, order(-freq)),]
  colnames(df_end) <- c(rank, "freq")
  return(df_end)
}
#example: rank and report phyla frequecies of Plantae
frequencyrank("Plantae", "Phylum")
  1. topn()

topn() provides the n most frequent lower-ranked taxa belong to the same higher taxa

topn <- function(taxa, rank, n) {
  df <- as.data.frame(subset(data_m, Kingdoms == taxa | Phyla == taxa | Classes == taxa | Orders == taxa | Families == taxa | Genera == taxa))
  if(rank == "Phylum") {
    df_mid <- count(df, df$Phyla)
  } else if(rank == "Class") {
    df_mid <- count(df, df$Classes)
  } else if(rank == "Order") {
    df_mid <- count(df, df$Order)
  } else if(rank == "Family") {
    df_mid <- count(df, df$Families)
  } else if(rank == "Genus") {
    df_mid <- count(df, df$Genera)
  }
  df_mid <- as.data.frame(df_mid)
  colnames(df_mid) <- c(taxa, "freq")
  df_end <- df_mid[with(df_mid, order(-freq)),]
  df_end_n <- df_end[c(1:n),]
  colnames(df_end_n) <- c(rank, "freq")
  return(df_end_n)
}
#example: return the 5 most frequent phyla of Animalia and reports their frequencies; n is a user-defined value.
topn("Animalia", "Phylum", 5)
  1. OBIS_impresisionrate()

OBIS_imprecisionrate() computes the percentage of occurrences of a taxa unidentified to the species level

OBIS_imprecisionrate <- function(taxa) {
  dd <- robis::occurrence(taxa)$species
  incomplete <- length(which(is.na(dd == TRUE)))
  imprecisionrate <- incomplete / length(dd)
  return(imprecisionrate)
}
#example: return the imprecision rate of "Abra".
OBIS_imprecisionrate("Abra")
  1. alltaxalist()

alltaxalist() returns all taxa of all ranks in a dataset

alltaxalist <- function(data_m) {
  Kingdoms <- levels(data_m$Kingdoms)
  Phyla <- levels(data_m$Phyla)
  Classes <- levels(data_m$Classes)
  Orders <- levels(data_m$Orders)
  Families <- levels(data_m$Families)
  Genera <- levels(data_m$Genera)
  ranks <- c(rep("Kingdom", length(Kingdoms)), rep("Phylum", length(Phyla)), rep("Class", length(Classes)), rep("Order", length(Orders)), rep("Family", length(Families)), rep("Genus", length(Genera)))
  taxa <- c(Kingdoms, Phyla, Classes, Orders, Families, Genera)
  ddd <- data.frame(taxa, ranks)
  colnames(ddd) <- c("taxa", "rank")
  return(ddd)
}
#example: return all taxa in data_m
alltaxa <- alltaxalist(data_m)
alltaxa[c(10:20), ]
  1. spetaxalist()

spetaxalist() returns all taxa of one rank in a dataset

spetaxalist <- function(data_m, rank) {
  if(rank == "Kingdom") {
    taxa = levels(data_m$Kingdoms)
  } else if(rank == "Phylum") {
    taxa = levels(data_m$Phyla)
  } else if(rank == "Class") {
    taxa = levels(data_m$Classes)
  } else if(rank == "Order") {
    taxa = levels(data_m$Orders)
  } else if(rank == "Family") {
    data_m$Families = as.factor(data_m$Families)
    taxa = levels(data_m$Families)
  } else if(rank == "Genus") {
    taxa = levels(data_m$Genera)
  }
  return(taxa)
}
spelist <- spetaxalist(data_m, "Phylum")
tail(spelist)


hhsieh/biotaxa_Rpackage documentation built on May 12, 2019, 4:26 p.m.