#' Assign traits to taxa at different taxonomic levels
#'
#' @description
#' A function for assigning traits to taxa at different taxonomic levels.
#'
#' @details
#' This function allows to obtain missing traits for a target taxon by taking traits from lower or to upper taxomic levels.
#' For instance, consider the case where the genus *Acroloxus* is present in the user dataset and the species *Acroloxus lacustris*
#' in the traits dataset. A simple merge would exclude *Acroloxus* from the tha analysis since Acroloxus and *A. lacustris*
#' would not match. The function assign_traits allows to assign *Acroloxus lacustris* traits to *Acroloxus*.
#' This function works also in the opposite direction. Consider the case where there are no traits for the target taxon and
#' the target taxon has not been identified at species level. The function assign_traits will assign the traits of the taxon nearest in the taxonomic tree
#' (e.g. Tanypodinae traits assign to *Ablabesmyia monilis*). Consider also these examples to understand the behaviour of this
#' function. For instance *Anabolia lombarda* is present in the user taxomic dataset while only *Anabolia nervosa* and *Anabolia* are present
#' in the trait database. In this case `assign_traits()` will assign only the traits of *Anabolia* to *A. lombarda*.
#' Moreover, let's assume that *Coelostoma* is present in the user dataset while only *Berosus* and *Crenitis punctatostriata* are present in the traits dataset.
#' Here `assign_traits()` will assign the scores of *Berosus* and *C. punctatostriata* to *Coelostoma* because they belong to the same family and there are no information at family level. \cr \cr
#' The function `assign_traits()` will calculate the taxonomic distance between the target taxa and the taxa used to assign the trait score. This distance
#' can be positive (e.g. Species to Genus) and negative (e.g. Genus to Species). The distance is measured assigning values as follows:
#' 1 (Species to Genus), 4 (Species to family), -3 (Family to Genus), etc. `assign_traits()` considers only the
#' taxonomic levels from Subspecies to Family (Subspecies, Species, Genus, Tribus, Subfamily, Family).\cr
#' The function `average_traits()` averages traits when traits from multiple taxa are assigned to the same taxon. \cr
#' Th function `sample_traits()` samples traits from one taxon when multiple taxa are assigned to the same taxon.
#'
#'
#' @param x Results of `aggregate_taxa()`.
#' @param trait_db A trait `data.frame` with a column `Taxa` and the other columns
#' containing the traits.
#' By default, the dataset used is the one from Tachet et al. (2010) for macroinvertebrates that
#' can be retrieved from
#' [freshwaterecology.info](https://www.freshwaterecology.info/) website
#' (Schmidt-Kloiber & Hering, 2015).
#' @param group Biotic group of interest. Possible values are `mi` for macroinvertebrates, `mf` for macrophytes and `fi` for fish.
#' The choice will set the right reference database for the specified group.
#' This option will not be considered if a custom reference database is provided. Default to `mi`.
#' @param tax_lev Taxonomic level at which the calculation has to be performed.
#' Default to `Taxa`, the maximum taxonomic level is `Family`.
#' @param dfref Reference dataset as used in the function `aggregate_taxa()`.
#' @param filter_by_distance Filter the results according to the taxonomic distance. Possible values are `pos`, `neg` or a positive integer.
#' @param col_blocks A vector containing the number of modalities for each trait. IF `NULL`, the default Tachet et al. (2010) `col_blocks` will be assigned.
#' @param type The type of variables specified in `trait_db`. Must be one of `F`, fuzzy, or `C`, continuous.
#'
#' @importFrom dplyr '%>%' mutate select left_join group_by summarise ungroup
#' @importFrom tidyr gather spread
#'
#' @examples
#' data(macro_ex)
#'
#' data_bio <- as_biomonitor(macro_ex)
#' data_agr <- aggregate_taxa(data_bio)
#' data_ts <- assign_traits(data_agr)
#'
#' # select only the nearest traits
#' data_ts_sub <- manage_traits(data_ts, method = "nearest+-")
#'
#' # averaging
#' data_ts_av <- average_traits(data_ts_sub)
#'
#' # traits random sampling
#' data_ts_st <- sample_traits(data_ts)
#' @seealso [aggregate_taxa]
#'
#' @export
#'
#' @export sample_traits
#'
#' @export average_traits
assign_traits <- function(x, trait_db = NULL, group = "mi", tax_lev = "Taxa", dfref = NULL, filter_by_distance = NULL) {
if (is.null(trait_db)) {
# check if x is of class biomonitoR and mi
classCheck(x)
if (inherits(x, "custom") & is.null(trait_db)) {
warning("It seems that you used your own reference database. Please check the consistency of the taxonomy used for calculating the index with those of your reference database to have reliable results.")
}
trait_db <- traitsTachet
} else {
trait_db$Taxa <- trimws(trait_db$Taxa)
classCheck(x)
}
if (is.null(dfref)) {
if (identical(group, "mi")) {
dfref <- mi_ref
}
if (identical(group, "mf")) {
dfref <- mf_ref
}
if (identical(group, "fi")) {
dfref <- fi_ref
}
} else {
dfref <- dfref
}
# merge the trait database with the reference database in order to scale
# traits across taxonomic levels
# [ , - 1] deletes the Taxa columns
ref <- merge(dfref, trait_db, by = "Taxa", sort = FALSE)[, -1]
# create a data.frame with the same column as trait_db but with 0 rows
# it will be important later to iterate rbind to this object
trait.interm <- data.frame(trait_db[-c(1:nrow(trait_db)), ])
# ref.na is a 0 length vector to store the taxa names of the selected rows
# this allow to does not loose the information about the name of the taxa at the orginial
# taxonomic level
ref.na <- c()
# 10 to 5 because it is intended to work from subspecises to family
# cycle to scale the traits among taxonomic levels
for (i in 10:5) {
temp <- ref[, -which(c(1:10) != i)]
names(temp)[1] <- "Taxa"
temp <- temp[temp[, 1] != "", ]
ref.name <- ref[rownames(ref) %in% rownames(temp), 5:10]
ref.name <- apply(ref.name, 1, function(x) (rev(x)[rev(x) != ""][1]))
ref.na <- c(ref.na, ref.name)
trait.interm <- rbind(trait.interm, temp)
}
trait_db <- trait.interm
DF <- x[["Tree"]]
# allow the user to work at a desired taxonomic level
if (!tax_lev %in% c(
"Family", "Subfamily", "Tribus", "Genus",
"Species", "Subspecies", "Taxa"
)) {
stop("Maximum taxonomic level is family.")
}
if (!identical(tax_lev, "Taxa")) {
# set values to "" for the taxonomic levels lower than specified
DF <- DF[, 1:11]
DF[colnames(DF)[(which(colnames(DF) %in% tax_lev) + 1):11]] <- ""
# remove duplicated rows
DF <- DF[!duplicated(DF), ]
# change Taxa to the taxa of the required taxonomic level
DF[, 11] <- DF[, tax_lev]
}
# DFtaxa stores the taxa present in the user database
DFtaxa <- as.character(DF[11])
result.list <- apply(DF, 1, function(x) traitS(x = x, y = DF, z = trait_db, w = ref.na))
result.data.frame <- do.call(rbind, result.list)
result.data.frame <- result.data.frame[result.data.frame$Taxa_db != "", ]
result.data.frame.single <- result.data.frame
unique.taxa <- unique(result.data.frame.single$Taxa_db)
# deleting row for the following reason. It happens that traits are present for a species
# and also for the genus of this species. If the user sample
# contains a species other than that reported in the trait list we want only to keep the
# trait at genus level
for (i in 1:length(unique.taxa)) {
res <- result.data.frame.single[result.data.frame.single$Taxa_db %in% unique.taxa[i], 1:3]
res.sum <- (res[, 1] == res[, 2]) + (res[, 1] == res[, 3]) + (res[, 2] == res[, 3])
if (any(res.sum == 0)) {
if (sum(res.sum) != 0) {
to.del <- which(result.data.frame.single$Taxa_db %in% unique.taxa[i])[res.sum == 0]
result.data.frame.single <- result.data.frame.single[-to.del, ]
}
}
}
ref_long <- data.frame(Taxonomic_level = character(), Taxa = character(), stringsAsFactors = FALSE)
for (i in 10:1) {
temp <- as.character(dfref[, i])
temp <- temp[temp != ""]
temp.rep <- rep(names(ref[, i, drop = FALSE]), length(temp))
temp.df <- data.frame(Taxonomic_level = temp.rep, Taxa = temp)
ref_long <- rbind(ref_long, temp.df)
}
ref_long <- ref_long[!duplicated(ref_long), ]
taxa_db.taxlev <- ref_long[match(ref_long$Taxa, result.data.frame.single$Taxa_db), ]
result.data.frame.single$Taxa_db <- as.character(result.data.frame.single$Taxa_db)
result.data.frame.single$Traits_real <- as.character(result.data.frame.single$Traits_real)
ref_long$Taxa <- as.character(ref_long$Taxa)
taxa_db.taxlev <- inner_join(result.data.frame.single[, 1, drop = FALSE], ref_long, by = c("Taxa_db" = "Taxa"))
traits.taxlev <- inner_join(result.data.frame.single[, 2, drop = FALSE], ref_long, by = c("Traits_real" = "Taxa"))
names(taxa_db.taxlev) <- c("Taxa_taxlev", "Taxa_db")
names(traits.taxlev) <- c("Traits_taxlev", "Traits_real")
tax_lev.info <- data.frame(taxa_db.taxlev, traits.taxlev, stringsAsFactors = FALSE)
dist.taxlev <- data.frame(tax_lev = names(dfref)[-11], distance = c(10:1))
dist.taxlev$tax_lev <- as.character(dist.taxlev$tax_lev)
tax_lev.info$Taxa_db <- as.character(tax_lev.info$Taxa_db)
tax_lev.info$Traits_real <- as.character(tax_lev.info$Traits_real)
a <- inner_join(tax_lev.info[, c(2, 4)], dist.taxlev, by = c("Taxa_db" = "tax_lev"))[, 3]
b <- inner_join(tax_lev.info[, c(2, 4)], dist.taxlev, by = c("Traits_real" = "tax_lev"))[, 3]
tax_lev.info$Taxonomic_distance <- a - b
names(tax_lev.info)[1] <- "Taxa"
final.traits <- data.frame(tax_lev.info, result.data.frame.single[, -c(1:3)])
rownames(final.traits) <- NULL
if (is.null(filter_by_distance)) {
final.traits
} else {
if (is.character(filter_by_distance)) {
if (filter_by_distance == "pos") {
final.traits[final.traits$Taxonomic_distance >= 0, ]
} else {
if (filter_by_distance == "neg") {
final.traits[final.traits$Taxonomic_distance <= 0, ]
} else {
stop("pos, neg or an integer are needed when filter_by_distance is not NULL")
}
}
} else {
final.traits[abs(final.traits$Taxonomic_distance) <= abs(filter_by_distance), ]
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.