#' Import data into biomonitoR format
#'
#' @description
#' This function merge the user `data.frame` with a reference dataset and suggest corrections for mispelled names.
#'
#' @param x A `data.frame` with a column called `Taxa` where store taxa names and samples on the other columns (see the example `macro_ex`).
#' @param group Biotic group of interest. Possible values are `mi` for macroinvertebrates, `mf` for macrophytes, `fi` for fish and `di` for diatoms. The choice will set the right reference dataset for the specified group.
#' This option will not be considered if a custom reference dataset is provided. Default to `mi`.
#' @param dfref A custom reference database that replaces the built-in reference databases.
#' @param to_change A `data.frame` specifying the taxa name that needs to be changed.
#' This `data.frame` needs a column called *Taxon* containing taxa to aggregate and a column called *Correct_Taxon* with the aggregation specifications.
#' By default, when group is set to `mi` Hydracarina, Hydracnidia and Acariformes are changed to Trombidiformes.
#' @param FUN The function to be applied for aggregating rows with duplicated taxa names.
#' It should be `sum` for abundances, while it should be `bin` for presence-absence data. Default to `sum`.
#' @param correct_names If `TRUE` alternative names will be suggested for taxa not found in the reference dataset. Default to
#' `FALSE`, with which the unrecognized taxa will be removed.
#' @param traceB Track changes in taxa names.
#'
#' @keywords as_biomonitor
#' @details The function `as_biomonitor()` checks the taxonomy of the `data.frame` provided by the user and suggests correction for mispelled names.
#' If one or more taxa names of `x` are not present in the reference dataset or the spell checker is not able to find any suggestion the user is asked to exit.
#' This behaviour is to assure consistency with other functions implemented in `biomonitoR`.
#' Default references databases are provided for macroinvertebrates, macrophytes and fish.
#' Reference datasets heavily rely on the information provided by the [freshwaterecology.info](https://www.freshwaterecology.info/) website.
#' If `dfref` is not `NULL` a custom dictionary will be saved in the working directory to let the `as_biomonitor ()` function work correctly.
#' If you are unable to build a reference database by your own please check the function \code{\link{ref_from_tree}} for a possible solution.
#' `as_biomonitor()` returns an object of class `asb` togheter with one of the classes `abundance` or `bin`.
#' The function \code{\link{quick_rename}} works as the `as_biomonitor()` but returns
#' a `data.frame` without the `biomonitoR` format.
#' `as_biomonitor()` aggregates all the rows with the same taxa name with the option `FUN` and converts all the `NA` to 0.
#' When `group = mi` Hydracarina, Hydracnidia or Acariformes are changed to Trombidiformes given the uncertain taxonomic status of this group.
#'
#' @importFrom stats aggregate
#' @importFrom utils select.list stack
#' @importFrom hunspell dictionary hunspell_check hunspell_suggest
#' @export
#' @seealso [quick_rename] [ref_from_tree]
#' @references Schmidt-Kloiber, A., & Hering, D. (2015). www.freshwaterecology.info -
#' An online tool that unifies, standardises and codifies more than
#' 20,000 European freshwater organisms and their ecological preferences.
#' Ecological indicators, 53, 271-282.
#' @examples
#' data(mi_prin)
#' data_bio <- as_biomonitor(mi_prin, group = "mi")
as_biomonitor <- function(x, group = "mi", dfref = NULL, to_change = "default", FUN = sum, correct_names = FALSE, traceB = FALSE) {
# check if user database contains a column called Taxa
if (!"Taxa" %in% names(x)) {
stop("A column called Taxa is needed")
}
asb.call <- as.character(as.list(match.call())[["FUN"]])
if (length(asb.call) == 0) {
asb.call <- "sum"
}
# check if columns other than Taxa are numeric
# position of column Taxa
col.taxa <- which(names(x) == "Taxa")
col.class <- sapply(x[, -col.taxa], is.numeric)
if (any(col.class == FALSE)) {
stop("Non-numeric columns other than Taxa are not allowed")
}
# check if NAs are present. If present they are changed to 0.
check.na <- any(is.na(x))
x[is.na(x)] <- 0
# initialize check.pa
check.pa <- FALSE
if (!is.null(to_change) & !is.data.frame(to_change) & !identical(to_change, "default")) (stop("to_change needs to be NULL, default or data.frame as specified in the help"))
if (is.data.frame(to_change)) {
if (length(unique(to_change$Taxon)) != nrow(to_change)) (stop("the same name cannot be present twice in the Taxon column of the data.frame to_change"))
if (nrow(to_change) == 0) (stop("to_change must have at least one entry"))
}
if (!any(x[, !colnames(x) %in% "Taxa"] > 1) & all(x[, !colnames(x) %in% "Taxa"] %% 1 == 0) & !identical(asb.call, "bin")) {
(warning("Presence-absence data detected but FUN is not set to bin. Is it this what you want?"))
check.pa <- TRUE
}
if (any(x[, !colnames(x) %in% "Taxa"] > 1) & all(x[, !colnames(x) %in% "Taxa"] %% 1 == 0) & !identical(asb.call, "sum")) (warning("Abundance data detected but FUN is not set to sum. Is it this what you want?"))
if (any(x[, !colnames(x) %in% "Taxa"] %% 1 != 0)) warning("Decimal numbers detected. Please check carefully which FUN to use.")
# set the reference database for the specified group
if (group == "mi") {
# macroinvertebrates
ref <- mi_ref
}
if (group == "mf") {
# macrophytes
ref <- mf_ref
}
if (group == "fi") {
# fish
ref <- fi_ref
}
if (group == "di") {
# fish
ref <- di_ref
}
# allow the users to use their own reference database
if (!is.null(dfref)) {
dfref[is.na(dfref)] <- ""
dfref <- as.data.frame(unclass(dfref))
ref <- dfref
newDictio(ref)
group <- "custom"
}
# select or create dictinoaries
if (!identical(group, "custom")) {
if (identical(group, "mi")) {
dic.path <- system.file("dict", "mi_dictionary.txt", package = "biomonitoR")
# very important to set cache equal to FALSE, otherwise suggestNames will provide inconsistent results.
dictio <- dictionary(dic.path, cache = FALSE)
}
if (identical(group, "mf")) {
dic.path <- system.file("dict", "mf_dictionary.txt", package = "biomonitoR")
dictio <- dictionary(dic.path, cache = FALSE)
}
if (identical(group, "fi")) {
dic.path <- system.file("dict", "fi_dictionary.txt", package = "biomonitoR")
dictio <- dictionary(dic.path, cache = FALSE)
}
if (identical(group, "di")) {
dic.path <- system.file("dict", "di_dictionary.txt", package = "biomonitoR")
dictio <- dictionary(dic.path, cache = FALSE)
}
}
if (identical(group, "custom")) {
dic.path <- c(paste(getwd(), "/custom_dictio.dic", sep = ""))
dictio <- dictionary(dic.path, cache = F)
}
# change Taxa from factor to character
x$Taxa <- as.character(x$Taxa)
# change the name of taxa to lowercase and capital letter
x$Taxa <- trimws(sapply(x$Taxa, capWords, USE.NAMES = FALSE))
# change the Hydracarina, Hydracnidia or Acariformes changed to Trombidiformes
if (identical(to_change, "default")) {
to_change_mi[, "Taxon"] <- trimws(sapply(to_change_mi[, "Taxon"], capWords, USE.NAMES = FALSE))
to_change_mi[, "Correct_Taxon"] <- trimws(sapply(to_change_mi[, "Correct_Taxon"], capWords, USE.NAMES = FALSE))
if (any(to_change_mi[, "Taxon"] %in% x$Taxa)) {
# store results for traceB
to_store <- to_change_mi[to_change_mi$Taxon %in% x$Taxa, ]
change_uni <- x[x$Taxa %in% to_change_mi$Taxon, "Taxa", drop = TRUE]
for (i in 1:length(change_uni)) {
x[x$Taxa %in% change_uni[i], "Taxa"] <- to_change_mi[to_change_mi$Taxon %in% change_uni[i], "Correct_Taxon"]
}
}
}
# change according to user needs
if (is.data.frame(to_change)) {
to_change[, "Taxon"] <- trimws(sapply(to_change[, "Taxon"], capWords, USE.NAMES = FALSE))
to_change[, "Correct_Taxon"] <- trimws(sapply(to_change[, "Correct_Taxon"], capWords, USE.NAMES = FALSE))
if (any(to_change[, "Taxon"] %in% x$Taxa)) {
# store results for traceB
to_store <- to_change[to_change$Taxon %in% x$Taxa, ]
change_uni <- x[x$Taxa %in% to_change$Taxon, "Taxa", drop = TRUE]
for (i in 1:length(change_uni)) {
x[x$Taxa %in% change_uni[i], "Taxa"] <- to_change[to_change$Taxon %in% change_uni[i], "Correct_Taxon"]
}
}
}
# search for mispelled names
ref_taxa <- unique(ref$Taxa)
wrong_taxa <- unique(x$Taxa)
# get wrong names
wrong_taxa <- wrong_taxa[!wrong_taxa %in% ref_taxa]
if (length(wrong_taxa) > 0) {
# replace space with underscore to be compatible with hunspell
wrong_taxa <- gsub(" ", "_", wrong_taxa)
# nameCheck and nameSuggest check for the wrong names and suggest for correct names.
# hunspell_check and hunspell_suggest are from the package hunspell
name_suggest <- hunspell_suggest(wrong_taxa, dict = dictio)
name_suggest <- lapply(name_suggest, function(x)if(length(x) == 0) {"none"} else {x} )
names(name_suggest) <- wrong_taxa
names_suggest <- stack(name_suggest)
names_suggest$ind <- as.character(names_suggest$ind)
colnames(names_suggest) <- c("suggested", "excluded")
names_suggest <- names_suggest[, 2:1]
}
### INTERACTIVE EQUAL TO FALSE
# if correct names is equal to FALSE all the wrong names will be discarded
if (!correct_names) {
x <- x[!x$Taxa %in% wrong_taxa, , drop = FALSE]
} else { # nocov start
if (length(wrong_taxa) == 0) {
x <- x
} else {
temp <- rep(NA, length(wrong_taxa)) # vector to store user choices
for (i in 1:length(wrong_taxa)) {
choice <- names_suggest[names_suggest$excluded %in% wrong_taxa[i], 2, drop = TRUE] # choices provided to the user
# provide alternative names to the user and if the user can't find the correct name he must exit
temp[i] <- select.list(choice, title = wrong_taxa[i])
if (temp[i] == "exit") stop()
}
taxa_corrected <- data.frame("wrong_names" = wrong_taxa, "correct_names" = temp, stringsAsFactors = FALSE)
# change wrong names
for (i in 1:nrow(taxa_corrected)) {
x[x$Taxa %in% taxa_corrected$wrong_names[i], "Taxa"] <- taxa_corrected$correct_names[i]
}
}
} # nocov end
x <- aggregate(. ~ Taxa, data = x, FUN = FUN)
if (check.pa) {
x <- data.frame(x[, 1, drop = FALSE], (x[, -1, drop = FALSE] > 0) * 1, stringsAsFactors = FALSE)
message("data imported as presence absence")
}
if (check.na) {
message("NA detected, transformed to 0")
}
# merge reference database to the user data.frame
taxa_def <- merge(ref, x, by = "Taxa", all = FALSE)
# reorder the columns
taxa_def <- taxa_def[, c(2:11, 1, 12:ncol(taxa_def)), drop = FALSE]
if (!traceB) {
taxa_def <- list(taxa_db = taxa_def)
} else {
if (!correct_names) {
if (length(wrong_taxa) == 0) {
if (!exists("to_store", inherits = FALSE)) {
taxa_def <- list(taxa_db = taxa_def)
} else {
taxa_def <- list(taxa_db = taxa_def, corrected_names = to_store)
}
} else {
if (!exists("to_store", inherits = FALSE)) {
taxa_def <- list(taxa_db = taxa_def, suggested_taxa_names = names_suggest)
} else {
names(to_store) <- c("wrong_names", "correct_names")
rownames(to_store) <- NULL
taxa_def <- list(taxa_db = taxa_def, corrected_names = to_store, suggested_taxa_names = names_suggest)
}
}
} else {
if (length(wrong_taxa) == 0) {
if (!exists("to_store", inherits = FALSE)) {
taxa_def <- list(taxa_db = taxa_def)
} else {
taxa_def <- list(taxa_db = taxa_def, corrected_names = to_store)
}
} else {
if (!exists("to_store", inherits = FALSE)) {
taxa_def <- list(taxa_db = taxa_def, suggested_taxa_names = names_suggest)
} else {
names(to_store) <- c("wrong_names", "correct_names")
taxa_corrected <- rbind(to_store, taxa_corrected)
rownames(to_store) <- NULL
taxa_def <- list(taxa_db = taxa_def, corrected_names = taxa_corrected)
}
}
}
}
class(taxa_def) <- c("asb")
if (identical(asb.call, "bin")) {
class(taxa_def) <- c(class(taxa_def), "bin")
}
if (!is.null(dfref)) {
class(taxa_def) <- c(class(taxa_def), "custom")
}
if( length(wrong_taxa) > 0 & ! traceB){
message("Some taxa were excluded, check with traceB = TRUE")
}
taxa_def
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.