## Requirement: 'worrms' + 'tibble'
# Info: To reduce missing information in the environment column (while the climate column is completed), this function
# Info: checks whether the upper taxa are strictly marine, freshwater, brackish or terrestrial, in order to infer the
# Info: environment of a species.
# Note: The argument "not_strict_rank" names the rank column for which the taxon does not need to live in a single environment.
# Note: For example, the genus Bufo lives both in freshwater and terrestrial environments, so if set to "GENUS", the output will be "freshwater".
# Note: Set it to "NA" (between quotation marks) to specify that habitats can be inferred only if all members of the rank strictly live in 1 habitat.
#
# Arguments:
#   data            Data frame containing "name_envir_col" and every column listed in "search_rank_col".
#   name_envir_col  Name of the column whose NA values are filled in.
#   division_number Maximum number of Aphia IDs sent to wm_record() in a single query.
#   not_strict_rank One of "search_rank_col", or the string "NA" (see the notes above).
#   search_rank_col Names of the rank columns; they are processed in the given order.
#   print_division  If TRUE, print the "start:end" index ranges of the query divisions.
# Value: tibble(data_replaced), i.e. the data after the replacements made by replace_values().
# Output labels: "marine", "freshwater", "terrestrial", and "both" (used when only the brackish flag is set).
# NOTE(review): relies on replace_values(), which is defined outside this block; it is presumed to fill only the NA
# NOTE(review): entries of "name_envir_col" (total_replacement = "NA") - confirm against its definition.
worms_ecology_upper_taxa = function(data, name_envir_col = "ENVIRONMENT", division_number = 200, not_strict_rank = "GENUS",
                                    search_rank_col = c("PHYLUM", "CLASS", "ORDER", "FAMILY", "GENUS"), print_division = FALSE){
  columns = c(name_envir_col, search_rank_col)
  if(!all(columns %in% colnames(data)) || any(duplicated(colnames(data))) || any(duplicated(columns)))
    stop("Some columns are not in the data or their names is duplicated.")
  if(!(not_strict_rank %in% search_rank_col) && not_strict_rank != "NA")
    stop('The "not_strict_rank" argument is neither "NA" (between quotation marks), nor in the "search_rank_col" names.')
  if(!is.numeric(division_number) || length(division_number) != 1 || is.na(division_number) || division_number < 1)
    stop('The "division_number" argument must be a single positive number.')
  chunk_size = floor(division_number)

  # Percentage of rows whose environment is still unknown (uses the user-supplied column name).
  missing_percent = function(d) round(sum(is.na(d[[name_envir_col]])) / nrow(d) * 100, 0)
  separator = "-----------------------------------------------------\n"
  data_replaced = data

  for(i in seq_along(search_rank_col)){
    rank_col = search_rank_col[i]
    relaxed = rank_col == not_strict_rank
    cat(paste("Checking the", rank_col, "habitats\n"))
    cat(separator)
    start_time_processing = Sys.time()
    if(i == 1){
      cat("\n")
      cat(paste0("Missing informations: ", missing_percent(data_replaced), "%\n"))
      cat("\n")
    }

    # Only taxa of rows that still lack an environment are looked up; NA names cannot be queried.
    data_to_correct = data_replaced[is.na(data_replaced[[name_envir_col]]), ]
    taxon_names = unique(as.character(data_to_correct[[rank_col]]))
    taxon_names = taxon_names[!is.na(taxon_names)]
    aphia_id = lapply(seq_along(taxon_names), function(j){
      cat(paste0(taxon_names[j], " (", j, "/", length(taxon_names), ")\n"))
      # A failed lookup is recorded as NA and filtered out below.
      tryCatch(wm_name2id(taxon_names[j]), error = function(e) NA)
    })
    duration = difftime(Sys.time(), start_time_processing)
    cat(separator)
    cat(paste("Aphia ID retrieved in:", round(duration[[1]], 2), units(duration), "\n"))
    start_time_processing = Sys.time()

    aphia = unlist(aphia_id)
    aphia = aphia[!is.na(aphia)]
    if(length(aphia) == 0){
      cat("\n")
      cat(paste("No Aphia IDs retrieved from WoRMS for the", rank_col, "\n"))
      cat("\n")
      next
    }

    # Split the IDs into consecutive divisions of at most "chunk_size" elements; the last one holds the remainder.
    start = seq(1, length(aphia), by = chunk_size)
    end = pmin(start + chunk_size - 1, length(aphia))
    if(print_division) print(paste(start, end, sep = ":"))

    for(g in seq_along(start)){
      cat("\n")
      cat(paste0("Division of queries: Query number ", g, "/", length(start), "\n"))
      infos = wm_record(as.numeric(aphia[start[g]:end[g]]))
      # Habitats are rebuilt for each division so that no row of a previous division leaks into this one.
      zone = vapply(seq_len(nrow(infos)), function(k){
        .worms_habitat_from_flags(infos$isMarine[k], infos$isFreshwater[k], infos$isBrackish[k],
                                  infos$isTerrestrial[k], relaxed)
      }, character(1))
      new_infos = data.frame(TAXA = infos$scientificname, ENVIRONMENT = zone)
      new_infos = new_infos[complete.cases(new_infos), ]
      data_replaced = replace_values(data_replaced, new_infos, variables_original = name_envir_col, variables_model = "ENVIRONMENT",
                                     id_original = rank_col, id_model = "TAXA", total_replacement = "NA")
      # The duration is cumulative since the end of the Aphia ID retrieval for this rank.
      duration = difftime(Sys.time(), start_time_processing)
      cat(paste("Informations processed in:", round(duration[[1]], 2), units(duration), "\n"))
      cat("\n")
      cat(paste0("Remaining NA: ", missing_percent(data_replaced), "%\n"))
      cat("\n")
      cat(separator)
      cat("\n")
      cat("\n")
    }
  }
  tibble(data_replaced)
}

# Helper: turn the four habitat flags of one record (each NA, 0 or 1) into a single habitat label, or NA_character_.
# The flags are given in the order marine, freshwater, brackish, terrestrial.
# - Exactly one flag documented: its label is returned when the flag equals 1.
# - All four flags documented: a label is returned when exactly one flag is 1 and the three others are 0.
#   When "relaxed" is TRUE, freshwater + terrestrial and freshwater + brackish also give "freshwater",
#   and marine + terrestrial and marine + brackish also give "marine".
# - Exactly two flags documented (only when "relaxed" is TRUE): if one of them is marine or freshwater (but not both)
#   and the other is brackish or terrestrial, the marine/freshwater label is returned when that flag equals 1.
# - Every other combination (none or three flags documented, etc.) gives NA_character_.
.worms_habitat_from_flags = function(marine, freshwater, brackish, terrestrial, relaxed = FALSE){
  flags = c(marine, freshwater, brackish, terrestrial)
  if(length(flags) != 4) return(NA_character_)
  labels = c("marine", "freshwater", "both", "terrestrial")
  known = !is.na(flags)
  n_known = sum(known)
  if(n_known == 1){
    if(flags[known] == 1) return(labels[known])
    return(NA_character_)
  }
  if(n_known == 4){
    if(sum(flags == 1) == 1 && sum(flags == 0) == 3) return(labels[flags == 1])
    if(relaxed){
      matches = function(pattern) all(flags == pattern)
      if(matches(c(0, 1, 0, 1)) || matches(c(0, 1, 1, 0))) return("freshwater")
      if(matches(c(1, 0, 0, 1)) || matches(c(1, 0, 1, 0))) return("marine")
    }
    return(NA_character_)
  }
  if(n_known == 2 && relaxed && xor(known[1], known[2])){
    primary = if(known[1]) 1 else 2
    if(flags[primary] == 1) return(labels[primary])
  }
  NA_character_
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.