## Requirement: 'taxizedb' + 'tibble' + 'taxize' + 'worms' + 'stringr'
# Info: This function takes the data with missing ecological informations and ambiguous status to check for new names and retrieve their ecology.
# Info: It first searches for accepted names on ITIS and WoRMS before lauching new research on Fishbase, OBIS, GBIF, VertNet & WoRMS for the new
# Info: species and the new genus. It then return the corrected dataframe.
# Note: See the function "worms_ecology_upper_taxa() for the argument "genus_not_strict".
correct_name_and_infos = function(data, data_na, genus_not_strict = T, name_species_col = "SPECIES", name_aphia_col = "APHIA_ID",
name_genus_col = "GENUS", name_status_col = "STATUS", name_envir_col = "ENVIRONMENT",
name_climate_col = "CLIMATE"){
na_species = data_na[[name_species_col]]
rm_NA = function(data){
data = data[!is.na(data)]
if(length(data) == 0) data = NA
data
}
no_message = function(code){
sink("NUL")
tmp = code
sink()
return(tmp)
}
no_worms = F
no_itis = F
cat("\n")
cat("Searching for accepted names in ITIS\n")
cat("---------------------------------------\n")
start.time = Sys.time()
na_species_itis = taxizedb::name2taxid(na_species, db = "itis", out_type = "summary")
if(nrow(na_species_itis) != 0){
na_species_itis = na_species_itis[!duplicated(na_species_itis), ]
checked_tsn = itis_acceptname(na_species_itis$id)
if(nrow(checked_tsn) != 0){
position_corrected_tsn = which(checked_tsn$submittedtsn != checked_tsn$acceptedtsn)
if(length(position_corrected_tsn) != 0){
initial_names_itis = taxizedb::taxid2name(as.numeric(checked_tsn$submittedtsn[position_corrected_tsn]), db = "itis")
final_names_itis = taxizedb::taxid2name(as.numeric(checked_tsn$acceptedtsn[position_corrected_tsn]), db = "itis")
final_itis = data.frame(APHIA_ID = rep(NA, length(initial_names_itis)), SPECIES = final_names_itis,
INITIAL_SPECIES = initial_names_itis, INITIAL_APHIA_ID = rep(NA, length(initial_names_itis)))
end.time = Sys.time()
diff.time = difftime(end.time, start.time)
cat(paste("Accepted names retrieved from ITIS in:", round(diff.time[[1]], 2), units(diff.time), "\n"))
cat("\n")
}
else no_itis = T
}
else no_itis = T
}
else no_itis = T
if(no_itis) warning("No new accepted names were found in the ITIS database.")
print(final_itis)
if(!all(is.na(data_na[[name_aphia_col]]))) {
cat("\n")
cat("Searching for accepted names in WoRMS\n")
cat("---------------------------------------\n")
start.time3 = Sys.time()
missing_aphia = data_na[!is.na(data_na[[name_aphia_col]]), ]
if(!no_itis) missing_aphia = missing_aphia[!missing_aphia[[name_species_col]] %in% initial_names_itis, ]
if(nrow(missing_aphia) != 0){
worms_accepted_names = no_message(worms::wormsaccepted(worms::wormsconsolidate(worms::wormsbyid(as.numeric(missing_aphia$APHIA_ID)))))
worms_accepted_names = worms_accepted_names[,c(1,3,9,10)]
worms_accepted_names = worms_accepted_names[complete.cases(worms_accepted_names), ]
position_corrected_aphia = which(worms_accepted_names$scientificname != worms_accepted_names$valid_name)
if(length(position_corrected_aphia) != 0){
final_worms = data.frame(APHIA_ID = worms_accepted_names[position_corrected_aphia, 1],
SPECIES = worms_accepted_names[position_corrected_aphia, 4],
INITIAL_SPECIES = worms_accepted_names[position_corrected_aphia, 2],
INITIAL_APHIA_ID = worms_accepted_names[position_corrected_aphia, 3])
end.time3 = Sys.time()
diff.time3 = difftime(end.time3, start.time3)
cat(paste("Accepted names retrieved from WoRMS in:", round(diff.time3[[1]], 2), units(diff.time3), "\n"))
cat("\n")
}
else no_worms = T
}
else no_worms = T
}
else no_worms = T
if(no_worms) warning("No new accepted names were found in the WoRMS database.")
print(final_worms)
if(no_itis && no_worms) {
warning("No new accepted names could be found on ITIS and WoRMS. The original dataframe is returned.")
return(tibble::tibble(data))
}
else {
if(no_itis && !no_worms) new_names = final_worms
else if(!no_itis && no_worms) new_names = final_itis
else if(!no_itis && !no_worms) new_names = rbind(final_worms, final_itis)
cat("\n")
cat("Retrieving ecological informations from Fishbase\n")
cat("---------------------------------------\n")
start.time4 = Sys.time()
new_names = cbind(new_names, STATUS = rep("ACCEPTED", nrow(new_names)), ID = rep(NA, nrow(new_names)))
new_names$SPECIES = gsub("(\\(.*?\\) )", "", new_names$SPECIES)
new_names[grep(" cf.", new_names$SPECIES), ] = NA
new_names[grep(" var.", new_names$SPECIES), ] = NA
new_names[grep(" aff.", new_names$SPECIES), ] = NA
new_names[grep(" sp.", new_names$SPECIES), ] = NA
new_names$SPECIES = stringr::word(new_names$SPECIES, 1, 2)
new_names = new_names[!is.na(new_names$SPECIES), ]
new_names = new_names[!duplicated(new_names$SPECIES), ]
new_names = new_names[which(new_names$SPECIES != new_names$INITIAL_SPECIES), ]
new_infos = fishbase_ecology(new_names$SPECIES)
end.time4 = Sys.time()
diff.time4 = difftime(end.time4, start.time4)
cat(paste("New ecological informations retrieved from Fishbase in:", round(diff.time4[[1]], 2), units(diff.time4), "\n"))
cat("\n")
cat("\n")
cat("Retrieving ecological informations from OBIS, GBIF & VertNet\n")
cat("---------------------------------------\n")
start.time5 = Sys.time()
new_infos = merge(new_names, new_infos)
new_infos = new_infos[!duplicated(new_infos$SPECIES), ]
new_infos = division_ecology(new_infos, mode = "ogv", division_number = ifelse(nrow(new_infos) < 200, nrow(new_infos), 200), write_csv = F)
end.time5 = Sys.time()
diff.time5 = difftime(end.time5, start.time5)
cat(paste("New ecological informations retrieved from OBIS, GBIF & VertNet in:", round(diff.time5[[1]], 2), units(diff.time5), "\n"))
cat("\n")
if(!all(is.na(data_na[[name_aphia_col]]))) {
cat("\n")
cat("Retrieving ecological informations from WoRMS\n")
cat("---------------------------------------\n")
start.time6 = Sys.time()
new_infos = division_ecology(new_infos, mode = "worms", get_aphia_id = T,
division_number = ifelse(nrow(new_infos) < 200, nrow(new_infos), 200))[, 1:8]
end.time6 = Sys.time()
diff.time6 = difftime(end.time6, start.time6)
cat(paste("New ecological informations retrieved from WoRMS in:", round(diff.time6[[1]], 2), units(diff.time6), "\n"))
cat("\n")
}
new_infos_final = merge(new_infos, new_names[,2:3])
new_infos_final = cbind(GENUS = stringr::word(new_infos_final$SPECIES, 1), new_infos_final)
new_genus = new_infos_final[which(new_infos_final$GENUS != stringr::word(new_infos_final$INITIAL_SPECIES, 1)), ]
still_missing_new_infos = new_genus[is.na(new_genus$ENVIRONMENT), ]
if(nrow(still_missing_new_infos) != 0){
cat("\n")
cat("Retrieving ecological informations from WoRMS for the new GENUS\n")
cat("---------------------------------------\n")
start.time7 = Sys.time()
if(genus_not_strict){
still_missing_new_infos = worms_ecology_upper_taxa(still_missing_new_infos, not_strict_rank = "GENUS", search_rank_col = "GENUS",
division_number = ifelse(nrow(still_missing_new_infos) < 200,
nrow(still_missing_new_infos), 200), print_division = F)
}
else still_missing_new_infos = worms_ecology_upper_taxa(still_missing_new_infos, not_strict_rank = "NA", search_rank_col = "GENUS",
division_number = ifelse(nrow(still_missing_new_infos) < 200,
nrow(still_missing_new_infos), 200), print_division = F)
new_infos_final = replace_values(data_original = new_infos_final, data_model = still_missing_new_infos,
variables_original = c("STATUS", "ENVIRONMENT"),
id_original = "INITIAL_SPECIES", total_replacement = "all")
end.time7 = Sys.time()
diff.time7 = difftime(end.time7, start.time7)
cat(paste("New ecological informations retrieved from WoRMS for the new GENUS in:", round(diff.time7[[1]], 2), units(diff.time7), "\n"))
cat("\n")
}
data_to_correct = cbind(INITIAL_SPECIES = data[[name_species_col]], data)
data_to_correct = data_to_correct[!duplicated(data_to_correct$INITIAL_SPECIES), ]
final = replace_values(data_original = data_to_correct, data_model = new_infos_final,
variables_original = c(name_species_col, name_genus_col, name_status_col, name_aphia_col, name_envir_col, name_climate_col),
variables_model = c("SPECIES", "GENUS", "STATUS", "APHIA_ID", "ENVIRONMENT", "CLIMATE"),
id_original = "INITIAL_SPECIES", total_replacement = "all")
return(list(data = tibble::tibble(final), corrected_data = tibble::tibble(new_infos_final)))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.