R/correct_name_and_infos.R

Defines functions correct_name_and_infos

## Requirement: 'taxizedb' + 'tibble' + 'taxize' + 'worms' + 'stringr'

# Info: This function takes the data with missing ecological informations and ambiguous status to check for new names and retrieve their ecology.
# Info: It first searches for accepted names on ITIS and WoRMS before lauching new research on Fishbase, OBIS, GBIF, VertNet & WoRMS for the new
# Info: species and the new genus. It then return the corrected dataframe.

# Note: See the function "worms_ecology_upper_taxa() for the argument "genus_not_strict".

correct_name_and_infos = function(data, data_na, genus_not_strict = T, name_species_col = "SPECIES", name_aphia_col = "APHIA_ID",  
                                  name_genus_col = "GENUS", name_status_col = "STATUS", name_envir_col = "ENVIRONMENT", 
                                  name_climate_col = "CLIMATE"){
  
  na_species = data_na[[name_species_col]]
  
  rm_NA = function(data){ 
    
    data = data[!is.na(data)] 
    
    if(length(data) == 0) data = NA
    
    data
    
  }
  
  no_message = function(code){
    
    sink("NUL")
    
    tmp = code
    
    sink()
    
    return(tmp)
    
  }
  
  
  no_worms = F
  
  no_itis = F
  
  
  cat("\n")
  cat("Searching for accepted names in ITIS\n")
  cat("---------------------------------------\n")
  
  start.time = Sys.time()
  
  na_species_itis = taxizedb::name2taxid(na_species, db = "itis", out_type = "summary")
  
  if(nrow(na_species_itis) != 0){
    
    na_species_itis = na_species_itis[!duplicated(na_species_itis), ]
    
    checked_tsn = itis_acceptname(na_species_itis$id)
    
    if(nrow(checked_tsn) != 0){
      
      position_corrected_tsn = which(checked_tsn$submittedtsn != checked_tsn$acceptedtsn)
      
      if(length(position_corrected_tsn) != 0){
        
        initial_names_itis = taxizedb::taxid2name(as.numeric(checked_tsn$submittedtsn[position_corrected_tsn]), db = "itis")
        
        final_names_itis = taxizedb::taxid2name(as.numeric(checked_tsn$acceptedtsn[position_corrected_tsn]), db = "itis")
        
        final_itis = data.frame(APHIA_ID = rep(NA, length(initial_names_itis)), SPECIES = final_names_itis, 
                                INITIAL_SPECIES = initial_names_itis, INITIAL_APHIA_ID = rep(NA, length(initial_names_itis)))
        
        end.time = Sys.time()
        diff.time = difftime(end.time, start.time)
        cat(paste("Accepted names retrieved from ITIS in:", round(diff.time[[1]], 2), units(diff.time), "\n"))
        cat("\n")
        
      }
      
      else no_itis = T
      
    }
    
    else no_itis = T
    
  }
  
  else no_itis = T
  
  if(no_itis) warning("No new accepted names were found in the ITIS database.")
  
  print(final_itis)
  
  
  if(!all(is.na(data_na[[name_aphia_col]]))) {
    
    cat("\n")
    cat("Searching for accepted names in WoRMS\n")
    cat("---------------------------------------\n")
    
    start.time3 = Sys.time()
    
    missing_aphia = data_na[!is.na(data_na[[name_aphia_col]]), ]
    
    if(!no_itis) missing_aphia = missing_aphia[!missing_aphia[[name_species_col]] %in% initial_names_itis, ]
    
    if(nrow(missing_aphia) != 0){
      
      worms_accepted_names = no_message(worms::wormsaccepted(worms::wormsconsolidate(worms::wormsbyid(as.numeric(missing_aphia$APHIA_ID)))))
      
      worms_accepted_names = worms_accepted_names[,c(1,3,9,10)]
      
      worms_accepted_names = worms_accepted_names[complete.cases(worms_accepted_names), ]
      
      position_corrected_aphia = which(worms_accepted_names$scientificname != worms_accepted_names$valid_name)
      
      if(length(position_corrected_aphia) != 0){
        
        final_worms = data.frame(APHIA_ID = worms_accepted_names[position_corrected_aphia, 1], 
                                 SPECIES = worms_accepted_names[position_corrected_aphia, 4], 
                                 INITIAL_SPECIES = worms_accepted_names[position_corrected_aphia, 2], 
                                 INITIAL_APHIA_ID = worms_accepted_names[position_corrected_aphia, 3])
        
        end.time3 = Sys.time()
        diff.time3 = difftime(end.time3, start.time3)
        cat(paste("Accepted names retrieved from WoRMS in:", round(diff.time3[[1]], 2), units(diff.time3), "\n"))
        cat("\n")
        
      }
      
      else no_worms = T
      
    }
    
    else no_worms = T
    
  }
  
  else no_worms = T
  
  if(no_worms) warning("No new accepted names were found in the WoRMS database.")
  
  print(final_worms)
  
  
  if(no_itis && no_worms) {
    
    warning("No new accepted names could be found on ITIS and WoRMS. The original dataframe is returned.")
    
    return(tibble::tibble(data))
    
  }
  
  else {
    
    if(no_itis && !no_worms) new_names = final_worms
    
    else if(!no_itis && no_worms) new_names = final_itis
    
    else if(!no_itis && !no_worms) new_names = rbind(final_worms, final_itis)
    
    cat("\n")
    cat("Retrieving ecological informations from Fishbase\n")
    cat("---------------------------------------\n")
    start.time4 = Sys.time()
    
    new_names = cbind(new_names, STATUS = rep("ACCEPTED", nrow(new_names)), ID = rep(NA, nrow(new_names)))
    
    new_names$SPECIES = gsub("(\\(.*?\\) )", "", new_names$SPECIES)
    
    new_names[grep(" cf.", new_names$SPECIES), ] = NA
    
    new_names[grep(" var.", new_names$SPECIES), ] = NA
    
    new_names[grep(" aff.", new_names$SPECIES), ] = NA
    
    new_names[grep(" sp.", new_names$SPECIES), ] = NA
    
    new_names$SPECIES = stringr::word(new_names$SPECIES, 1, 2)
    
    new_names = new_names[!is.na(new_names$SPECIES), ]
    
    new_names = new_names[!duplicated(new_names$SPECIES), ]
    
    new_names = new_names[which(new_names$SPECIES != new_names$INITIAL_SPECIES), ]
    
    new_infos = fishbase_ecology(new_names$SPECIES)
    
    end.time4 = Sys.time()
    diff.time4 = difftime(end.time4, start.time4)
    cat(paste("New ecological informations retrieved from Fishbase in:", round(diff.time4[[1]], 2), units(diff.time4), "\n"))
    cat("\n")
    
    
    cat("\n")
    cat("Retrieving ecological informations from OBIS, GBIF & VertNet\n")
    cat("---------------------------------------\n")
    start.time5 = Sys.time()
    
    new_infos = merge(new_names, new_infos)
    
    new_infos = new_infos[!duplicated(new_infos$SPECIES), ]
    
    new_infos = division_ecology(new_infos, mode = "ogv", division_number = ifelse(nrow(new_infos) < 200, nrow(new_infos), 200), write_csv = F)
    
    end.time5 = Sys.time()
    diff.time5 = difftime(end.time5, start.time5)
    cat(paste("New ecological informations retrieved from OBIS, GBIF & VertNet in:", round(diff.time5[[1]], 2), units(diff.time5), "\n"))
    cat("\n")
    
    
    if(!all(is.na(data_na[[name_aphia_col]]))) {
      
      cat("\n")
      cat("Retrieving ecological informations from WoRMS\n")
      cat("---------------------------------------\n")
      start.time6 = Sys.time()
      
      new_infos = division_ecology(new_infos, mode = "worms", get_aphia_id = T,
                                   division_number = ifelse(nrow(new_infos) < 200, nrow(new_infos), 200))[, 1:8]
      
      end.time6 = Sys.time()
      diff.time6 = difftime(end.time6, start.time6)
      cat(paste("New ecological informations retrieved from WoRMS in:", round(diff.time6[[1]], 2), units(diff.time6), "\n"))
      cat("\n")
      
    }
    
    
    new_infos_final = merge(new_infos, new_names[,2:3])
    
    new_infos_final = cbind(GENUS = stringr::word(new_infos_final$SPECIES, 1), new_infos_final)
    
    new_genus = new_infos_final[which(new_infos_final$GENUS != stringr::word(new_infos_final$INITIAL_SPECIES, 1)), ]
    
    still_missing_new_infos = new_genus[is.na(new_genus$ENVIRONMENT), ]
    
    if(nrow(still_missing_new_infos) != 0){
      
      cat("\n")
      cat("Retrieving ecological informations from WoRMS for the new GENUS\n")
      cat("---------------------------------------\n")
      start.time7 = Sys.time()
      
      if(genus_not_strict){
        
        still_missing_new_infos = worms_ecology_upper_taxa(still_missing_new_infos, not_strict_rank = "GENUS", search_rank_col = "GENUS", 
                                                           division_number = ifelse(nrow(still_missing_new_infos) < 200, 
                                                                                    nrow(still_missing_new_infos), 200), print_division = F)
        
      }
      
      else still_missing_new_infos = worms_ecology_upper_taxa(still_missing_new_infos, not_strict_rank = "NA", search_rank_col = "GENUS", 
                                                              division_number = ifelse(nrow(still_missing_new_infos) < 200, 
                                                                                       nrow(still_missing_new_infos), 200), print_division = F)
      
      new_infos_final = replace_values(data_original = new_infos_final, data_model = still_missing_new_infos, 
                                       variables_original = c("STATUS", "ENVIRONMENT"),
                                       id_original = "INITIAL_SPECIES", total_replacement = "all")
      
      end.time7 = Sys.time()
      diff.time7 = difftime(end.time7, start.time7)
      cat(paste("New ecological informations retrieved from WoRMS for the new GENUS in:", round(diff.time7[[1]], 2), units(diff.time7), "\n"))
      cat("\n")
      
    }
    
    
    data_to_correct = cbind(INITIAL_SPECIES = data[[name_species_col]], data)
    
    data_to_correct = data_to_correct[!duplicated(data_to_correct$INITIAL_SPECIES), ]
    
    final = replace_values(data_original = data_to_correct, data_model = new_infos_final, 
                           variables_original = c(name_species_col, name_genus_col, name_status_col, name_aphia_col, name_envir_col, name_climate_col), 
                           variables_model = c("SPECIES", "GENUS", "STATUS", "APHIA_ID", "ENVIRONMENT", "CLIMATE"),
                           id_original = "INITIAL_SPECIES", total_replacement = "all")
    
    return(list(data = tibble::tibble(final), corrected_data = tibble::tibble(new_infos_final)))
    
  }
  
}
Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.