R/complete_taxonomy.R

Defines functions complete_taxonomy

## Replacement: 'worms' + 'tibble' + 'taxizedb' + 'stringr'

# Info: Automatically detect the columns with missing info. 
# Info: Based on references columns, it searches for all superior taxa in GBIF, ITIS, Catalog of Life and World Flora Online local databases.
# Info: Conflictual taxa are both conserved and separated with an "or", to avoid splitting a taxa in multiple synonyms.

# Note: BASE_TAXA can be set to automatic for dataframes obtained after treatment with the other functions (i.e. same column names).
# Note: Otherwise, it must be a vector with the reference the name of the reference column for each column with NA.

complete_taxonomy = function(data, BASE_TAXA = "automatic"){
  
  fct_env = new.env()
  
  na_columns = colSums(is.na(data)) 
  
  names_na_columns = names(na_columns[na_columns > 0])
  
  if(anyNA(names_na_columns)) stop("One of the column is named NA.")
  
  if(BASE_TAXA != "automatic") warning("You must provide a vector with the name of the reference column for each column with NA.")
  
  else{
    
    if(any(colnames(data[ ,grep("FAMILY", colnames(data)):ncol(data)]) != 
           c("FAMILY", "ORDER", "CLASS", "PHYLUM", "KINGDOM", "SUPERKINGDOM")) ||
       !any(colnames(data) == "SPECIES")) stop('You must provide a dataframe with the following column names: "SPECIES", "FAMILY", "ORDER", "CLASS", "PHYLUM", "KINGDOM" & "SUPERKINGDOM" (in capital) and they must be in this order.')
    
  }
  
  database_taxonomy = function(initial_data, database, column_to_correct, BASE_TAXA = "automatic") {
    
    no_message = function(code){
      
      sink("NUL")
      
      tmp = code
      
      sink()
      
      return(tmp)
      
    }
    
    if(BASE_TAXA == "automatic") {
      
      if(column_to_correct == "FAMILY") BASE_TAXA = "SPECIES"
      
      else if(column_to_correct == "ORDER") BASE_TAXA = "FAMILY"
      
      else if(column_to_correct == "CLASS") BASE_TAXA = "ORDER"
      
      else if(column_to_correct == "PHYLUM") BASE_TAXA = "CLASS"
      
      else if(column_to_correct == "KINGDOM") BASE_TAXA = "PHYLUM"
      
      else if(column_to_correct == "SUPERKINGDOM") BASE_TAXA = "KINGDOM" 
      
    }
    
    missing_data = initial_data[which(!complete.cases(initial_data[column_to_correct])),]
    
    if(nrow(missing_data) != 0){
      
      if(database == "worms"){
        
        worms_valid_id = no_message(worms::wormsbynames(missing_data[[BASE_TAXA]]))
        
        worms_valid_id = worms_valid_id$valid_AphiaID[!is.na(as.numeric(worms_valid_id$valid_AphiaID))]
        
        if(length(worms_valid_id) == 0) data_corrected = initial_data
        
        else{
          
          worms_new_infos = no_message(worms::wormsbyid(worms_valid_id))
          
          worms_new_infos = data.frame(SPECIES = worms_new_infos$scientificname, GENUS = worms_new_infos$genus, 
                                       FAMILY = worms_new_infos$family, ORDER = worms_new_infos$order, CLASS = worms_new_infos$class, 
                                       PHYLUM = worms_new_infos$phylum, KINGDOM = worms_new_infos$kingdom)
          
          data_corrected = add_infos(initial_data, worms_new_infos, id = BASE_TAXA)
          
        }
        
      }
      
      else {
        
        id_data = taxizedb::name2taxid(missing_data[[BASE_TAXA]][which(!is.na(missing_data[[BASE_TAXA]]))], db = database, out_type = "summary")
        
        id_data = id_data[!duplicated(id_data$name),]
        
        if(length(id_data$id) != 0) {
          
          new_classification = id_to_classification(unique(id_data$id), db = database)
          
          if(BASE_TAXA == "SPECIES" && database == "gbif") { new_classification["GENUS"] = stringr::word(new_classification$GENUS, 1) }
          
          data_corrected = add_infos(initial_data, new_classification, id = BASE_TAXA)
          
          
        }
        
        else { data_corrected = initial_data }
        
      }
      
    }
    
    else { data_corrected = initial_data }
    
    data_corrected
    
  }
  
  results = function(data_corrected, message, ok = F){
    
    na_col = colSums(is.na(data_corrected)) 
    
    na_col = na_col[na_col > 0]
    
    cat("\n")
    
    cat(paste(message, "refinement:\n"))
    
    cat("\n")
    
    if(ok) {
      
      end_vector = rep(0, length(names_na_columns))
      
      names(end_vector) = names_na_columns
      
      print(end_vector)
      
    }
    
    else print(na_col)
    
    cat("\n")
    
  }
  
  show_differences = function(data_original, data_new, data_before, database_name){
    
    before_na = sum(sapply(X = data_before, FUN = function(x) sum(is.na(x))))
    
    after_na = sum(sapply(X = data_new, FUN = function(x) sum(is.na(x))))
    
    if(after_na == 0){
      
      cat("\n")
      
      results(data_new, paste(database_name, names_na_columns[i]), ok = T)
      
      cat("All taxonomic informations were retrieved!\n")
      
      cat("\n")
      
    }
    
    else if(before_na == after_na) {
      
      cat("\n")
      
      cat(paste(database_name, names_na_columns[i], "refinement: No information added\n"))
      
      cat("\n")
      
    }
    
    else {
      
      results(data_new, paste(database_name, names_na_columns[i]))
      
    }
    
  }
  
  results(data, "Before")
  
  for(i in 1:length(names_na_columns)){
    
    if(i == 1){
      
      assign(paste("gbif", 1, sep = "_"), data, envir = fct_env)
      
      assign(paste("gbif", (i + 1), sep = "_"), database_taxonomy(get(paste("gbif", (i), sep = "_"), envir = fct_env), 
                                                                  "gbif", column_to_correct = names_na_columns[i]), envir = fct_env)
      
      show_differences(data, get(paste("gbif", (i + 1), sep="_"), envir = fct_env), 
                       get(paste("gbif", (i), sep = "_"), envir = fct_env), database_name = "'GBIF'")
      
    }
    
    else if(i > 1) { 
      
      assign(paste("gbif", (i + 1), sep = "_"), database_taxonomy(get(paste("Tour", (i), sep = "_"), envir = fct_env), 
                                                                  "gbif", column_to_correct = names_na_columns[i]), envir = fct_env) 
      
      show_differences(data, get(paste("gbif", (i + 1), sep="_"), envir = fct_env), 
                       get(paste("Tour", (i), sep = "_"), envir = fct_env), database_name = "'GBIF'")
      
    }
    
    if(anyNA(get(paste("gbif", (i + 1), sep = "_"), envir = fct_env)[[names_na_columns[i]]])) {
      
      assign(paste("itis", (i + 1), sep = "_"), database_taxonomy(get(paste("gbif", (i + 1), sep = "_"), envir = fct_env), 
                                                                  "itis", column_to_correct = names_na_columns[i]), envir = fct_env)
      
      show_differences(data, get(paste("itis", (i + 1), sep="_"), envir = fct_env), 
                       get(paste("gbif", (i + 1), sep = "_"), envir = fct_env), database_name = "'ITIS'")
      
      if(anyNA(get(paste("itis", (i + 1), sep = "_"), envir = fct_env)[[names_na_columns[i]]])) {
        
        assign(paste("col", (i + 1), sep = "_"), database_taxonomy(get(paste("itis", (i + 1), sep = "_"), envir = fct_env), 
                                                                   "col", column_to_correct = names_na_columns[i]), envir = fct_env)
        
        show_differences(data, get(paste("col", (i + 1), sep="_"), envir = fct_env), 
                         get(paste("itis", (i + 1), sep = "_"), envir = fct_env), database_name = "'Catalog of Life'")
        
        if(anyNA(get(paste("col", (i + 1), sep = "_"), envir = fct_env)[[names_na_columns[i]]])) {
          
          assign(paste("wfo", (i + 1), sep = "_"), database_taxonomy(get(paste("col", (i + 1), sep = "_"), envir = fct_env), 
                                                                     "wfo", column_to_correct = names_na_columns[i]), envir = fct_env)
          
          show_differences(data, get(paste("wfo", (i + 1), sep="_"), envir = fct_env), 
                           get(paste("col", (i + 1), sep = "_"), envir = fct_env), database_name = "'World Flora Online'")
          
          if(anyNA(get(paste("wfo", (i + 1), sep = "_"), envir = fct_env)[[names_na_columns[i]]])) {
            
            assign(paste("worms", (i + 1), sep = "_"), database_taxonomy(get(paste("wfo", (i + 1), sep = "_"), envir = fct_env), 
                                                                         "worms", column_to_correct = names_na_columns[i]), envir = fct_env)
            
            show_differences(data, get(paste("worms", (i + 1), sep="_"), envir = fct_env), 
                             get(paste("wfo", (i + 1), sep = "_"), envir = fct_env), database_name = "'WoRMS'")
            
            {if(anyNA(get(paste("worms", (i + 1), sep = "_"), envir = fct_env)[[names_na_columns[i]]])) {
              
              assign(paste("Tour", (i + 1), sep = "_"), get(paste("worms", (i + 1), sep = "_"), envir = fct_env), envir = fct_env)
              
              cat("\n")
              
              cat(paste("Not all", names_na_columns[i], "informations were retrieved despite searching in 5 databases.\n"))
              
              cat("\n")
              
            }
              
              else assign(paste("Tour", (i + 1), sep = "_"), get(paste("worms", (i + 1), sep = "_"), envir = fct_env), envir = fct_env)}
            
          }
          
          else assign(paste("Tour", (i + 1), sep = "_"), get(paste("wfo", (i + 1), sep = "_"), envir = fct_env), envir = fct_env)
          
        }
        
        else assign(paste("Tour", (i + 1), sep = "_"), get(paste("col", (i + 1), sep = "_"), envir = fct_env), envir = fct_env)
        
      }
      
      else assign(paste("Tour", (i + 1), sep = "_"), get(paste("itis", (i + 1), sep = "_"), envir = fct_env), envir = fct_env)
      
    }
    
    else assign(paste("Tour", (i + 1), sep = "_"), get(paste("gbif", (i + 1), sep = "_"), envir = fct_env), envir = fct_env)
    
  }
  
  final_data = suppressWarnings(get(paste("Tour", (i + 1), sep = "_"), envir = fct_env))
  
  final_data[final_data == "Not assigned"] = NA
  
  final_data[final_data == "incertae sedis"] = NA
  
  if(sum(sapply(X = final_data, FUN = function(x) sum(is.na(x)))) != 0){
    
    results(data, "Before")
    
    results(final_data, "After last")
    
  }
  
  tibble::tibble(final_data)
  
}
Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.