R/algaebase_taxonomy.R

Defines functions algaebase_taxonomy

## Requirement: 'tibble' + 'algaeClassify'

# Info: Automatically detects genera with missing taxonomic information and launches a search for each of them in ALGAEBASE, before returning 
# Info: the initial data frame completed with the ALGAEBASE information.

# Completes the missing taxonomy of `data` with the classification returned by
# algaeClassify::algae_search() for every genus found on a row containing an NA.
#
# Arguments:
#   data       : data frame holding at least the columns named by `genus_col` and `family_col`.
#   genus_col  : name of the genus column (default "GENUS").
#   family_col : name of the family column (default "FAMILY").
#
# Value: a tibble built from the output of add_infos(). When no genus needs a search, or when
#        no search returns usable information, `data` is returned unchanged as a tibble.
#
# Errors: stops if one of the two columns is absent from `data`, or if the consistency check
#         on the output of consensus_deduplification() finds duplicated genus groups.
#
# Side effects: prints progress and a remaining-time estimate on the console; emits a warning
#               for every genus whose search fails.
algaebase_taxonomy = function(data, genus_col = "GENUS", family_col = "FAMILY"){
  
  if(!(genus_col %in% colnames(data)) || !(family_col %in% colnames(data))) 
    stop('The data must contain columns provided in the genus_col and family_col argument (default: "GENUS" & "FAMILY").')
  
  # Genera present on at least one row with an NA in any column.
  # NA genus names are dropped: there is nothing to search for them.
  genus_list = unique(as.character(data[rowSums(is.na(data)) > 0, ][[genus_col]]))
  
  genus_list = genus_list[!is.na(genus_list)]
  
  # Nothing to complete: return early instead of looping over 1:0.
  if(length(genus_list) == 0) {
    
    cat("No genus with missing taxonomic information: nothing to search.\n")
    
    return(tibble::tibble(data))
    
  }
  
  grouped_genus = consensus_deduplification(data[data[[genus_col]] %in% genus_list, ], family_col)
  
  grouped_genus = grouped_genus[!is.na(grouped_genus[[family_col]]), ]
  
  listed_genus = strsplit(grouped_genus[[genus_col]], " or ")
  
  # NOTE(review): duplicated() on a list compares whole split vectors, not individual genus
  # names; confirm this matches the format returned by consensus_deduplification().
  if(any(duplicated(listed_genus))) 
    stop('Some genus with missing taxonomic infos belong to multiple families!')
  
  n_genus = length(genus_list)
  
  # One slot per genus; slots of failed searches stay NULL and are skipped when binding.
  results = vector("list", n_genus)
  
  for(i in seq_len(n_genus)){
    
    start_time = Sys.time()
    
    cat("\n")
    
    cat(paste0('Searching info for "', genus_list[i],'":\n'))
    
    cat("\n")
    
    ecology = tryCatch(algaeClassify::algae_search(genus = genus_list[i], long = TRUE), 
                       error = function(e) {
                         warning('Search failed for "', genus_list[i], '": ', conditionMessage(e), call. = FALSE)
                         NULL
                       })
    
    # Assigning NULL with [[<- would delete the slot, hence the guard.
    if(!is.null(ecology)) results[[i]] = ecology
    
    if(n_genus != 1) {
      
      # Force seconds: difftime() otherwise picks its units automatically, which would make
      # the thresholds below (60 / 3600) meaningless.
      duration = difftime(Sys.time(), start_time, units = "secs")
      
      cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
      
      # Estimate based on the duration of the last search only.
      time_left = duration[[1]] * (n_genus - i)
      
      if(time_left < 60) {
        cat("Time left : ", round(time_left, 2), " seconds", "\n", "")
      } else if(time_left < 3600) {
        cat("Time left : ", round(time_left / 60, 2), " minutes", "\n", "")
      } else {
        cat("Time left : ", round(time_left / 3600, 2), " hours", "\n", "")
      }
      
      cat("\n")
      
    }
    
  }
  
  # Bind all successful searches at once; do.call(rbind, list()) yields NULL.
  algaebase = do.call(rbind, Filter(Negate(is.null), results))
  
  if(is.null(algaebase)) {
    
    warning("No search returned any result: the data is returned unchanged.", call. = FALSE)
    
    return(tibble::tibble(data))
    
  }
  
  algaebase_table = data.frame(algaebase$genus, algaebase$Family, algaebase$Order, algaebase$Class, 
                               algaebase$Phylum, algaebase$Kingdom, algaebase$Empire, 
                               stringsAsFactors = FALSE)
  
  # Drop rows where every taxonomic level is NA.
  algaebase_table = algaebase_table[rowSums(!is.na(algaebase_table)) > 0, , drop = FALSE]
  
  if(nrow(algaebase_table) == 0) {
    
    warning("No search returned any result: the data is returned unchanged.", call. = FALSE)
    
    return(tibble::tibble(data))
    
  }
  
  # Keep only the first word of every cell, column by column (safe for a single row).
  algaebase_table[] = lapply(algaebase_table, function(x) stringr::word(as.character(x), 1))
  
  # NOTE(review): these names are fixed even when genus_col / family_col are not the defaults;
  # confirm that add_infos() expects exactly these names.
  colnames(algaebase_table) = c("GENUS", "FAMILY", "ORDER", "CLASS", "PHYLUM", "KINGDOM", "SUPERKINGDOM")
  
  output = add_infos(data, algaebase_table, genus_col)
  
  tibble::tibble(output)
  
}
Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.