R/multiple_databases_ecology.R

Defines functions multiple_databases_ecology

## Requirement: 'tibble' + 'stringr' + 'spocc'

# Info: Same than fishbase_ecology() but get the ecology of non-fish taxa by quering OBIS, GBIF and VertNet databases.
# Info: This function takes much more time to process since it query multiple online databases and does that independently for each species.
# Info: To assess the latitudinal repartition of each species, a random sample of defined length (default: 50 per database) need to be defined.
# Info: Extreme latitudinal values are detected by is_extreme() (i.e. > Q3 + 3xIQR) and removed before assessing the maximum latitude.

# Note: Takes around 1h per 1,000 species with a 50 random samples and the 3 databases.
# Note: For large datasets, it is strongly recommended to use it in the division_ecology() function for progressive saving of data.

multiple_databases_ecology = function(data, col_species = "SPECIES", col_id = "ID", random_samples = 50, 
                                      databases = c("obis", "gbif", "vertnet")){
  
  print_time = function(start_time, end_time, message){
    
    time = round(end_time - start_time, 2)
    
    cat(paste0(message, " - Time taken : ", time, " seconds", "\n", ""))
    
    cat("\n")
    
  }
  
  is_extreme = function(x){
    
    res = x
    
    Q1 = quantile(x, 0.25, na.rm = TRUE)
    
    Q3 = quantile(x, 0.75, na.rm = TRUE)
    
    .IQR = IQR(x, na.rm = TRUE)
    
    upper.limit = Q3 + (10 * .IQR)
    
    lower.limit = Q1 - (10 * .IQR)
    
    outlier = ifelse(x < lower.limit | x > upper.limit, TRUE, FALSE)
    
    outlier
  }
  
  species_spaced = data.frame(name = data[[col_species]], taxid = data[[col_id]])
  
  number_genus = length(grep(" sp.", species_spaced$Spaced))
  
  species_spaced[grep(" sp.", species_spaced$name), ] = suppressWarnings(word(species_spaced[grep(" sp.", species_spaced$name), ], 1,1, " "))
  
  species_unspaced = gsub(" ", "_", species_spaced$name)
  
  start_time_retrieving = Sys.time()
  
  databases_info = suppressWarnings(occ(query = species_spaced$name, from = databases, limit = random_samples))
  
  end_time_retrieving = Sys.time()
  
  cat(paste0("Data retrieved in ", round((end_time_retrieving - start_time_retrieving) / 60, 2), " minutes\n"))
  
  start_time_processing = Sys.time()
  
  taxonomy_infos = function(data, species_list){
    
    aphia_id = list()
    taxon_status = list()
    initial_name = list()
    obis_name = list()
    
    rm_NA = function(data){ 
      
      data = data[!is.na(data)] 
      
      if(length(data) == 0) data = NA
      
      data
      
    }
    
    max_name = function(databases, species, variable, providers){
      
      data = list()
      
      for(j in 1:length(providers)){
        
        dat = databases[[providers[j]]][["data"]][[species]][[variable]]
        
        data[j] = ifelse(length(table(rm_NA(dat))) == 0, NA, 
                         names(which.max(table(rm_NA(dat)))))
        
      }
      
      ifelse(length(table(rm_NA(unlist(data)))) == 0, NA, 
             names(which.max(table(rm_NA(unlist(data))))))
      
    }
    
    for(i in 1:length(species_spaced$name)){
      
      aphia_id[i] = max_name(data, species_unspaced[i], variable = "speciesid", providers = "obis")
      
      taxon_status[i] = max_name(data, species_unspaced[i], variable = "taxonomicStatus", providers = "gbif")
      
    }
    
    data.frame(STATUS = unlist(taxon_status), APHIA_ID = unlist(aphia_id))
    
  }
  
  ecosystem_infos = function(data, species, variable){
    
    info = list()
    
    for(i in 1:length(species)){
      
      if(variable == "latitude"){
        
        information = as.numeric(c(data[["obis"]][["data"]][[species_unspaced[i]]][[variable]], 
                                   data[["gbif"]][["data"]][[species_unspaced[i]]][[variable]],
                                   data[["vertnet"]][["data"]][[species_unspaced[i]]][[variable]]))
        
        information = information[!is.na(information)]
        
        information = information[which(!is_extreme(information))]
        
        if(length(information) == 0) info[i] = NA
        
        else if(all(information > 40) || all(information < -40)) info[i] = "> 40"
        
        else if(all(information > 40) && all(information < -40)) info[i] = "> 40"
        
        else if(all(information < 40) && all(information > -40)) info[i] = "< 40"
        
        else info[i] = "both"
        
      }
      
      else if(variable == "zone"){
        
        terrestrial = data[["obis"]][["data"]][[species_unspaced[i]]][["terrestrial"]]
        
        terrestrial = terrestrial[!is.na(terrestrial)]
        
        marine = data[["obis"]][["data"]][[species_unspaced[i]]][["marine"]]
        
        marine = marine[!is.na(marine)]
        
        if(length(terrestrial) == 0 && length(marine) == 0) info[i] = NA
        
        else if(any(terrestrial)) info[i] = "terrestrial"
        
        else if(all(marine)) info[i] = "marine"
        
        else if(!all(marine)) info[i] = "freshwater"
        
        else info[i] = "both"
        
      }
      
    }
    
    if(variable == "latitude") data.frame(CLIMATE = unlist(info))
    
    else data.frame(ENVIRONMENT = unlist(info))
    
  }
  
  end_time_processing = Sys.time()
  
  cat(paste0("Data processed in ", round(end_time_processing - start_time_processing, 2), " seconds\n"))
  
  tibble(cbind(ID = species_spaced$taxid, SPECIES = species_spaced$name, 
               taxonomy_infos(databases_info),
               ecosystem_infos(databases_info, species_unspaced, "zone"), 
               ecosystem_infos(databases_info, species_unspaced, "latitude")))
  
}
Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.