# R/division_ecology.R

# Defines functions division_ecology

## Requirement: 'tibble' + 'stringr' 

# Info: Since the multiple_databases_ecology and worms_ecology functions take a very long time to run, this function splits the request into smaller chunks.
# Info: Every X divisions provided by the user, it saves the data as a CSV file in the directory. 
# Info: At the end, it unites all the CSV files and moves the intermediate files to the temporary directory (deleted at the end of session). 
# Info: It also returns the complete result as a single tibble.

# Note: If the "write_csv" option is set to TRUE, do not open the temporary csv files during the process.

# division_ecology(): run a long ecology query chunk by chunk and bind the results.
#
# Arguments:
#   data             Data frame. In "ogv" mode it must contain `species_col`; in "worms"
#                    mode it must contain `aphia_col`.
#   mode             "ogv" queries through multiple_databases_ecology(); "worms" queries
#                    through worms_ecology(). "algaebase" passes the mode check but has no
#                    implementation in this function, so it stops with an explicit error.
#   division_number  Number of rows sent per chunk (single number >= 1, at most the
#                    number of rows to query).
#   get_aphia_id     If TRUE, worms_ecology(data, add_aphia_id = TRUE) is called once on
#                    the whole data before the chunked queries.
#   random_samples,
#   databases        Forwarded to multiple_databases_ecology() ("ogv" mode only).
#   write_csv        If TRUE, each chunk result is written to "<file_name><i>.csv"; at the
#                    end the chunk files are merged into "<file_name>.csv" and moved to
#                    tempdir().
#   file_name        Base name of the CSV files; required when write_csv is TRUE.
#   aphia_col,
#   species_col,
#   id_col           Column names forwarded to the query helpers.
#   genus_col,
#   climate_col      Accepted for interface compatibility; not used in this function.
#   print_division   If TRUE, print the "start:end" row ranges of every chunk.
#
# Returns: tibble(<row-bound results of all chunks>).
division_ecology <- function(data, mode, division_number, get_aphia_id = FALSE, random_samples = 50, write_csv = FALSE, file_name = NULL,
                             databases = c("obis", "gbif", "vertnet"), aphia_col = "APHIA_ID", genus_col = "GENUS",
                             climate_col = "CLIMATE", species_col = "SPECIES", id_col = "ID", print_division = TRUE) {

  # ---- Argument validation ----

  if (write_csv && is.null(file_name)) stop('You must provide a file name if the "write_csv" option is set to TRUE.')

  if (!(mode %in% c("ogv", "worms", "algaebase"))) stop('Mode must be set to "ogv" for querying OBIS, GBIF and Vertnet databases, "worms" to query WORMS, or "algaebase" to query ALGAEBASE.')

  # No branch below handles "algaebase"; fail early with a clear message instead of
  # crashing later on an undefined object.
  if (mode == "algaebase") stop('The "algaebase" mode is not implemented in division_ecology.')

  if (mode == "ogv" && !is.data.frame(data)) stop('The data must be a dataframe with a column containing the species names and their IDs.')

  if (mode == "ogv" && !(species_col %in% colnames(data))) stop('The data must contain the column provided in the species_col argument (default: "SPECIES").')

  if (mode == "worms" && !is.data.frame(data)) stop('The data must be a dataframe with a column containing APHIA IDs.')

  if (mode == "worms" && !any(colnames(data) == aphia_col)) stop('The data must contain a the column provided in the aphia_col argument (default: "APHIA_ID").')

  if (!is.numeric(division_number) || length(division_number) != 1 || is.na(division_number) || division_number < 1) {
    stop("Division number must be a single number greater than or equal to 1.")
  }

  # Values to query; only their count is used to build the chunks.
  data_list <- if (mode == "ogv") data[[species_col]] else data[[aphia_col]]

  if (division_number > length(data_list)) stop("Division number must be inferior to the number of species names.")

  # ---- Chunk boundaries ----

  bounds <- .division_bounds(length(data_list), division_number)
  start <- bounds$start
  end <- bounds$end
  n_chunks <- length(start)

  if (print_division) print(paste(start, end, sep = ":"))

  # NOTE(review): in "ogv" mode the chunks below are taken from `data`, so the result of
  # this (potentially long) call is only used in "worms" mode -- confirm this is intended.
  if (get_aphia_id) {
    completed_data <- worms_ecology(data, name_aphia_col = aphia_col, add_aphia_id = TRUE)
  } else {
    completed_data <- data
  }

  # ---- Chunked queries ----

  results <- vector("list", n_chunks)
  file_names <- character(n_chunks)

  for (i in seq_len(n_chunks)) {

    start_time <- Sys.time()

    cat(paste0(i, "/", n_chunks, " - State: "))

    rows <- start[i]:end[i]

    # drop = FALSE keeps a one-column data frame from collapsing into a bare vector.
    if (mode == "ogv") {
      ecology <- multiple_databases_ecology(data[rows, , drop = FALSE], col_species = species_col, col_id = id_col,
                                            random_samples = random_samples, databases = databases)
    } else {
      ecology <- worms_ecology(completed_data[rows, , drop = FALSE], name_aphia_col = aphia_col, add_aphia_id = FALSE)
    }

    results[[i]] <- ecology

    if (write_csv) {

      file_names[i] <- paste0(file_name, i, ".csv")

      write.csv(ecology, file = file_names[i], row.names = FALSE)

      cat("\n")

      cat(paste("DONE:", file_names[i], "\n"))

      cat("\n")

    }

    if (n_chunks != 1) {

      # Force seconds: a bare difftime picks its own unit (secs/mins/hours), which would
      # make the thresholds below compare minutes or hours against 60 and 3600.
      elapsed <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
      time_left <- round(elapsed * (n_chunks - i), 2)

      if (time_left < 60) {
        cat("Time left : ", time_left, " seconds", "\n", "")
      } else if (time_left < 3600) {
        cat("Time left : ", round(time_left / 60, 2), " minutes", "\n", "")
      } else {
        cat("Time left : ", round(time_left / 3600, 2), " hours", "\n", "")
      }

      cat("\n")

    }

  }

  # Pairwise rbind in chunk order; a single chunk is returned untouched.
  ecology_final <- Reduce(rbind, results)

  # ---- Merge the intermediate CSV files ----

  if (write_csv) {

    final_file <- paste0(file_name, ".csv")

    if (file.exists(final_file)) {

      file.rename(from = final_file, to = paste0(file_name, "- previous version.csv"))

      warning("A previous version with the same file name was renamed to avoid overwritting it.")

    }

    # The merged file is rebuilt from the chunk files that were written to disk.
    pieces <- lapply(file_names, read.csv, header = TRUE, sep = ",")
    unique_file <- Reduce(rbind, pieces)

    for (path in file_names) .move_to_tempdir(path)

    write.csv(unique_file, file = final_file, row.names = FALSE)

  }

  tibble(ecology_final)

}

# Compute the first and last row index of every chunk for `n` rows split by
# `division_number` rows. The number of full-size chunks is round(n / division_number);
# a leftover is appended as an extra chunk, an overshoot shortens the last chunk.
.division_bounds <- function(n, division_number) {

  multiple <- round(n / division_number)

  end <- division_number * seq_len(multiple)
  start <- end - division_number + 1

  remainder <- n - division_number * multiple

  if (remainder > 0) {
    start <- c(start, end[multiple] + 1)
    end <- c(end, n)
  } else if (remainder < 0) {
    end[multiple] <- n
  }

  list(start = start, end = end)

}

# Move a file into the session temporary directory. file.rename() cannot move files
# across file systems, so fall back to copy + delete when it fails.
.move_to_tempdir <- function(path) {

  destination <- file.path(tempdir(), basename(path))

  moved <- suppressWarnings(file.rename(from = path, to = destination))

  if (!moved) {
    moved <- file.copy(from = path, to = destination, overwrite = TRUE)
    if (moved) unlink(path)
  }

  invisible(moved)

}
# Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.