# File: R/worms_ecology_upper_taxa.R

# Defines functions: worms_ecology_upper_taxa

## Requirement: 'worrms' + 'tibble'

# Info: To reduce missing information in the environment column (while the climate column is complete), this function checks whether the upper taxa
# Info: are strictly marine, freshwater, brackish or terrestrial, in order to infer the environment of a species.

# Note: The argument "not_strict_rank" specifies the rank column for which the taxon does not need to live strictly in one environment.
# Note: For example, the genus Bufo lives in both freshwater and terrestrial environments, so if set to "GENUS", the output will be "freshwater".
# Note: Set it to "NA" (between quotation marks) to specify that habitats can be inferred only if all members of a rank live strictly in one habitat.

# Fill missing values of an environment column using the habitat flags that
# WoRMS reports for the upper taxa (phylum, class, ...) of each row.
#
# Arguments:
#   data             Data frame holding the environment column and one column
#                    per taxonomic rank listed in "search_rank_col".
#   name_envir_col   Name of the column whose NA values should be filled.
#   division_number  Maximum number of Aphia IDs sent to wm_record() per query.
#                    Must be a single number >= 1 (Inf means "one single query").
#   not_strict_rank  Name of the rank column for which two-habitat taxa are
#                    accepted (e.g. freshwater + terrestrial gives "freshwater"),
#                    or the string "NA" to accept single-habitat taxa only.
#   search_rank_col  Rank columns to process, in the order they are searched.
#   print_division   If TRUE, print the "start:end" index ranges of the queries.
#
# Value:
#   tibble(data_replaced), i.e. the input data after the replacements done by
#   replace_values() for every rank and every query.
#
# Depends on wm_name2id() and wm_record() (worrms), tibble() (tibble) and on
# replace_values(), which is not defined in this file.
# NOTE(review): replace_values() is presumably expected to fill only the NA
# values of "name_envir_col" (total_replacement = "NA") - confirm in its source.
worms_ecology_upper_taxa <- function(data, name_envir_col = "ENVIRONMENT", division_number = 200, not_strict_rank = "GENUS",
                                     search_rank_col = c("PHYLUM", "CLASS", "ORDER", "FAMILY", "GENUS"), print_division = FALSE) {

  columns <- c(name_envir_col, search_rank_col)

  if (!all(columns %in% colnames(data)) || anyDuplicated(colnames(data)) > 0 || anyDuplicated(columns) > 0)
    stop("Some columns are not in the data or their names is duplicated.")

  if (!(not_strict_rank %in% search_rank_col) && !identical(not_strict_rank, "NA"))
    stop('The "not_strict_rank" argument is neither "NA" (between quotation marks), nor in the "search_rank_col" names.')

  if (!is.numeric(division_number) || length(division_number) != 1 || is.na(division_number) || division_number < 1)
    stop('The "division_number" argument must be a single number greater than or equal to 1.')

  separator <- "-----------------------------------------------------\n"

  # Columns of the wm_record() output read by this function.
  record_columns <- c("scientificname", "isMarine", "isFreshwater", "isBrackish", "isTerrestrial")

  # Flag order used everywhere below: marine, freshwater, brackish, terrestrial.
  # A strictly brackish taxon is labelled "both".
  habitat_labels <- c("marine", "freshwater", "both", "terrestrial")

  # All four flags known: exactly one habitat set to 1.
  strict_patterns <- c("1000" = "marine", "0100" = "freshwater", "0010" = "both", "0001" = "terrestrial")

  # All four flags known, two habitats set to 1: accepted for "not_strict_rank" only.
  mixed_patterns <- c("0101" = "freshwater", "1001" = "marine", "1010" = "marine", "0110" = "freshwater")

  # Translate the four WoRMS habitat flags (each NA, 0 or 1) of one record into
  # a habitat label, or NA when no habitat can be inferred.
  classify_habitat <- function(marine, fresh, brackish, terrestrial, allow_mixed) {

    flags <- c(marine, fresh, brackish, terrestrial)
    known <- !is.na(flags)
    n_known <- sum(known)

    # Only one flag documented: trust it when it is set.
    if (n_known == 1) {
      if (flags[known] == 1) return(habitat_labels[known])
      return(NA_character_)
    }

    # All flags documented: look the 0/1 pattern up in the tables above.
    if (n_known == 4) {
      if (!all(flags %in% c(0, 1))) return(NA_character_)
      key <- paste(as.integer(flags), collapse = "")
      if (key %in% names(strict_patterns)) return(strict_patterns[[key]])
      if (allow_mixed && key %in% names(mixed_patterns)) return(mixed_patterns[[key]])
      return(NA_character_)
    }

    # Exactly one of marine/freshwater documented together with exactly one of
    # brackish/terrestrial: for the non-strict rank, the aquatic flag decides.
    if (allow_mixed && n_known == 2 && xor(known[1], known[2])) {
      primary <- which(known[1:2])
      if (flags[primary] == 1) return(habitat_labels[primary])
    }

    NA_character_
  }

  # Percentage of rows whose environment is still missing.
  percent_missing <- function(df) round(sum(is.na(df[[name_envir_col]])) / nrow(df) * 100, 0)

  data_replaced <- data

  for (i in seq_along(search_rank_col)) {

    rank_col <- search_rank_col[i]
    allow_mixed <- rank_col == not_strict_rank

    cat(paste("Checking the", rank_col, "habitats\n"))
    cat(separator)

    start_time_processing <- Sys.time()

    if (i == 1) {
      cat("\n")
      cat(paste0("Missing informations: ", percent_missing(data_replaced), "%\n"))
      cat("\n")
    }

    # Distinct taxa of this rank among the rows still lacking an environment.
    # NA names are dropped: there is nothing to look up for them.
    taxon_names <- unique(data_replaced[[rank_col]][is.na(data_replaced[[name_envir_col]])])
    taxon_names <- taxon_names[!is.na(taxon_names)]

    # One WoRMS lookup per name; a failed lookup yields NA instead of aborting.
    aphia_id <- lapply(seq_along(taxon_names), function(j) {
      cat(paste0(taxon_names[j], " (", j, "/", length(taxon_names), ")\n"))
      tryCatch(wm_name2id(taxon_names[j]), error = function(e) NA)
    })

    duration <- difftime(Sys.time(), start_time_processing)
    cat(separator)
    cat(paste("Aphia ID retrieved in:", round(duration[[1]], 2), units(duration), "\n"))

    # The timer is restarted once per rank, so the durations printed in the
    # query loop below are cumulative over the queries of this rank.
    start_time_processing <- Sys.time()

    aphia <- unlist(aphia_id)
    aphia <- aphia[!is.na(aphia)]

    if (length(aphia) == 0) {
      cat("\n")
      cat(paste("No Aphia IDs retrieved from WoRMS for the", rank_col, "\n"))
      cat("\n")
      next
    }

    # Split the IDs into consecutive queries of at most "division_number" IDs,
    # including the last, possibly shorter, one.
    chunk_size <- min(floor(division_number), length(aphia))
    start <- seq(1, length(aphia), by = chunk_size)
    end <- pmin(start + chunk_size - 1, length(aphia))

    if (print_division) print(paste(start, end, sep = ":"))

    for (g in seq_along(start)) {

      cat("\n")
      cat(paste0("Division of queries: Query number ", g, "/", length(start), "\n"))

      infos <- wm_record(as.numeric(aphia[start[g]:end[g]]))

      if (!all(record_columns %in% names(infos)))
        stop("The records returned by wm_record() lack some of the expected columns: ", paste(record_columns, collapse = ", "), ".")

      # Built from scratch for every query so that nothing leaks from the previous one.
      zone <- vapply(seq_len(nrow(infos)), function(k) {
        classify_habitat(infos$isMarine[k], infos$isFreshwater[k], infos$isBrackish[k], infos$isTerrestrial[k], allow_mixed)
      }, character(1))

      new_infos <- data.frame(TAXA = infos$scientificname, ENVIRONMENT = zone)

      # Keep only the taxa for which a habitat could be inferred.
      new_infos <- new_infos[complete.cases(new_infos), ]

      data_replaced <- replace_values(data_replaced, new_infos, variables_original = name_envir_col, variables_model = "ENVIRONMENT",
                                      id_original = rank_col, id_model = "TAXA", total_replacement = "NA")

      duration <- difftime(Sys.time(), start_time_processing)
      cat(paste("Informations processed in:", round(duration[[1]], 2), units(duration), "\n"))
      cat("\n")
      cat(paste0("Remaining NA: ", percent_missing(data_replaced), "%\n"))
      cat("\n")
      cat(separator)
      cat("\n")
      cat("\n")

    }

  }

  tibble(data_replaced)

}
# Source: Eliot-RUIZ/eDNAevaluation documentation, built on Dec. 17, 2021, 6:25 p.m.