R/multiple_evaluations.R

Defines functions multiple_evaluations

## Requirement: "tibble" + "dplyr"

multiple_evaluations = function(eco_taxo_data, bin_data, metabarcode_data, 
                                environment_list, climate_list, order_list,
                                intra_bin_multi_list, inter_bin_multi_list,
                                specific_pcr_fish, unspecific_pcr_all,
                                specific_pcr_fish_reference,
                                similarity_reference, reference_data,
                                save_taxo_analysis = F, save_size_subset = F){
  
  score_list = vector(mode = "list", length = length(similarity_reference$THRESHOLD))
  names(score_list) = similarity_reference$THRESHOLD
  score_list = lapply(score_list, function(x) {x = vector(mode = "list", length = length(environment_list)) })
  score_list = lapply(score_list, function(x) lapply(x, function(y) y = vector(mode = "list", length = length(order_list))))
  
  taxo_list = vector(mode = "list", length = length(similarity_reference$THRESHOLD))
  names(taxo_list) = similarity_reference$THRESHOLD
  taxo_list = lapply(taxo_list, function(x) {x = vector(mode = "list", length = length(environment_list)) })
  taxo_list = lapply(taxo_list, function(x) lapply(x, function(y) y = vector(mode = "list", length = length(order_list))))
  
  for(similarity in similarity_reference$THRESHOLD){

    names(score_list)[100 - similarity] = similarity
    
    names(taxo_list)[100 - similarity] = similarity
    
    for(i in 1:length(environment_list)){
      
      names(score_list[[100 - similarity]])[i] = paste0(environment_list[[i]], "_", climate_list[[i]])
      
      names(taxo_list[[100 - similarity]])[i] = paste0(environment_list[[i]], "_", climate_list[[i]])
      
      for(j in 1:length(order_list)){
        
        subset_found = F
        
        if(order_list[[j]] == "All"){
          
          if(all(environment_list[[i]] == "both") && all(climate_list[[i]] == "both")){
            
            subset_data = eco_taxo_data
            
            subset_specific_pcr_fish = specific_pcr_fish
            
            subset_specific_pcr_fish_reference = specific_pcr_fish_reference
            
            subset_unspecific_pcr_all = unspecific_pcr_all
            
          }
          
          else {
            
            subset_data = subset(eco_taxo_data, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]])
            
            subset_specific_pcr_fish = subset(specific_pcr_fish, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]])
            
            subset_specific_pcr_fish_reference = subset(specific_pcr_fish_reference, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]])
            
            subset_unspecific_pcr_all = subset(unspecific_pcr_all, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]])
            
          }
          
        } 
        
        else if(order_list[[j]] == "Others"){
          
          if(all(environment_list[[i]] == "both") && all(climate_list[[i]] == "both")){
            
            subset_data = subset(eco_taxo_data, !(ORDER %in% unlist(order_list)))
            
            subset_specific_pcr_fish = subset(specific_pcr_fish, !(ORDER %in% unlist(order_list)))
            
            subset_specific_pcr_fish_reference = subset(specific_pcr_fish_reference, !(ORDER %in% unlist(order_list)))
            
            subset_unspecific_pcr_all = unspecific_pcr_all
            
          }
          
          
          else {
            
            subset_data = subset(eco_taxo_data, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]] &
                                   !(ORDER %in% unlist(order_list)))
            
            subset_specific_pcr_fish = subset(specific_pcr_fish, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]] &
                                                !(ORDER %in% unlist(order_list)))
            
            subset_specific_pcr_fish_reference = subset(specific_pcr_fish_reference, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]] &
                                                          !(ORDER %in% unlist(order_list)))
            
            subset_unspecific_pcr_all = subset(unspecific_pcr_all, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]])
            
          }
          
        }
        
        else {
          
          if(all(environment_list[[i]] == "both") && all(climate_list[[i]] == "both")){
            
            subset_data = subset(eco_taxo_data, ORDER == order_list[[j]])
            
            subset_specific_pcr_fish = subset(specific_pcr_fish, ORDER == order_list[[j]])
            
            subset_specific_pcr_fish_reference = subset(specific_pcr_fish_reference, ORDER == order_list[[j]])
            
            subset_unspecific_pcr_all = unspecific_pcr_all
            
          }
          
          else {
            
            subset_data = subset(eco_taxo_data, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]] &
                                   ORDER == order_list[[j]])
            
            subset_specific_pcr_fish = subset(specific_pcr_fish, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]] &
                                                ORDER == order_list[[j]])
            
            subset_specific_pcr_fish_reference = subset(specific_pcr_fish_reference, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]] &
                                                          ORDER == order_list[[j]])
            
            subset_unspecific_pcr_all = subset(unspecific_pcr_all, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]])
            
          }
          
        }
        
        subset_metabarcodes = subset(metabarcode_data, ACCESSION %in% subset_data$ACCESSION)
        
        if(length(climate_list) == 1 && length(environment_list) == 1 && length(order_list) == 1){
          
          subset_inter_bin = inter_bin_multi_list
          
          subset_intra_bin = intra_bin_multi_list
          
        }
        
        else{
          
          subset_inter_bin = inter_bin_multi_list[[paste0(names(environment_list)[i], "_", order_list[[j]])]]
          
          subset_intra_bin = intra_bin_multi_list[[paste0(names(environment_list)[i], "_", order_list[[j]])]]
          
        }
        
        if(!is.null(subset_inter_bin) && !is.null(subset_intra_bin) && all(class(subset_inter_bin) != "numeric") && 
           all(class(subset_intra_bin) != "numeric")){
          
          subset_found = T
          
          if(length(subset_inter_bin) == 1) subset_inter_bin_list = subset_inter_bin[[1]]
          
          else subset_inter_bin_list = subset_inter_bin
          
          if(length(subset_intra_bin) == 1) subset_intra_bin_list = subset_intra_bin[[1]]
          
          else subset_intra_bin_list = subset_intra_bin
          
          subset_intra_bin_list = list(SIMILARITY = subset_intra_bin_list$SIMILARITY, 
                                       THRESHOLD = subset_intra_bin_list$THRESHOLD[[
                                         similarity_reference[which(similarity_reference$THRESHOLD == similarity), ]$INDEX]])
          
          cat(paste0("Performing primers evaluation for ", order_list[[j]], " in the region ", names(environment_list)[i],
                     " with a similarity threshold of ", similarity, "%."))
          cat("\n")
          cat("\n")
          
          if(length(subset_inter_bin_list) != length(unique(metabarcode_data$PRIMERS))) stop("Missing element in subset_inter_bin_list. Modify function!")
          
          if(length(subset_inter_bin_list) != length(unique(metabarcode_data$PRIMERS))) stop("Missing element in subset_intra_bin_list. Modify function!")
          
          score = evaluate_primers(specific_pcr_fish = subset_specific_pcr_fish, 
                                   unspecific_pcr_all = subset_unspecific_pcr_all,
                                   specific_pcr_fish_reference = subset_specific_pcr_fish_reference,
                                   metabarcode_data = subset_metabarcodes,
                                   intra_bin_list = subset_intra_bin_list,
                                   inter_bin_list = subset_inter_bin_list, 
                                   reference_data = reference_data, 
                                   taxo_bin_data = subset_data, 
                                   similarity_threshold = similarity, 
                                   plot = F)
          
          if(save_taxo_analysis) {
            
            subset_metabarcodes_taxo = tibble(merge(subset_metabarcodes, subset_data))
            number_cases_intra_bin = subset(subset_metabarcodes_taxo, PRIMERS == unique(subset_metabarcodes_taxo$PRIMERS)[1]) %>% group_by(BIN) %>% summarise(N = n())
            number_cases_intra_bin = sapply(split(number_cases_intra_bin, number_cases_intra_bin$BIN), 
                                            function(x) length(matrix(ncol = x$N, nrow = x$N)[lower.tri(matrix(ncol = x$N, nrow = x$N), diag = F)]))
            total_number_cases_intra_bin = sum(number_cases_intra_bin)
            
            number_cases_inter_bin = sapply(split(subset_data, subset_data$BIN), 
                                            function(x) length(unique(x$ACCESSION)))
            number_cases_inter_bin = tibble(data.frame(BIN_INDEX = 1:length(number_cases_inter_bin),
                                                       BIN = names(number_cases_inter_bin),  
                                                       NUMBER_ACCESSION = number_cases_inter_bin))
            number_cases_inter_bin = number_cases_inter_bin$NUMBER_ACCESSION %o% number_cases_inter_bin$NUMBER_ACCESSION
            total_number_cases_inter_bin = sum(number_cases_inter_bin[lower.tri(number_cases_inter_bin, diag = F)])
            
            taxonomic_resolution_summary = tibble(merge(score$INTRA_BIN_ANALYSIS[,1:3], score$INTER_BIN_ANALYSIS[,1:3]))
            taxonomic_resolution_summary$INTRA_BIN_NUMBER_CASES = total_number_cases_intra_bin
            taxonomic_resolution_summary$INTER_BIN_NUMBER_CASES = total_number_cases_inter_bin
            taxonomic_resolution_summary$TOTAL_NUMBER_CASES = total_number_cases_intra_bin + total_number_cases_inter_bin
            taxonomic_resolution_summary$NUMBER_MISIDENTIFIED_CASES = taxonomic_resolution_summary$NUMBER_WRONG_DISCRIMINED_CASES + 
              taxonomic_resolution_summary$NUMBER_WRONG_UNDISCRIMINED_CASES
            taxonomic_resolution_summary$PERCENT_MISIDENTIFIED_CASES = (taxonomic_resolution_summary$NUMBER_MISIDENTIFIED_CASES /
                                                                              (taxonomic_resolution_summary$TOTAL_NUMBER_CASES)) * 100
            taxonomic_resolution_summary$PERCENT_MISIDENTIFIED_CASES = (taxonomic_resolution_summary$NUMBER_MISIDENTIFIED_CASES /
                                                                              (taxonomic_resolution_summary$TOTAL_NUMBER_CASES)) * 100
            taxonomic_resolution_summary$PERCENT_MISIDENTIFIED_CASES = (taxonomic_resolution_summary$NUMBER_MISIDENTIFIED_CASES /
                                                                              (taxonomic_resolution_summary$TOTAL_NUMBER_CASES)) * 100
            taxonomic_resolution_summary$PERCENT_INTRA_BIN_ERRORS_IN_TOTAL_MISIDENTIFIED_CASES = (taxonomic_resolution_summary$NUMBER_WRONG_DISCRIMINED_CASES / 
                                                                                                        (taxonomic_resolution_summary$TOTAL_NUMBER_CASES)) * 100
            taxonomic_resolution_summary$PERCENT_INTER_BIN_ERRORS_IN_TOTAL_MISIDENTIFIED_CASES = (taxonomic_resolution_summary$NUMBER_WRONG_UNDISCRIMINED_CASES / 
                                                                                                        (taxonomic_resolution_summary$TOTAL_NUMBER_CASES)) * 100
            colnames(taxonomic_resolution_summary)[c(3,5)] = c("PERCENT_INTRA_BIN_ERRORS", "PERCENT_INTER_BIN_ERRORS")
            taxonomic_resolution_summary$PERCENT_MISIDENTIFIED_CASES_PER_ANALYSIS = taxonomic_resolution_summary$PERCENT_INTRA_BIN_ERRORS +
              taxonomic_resolution_summary$PERCENT_INTER_BIN_ERRORS
            taxonomic_resolution_summary = tibble(cbind(taxonomic_resolution_summary[,1:2], taxonomic_resolution_summary[,6], taxonomic_resolution_summary[,3:4],
                                                            taxonomic_resolution_summary[,7], taxonomic_resolution_summary[,5], taxonomic_resolution_summary[,13],
                                                            taxonomic_resolution_summary[,9], taxonomic_resolution_summary[,8], taxonomic_resolution_summary[,10],
                                                            taxonomic_resolution_summary[,11:12]))
            
            taxo = taxonomic_resolution_summary
            
          }
          
          if(save_size_subset) score = tibble(cbind(ORDER = order_list[[j]], FISH_DNA = nrow(subset_metabarcodes),
                                                    FISH_PCR = nrow(subset_specific_pcr_fish),
                                                    NON_FISH_PCR = nrow(subset_unspecific_pcr_all), score$SCORES_LONG))
          
          else score = tibble(cbind(ORDER = rep(order_list[[j]], nrow(score$SCORES_LONG)), score$SCORES_LONG))
          
          if(subset_found) {
            
            score_list[[100 - similarity]][[i]][[j]] = score
            
            names(score_list[[100 - similarity]][[i]])[j] = order_list[[j]]
            
            if(save_taxo_analysis){
              
              taxo_list[[100 - similarity]][[i]][[j]] = taxo
              
              names(taxo_list[[100 - similarity]][[i]])[j] = order_list[[j]]
              
            }
            
          }
          
        }
        
        else if(similarity == max(similarity_reference$THRESHOLD)) 
          warning(paste0("No sequences of ", order_list[[j]], " exist for the region ", names(environment_list)[i]))
        
      }
      
    }
    
  }
  
  final_scores = dplyr::bind_rows(lapply(score_list, function(x) bind_rows(lapply(x, function(y)
    bind_rows(y, .id = "ORDER")), .id = "CLIMATE")), .id = "SIMILARITY")
  final_scores$ENVIRONMENT = word(final_scores$CLIMATE, 1, sep = fixed("_"))
  final_scores$CLIMATE = word(final_scores$CLIMATE, 2, sep = fixed("_"))
  if(save_size_subset) final_scores = tibble(cbind(final_scores[,1:2], final_scores[,10], final_scores[,3:9]))
  else final_scores = tibble(cbind(final_scores[,1:2], final_scores[,7], final_scores[,3:6]))
  
  if(save_taxo_analysis){
    
    final_taxo = dplyr::bind_rows(lapply(taxo_list, function(x) bind_rows(lapply(x, function(y)
      bind_rows(y, .id = "ORDER")), .id = "CLIMATE")), .id = "SIMILARITY")
    final_taxo$ENVIRONMENT = word(final_taxo$CLIMATE, 1, sep = fixed("_"))
    final_taxo$CLIMATE = word(final_taxo$CLIMATE, 2, sep = fixed("_"))
    final_taxo = tibble(cbind(final_taxo[,1:2], final_taxo[,17], final_taxo[,3:16]))
    
  }
  
  if(!save_taxo_analysis) return(final_scores)
  
  else return(list(SCORES = final_scores, TAXO = final_taxo))
  
}
Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.