R/multiple_resolution_analysis.R

Defines functions multiple_resolution_analysis

## Requirement: "dplyr"

multiple_resolution_analysis = function(eco_taxo_data, bin_data, metabarcode_data, type, environment_list, climate_list, order_list,
                                        max_ratio_mean_length = 0.5, min_similarity_threshold = 90, max_similarity_threshold = 99){
  
  result_list = list()
  
  for(i in 1:length(environment_list)){
    
    for(j in 1:length(order_list)){
      
      cat("-------------------------------------------------------------------\n")
      cat("-------------------------------------------------------------------\n")
      
      cat(paste0("Environment: ", paste0(environment_list[[i]], collapse = "/"), " - Climate: ", 
                 paste0(climate_list[[i]], collapse = "/"), " - Order: ", order_list[[j]]))
      
      cat("\n")
      cat("\n")
      
      if(order_list[[j]] == "All"){
        
        if(all(environment_list[[i]] == "both") && all(climate_list[[i]] == "both"))
          subset_data = eco_taxo_data
        
        else subset_data = subset(eco_taxo_data, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]])
        
      } 
      
      else if(order_list[[j]] == "Others"){
        
        if(all(environment_list[[i]] == "both") && all(climate_list[[i]] == "both")) 
          subset_data = subset(eco_taxo_data, !(ORDER %in% unlist(order_list)))
        
        else subset_data = subset(eco_taxo_data, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]] &
                                    !(ORDER %in% unlist(order_list)))
        
      }
      
      else {
        
        if(all(environment_list[[i]] == "both") && all(climate_list[[i]] == "both"))
          subset_data = subset(eco_taxo_data, ORDER == order_list[[j]])
        
        else subset_data = subset(eco_taxo_data, ENVIRONMENT %in% environment_list[[i]] & CLIMATE %in% climate_list[[i]] &
                                    ORDER == order_list[[j]])
        
      }
      
      subset_metabarcodes = subset(metabarcode_data, ACCESSION %in% subset_data$ACCESSION)
      
      subset_bin_data = subset(bin_data, ACCESSION %in% subset_data$ACCESSION)
      
      if(nrow(subset_metabarcodes) != 0){
        
        subset_metabarcodes = subset(metabarcode_data, ACCESSION %in% subset_data$ACCESSION)
        
        if(type == "intra") {
          
          intra_bin_comparisons = intra_taxa_similarity(subset_metabarcodes, subset_bin_data, subset_data, taxa = "BIN")
          
          number_sequences_compared_per_bin = subset_bin_data %>% group_by(BIN) %>% summarise(N = n())
          
          number_comparisons_per_bin = sapply(split(number_sequences_compared_per_bin, number_sequences_compared_per_bin$BIN), 
                                              function(x) length(matrix(ncol = x$N, nrow = x$N)[lower.tri(matrix(ncol = x$N, nrow = x$N), diag = F)]))
          
          reference_bin = tibble(BIN = names(number_comparisons_per_bin), NUMBER_ACCESSION = number_sequences_compared_per_bin$N, 
                                 TOTAL_NUMBER_CASES = c(number_comparisons_per_bin))
          
          intra_bin_similarity = bind_rows(lapply(intra_bin_comparisons, function(y) bind_rows(lapply(split(y, y$BIN1), function(x) 
            tibble(MIN_SIMILARITY = summary(x$SIMILARITY)[1], Q1_SIMILARITY = summary(x$SIMILARITY)[2],
                   MEAN_SIMILARITY = summary(x$SIMILARITY)[3], Q3_SIMILARITY = summary(x$SIMILARITY)[4],
                   MAX_SIMILARITY = summary(x$SIMILARITY)[5], 
                   SD_SIMILARITY = sd(x$SIMILARITY))), .id = "BIN")), .id = "PRIMERS")
          
          intra_bin_similarity = tibble(merge(intra_bin_similarity, reference_bin))
          intra_bin_similarity = tibble(cbind(intra_bin_similarity[,2], intra_bin_similarity[,1], intra_bin_similarity[,10], intra_bin_similarity[,3:9]))
          
          intra_bin_threshold = list()
          for(similarity in min_similarity_threshold:max_similarity_threshold){
            
            intra_bin_threshold[[100 - similarity]] = bind_rows(lapply(intra_bin_comparisons, function(y) bind_rows(lapply(split(y, y$BIN1), function(x) 
              
              tibble(NUMBER_ACCESSION = length(unique(c(x$ACCESSION1, x$ACCESSION2))),
                     NUMBER_DISCRIMINED_CASES = nrow(subset(x, SIMILARITY < similarity))/2, 
                     TOTAL_NUMBER_CASES = nrow(x)/2, 
                     PERCENT_DISCRIMINED_CASES = nrow(subset(x, SIMILARITY < similarity)) / nrow(x) * 100,
                     MEAN_SIMILARITY_DISCRIMINED_CASES = mean(subset(x, SIMILARITY < similarity)$SIMILARITY),
                     SD_SIMILARITY_DISCRIMINED_CASES = sd(subset(x, SIMILARITY < similarity)$SIMILARITY))
              
            ), .id = "BIN")), .id = "PRIMERS")
            
            names(intra_bin_threshold)[100 - similarity] = similarity
            
          }
          
          result = list(SIMILARITY = intra_bin_similarity, THRESHOLD = intra_bin_threshold)
          
        }
        
        else if(type == "inter") {
          
          similarity_comparisons_list = pairwise_similarity(subset_metabarcodes, subset_bin_data, subset_data, 
                                                min_similarity_threshold = min_similarity_threshold, 
                                               max_similarity_threshold = max_similarity_threshold,
                                                number_hits_considered = c(0, 0))
          
          result = lapply(similarity_comparisons_list, function(x) x[which(x$BIN1 != x$BIN2),])
          
        }
        
        else stop('The argument "type" must be in "intra" or "inter" for intra-BIN or inter-BIN resolution analysis.')
        
        result_list[[paste0(names(environment_list[i]), "_", order_list[[j]])]] = result
        
      }
      
      else result_list[[paste0(names(environment_list[i]), "_", order_list[[j]])]] = 0
      
    }
    
  }
  
  return(result_list)
  
}
Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.