eDNAevaluation: Metabarcodes Evaluation

## Requirement: "tidyverse"

intra_taxa_analysis = function(data_comparisons, data_metabarcodes, data_taxo, mode, taxa = "BIN", 
                               separate_highest_interval = T, similarity_table = T, similarity_thresholds = 90:99){
  
  options(dplyr.summarise.inform = F)
  
  data_taxo = tibble(merge(data_metabarcodes, data_taxo, all = T))
  
  if(taxa == "BIN") max_reference = data_taxo %>% group_by(PRIMERS, BIN) %>% summarise(NUMBER_ACCESSION = n())
  
  else if(taxa == "GENUS") max_reference = data_taxo %>% group_by(PRIMERS, GENUS) %>% summarise(NUMBER_ACCESSION = n())
  
  else if(taxa == "FAMILY") max_reference = data_taxo %>% group_by(PRIMERS, FAMILY) %>% summarise(NUMBER_ACCESSION = n())
  
  else if(taxa == "ORDER") max_reference = data_taxo %>% group_by(PRIMERS, ORDER) %>% summarise(NUMBER_ACCESSION = n())
  
  else stop('Only "BIN", "GENUS", "FAMILY" & "ORDER" are supported in the argument "taxa".')
  
  max_reference_primers = split(max_reference, max_reference$PRIMERS)
  
  cat(paste0("CREATING A REFERENCE MATRIX: "))
  
  start1 = Sys.time()
  
  for(i in 1:length(max_reference_primers)){
    
    max_reference_primers_list = split(max_reference_primers[[i]], max_reference_primers[[i]][[taxa]])
    
    max_comparisons = list()
    
    for(j in 1:length(max_reference_primers_list)){
      
      reference_matrix = matrix(ncol = max_reference_primers_list[[j]]$NUMBER_ACCESSION, 
                                nrow = max_reference_primers_list[[j]]$NUMBER_ACCESSION)
      
      max_comparisons[j] = length(reference_matrix[lower.tri(reference_matrix, diag = F)])
      
    }
    
    if(i == 1) reference_intra = tibble(cbind(max_reference_primers[[i]], TOTAL_NUMBER_CASES = unlist(max_comparisons)))
    
    else reference_intra = rbind(reference_intra, tibble(cbind(max_reference_primers[[i]], TOTAL_NUMBER_CASES = unlist(max_comparisons))))
    
  }
  
  end1 = Sys.time()
  
  duration1 = difftime(end1, start1)
  
  cat("DONE\n")
  
  cat("-------------------------------------------------------------------\n")
  
  cat("\n")
  
  cat(paste("Time taken:", round(duration1[[1]], 2), units(duration1), "\n"))
  
  cat("-------------------------------------------------------------------\n")
  
  cat("\n")
  
  cat("\n")
  
  reference_intra_no_zero = subset(reference_intra, TOTAL_NUMBER_CASES != 0)
  
  
  ## Selecting the appropriate comparisons
  
  cat("SELECTING THE APPROPRIATE COMPARISONS: ")
  
  start2 = Sys.time()
  
  intra_list = lapply(data_comparisons, function(x) x[which(x[[paste0(taxa, "1")]] == x[[paste0(taxa, "2")]]),])
  
  end2 = Sys.time()
  
  duration2 = difftime(end2, start2)
  
  cat("DONE\n")
  
  cat("-------------------------------------------------------------------\n")
  
  cat("\n")
  
  cat(paste("Time taken:", round(duration2[[1]], 2), units(duration2), "\n"))
  
  cat("-------------------------------------------------------------------\n")
  
  cat("\n")
  
  cat("\n")
  
  
  ## Computing the intra-BIN analysis for each threshold
  
  if(mode == "ANALYSIS" & taxa == "BIN"){
    
    cat(paste0("COMPUTING THE INTRA-", taxa, " ANALYSIS: "))
    
    start3 = Sys.time()
    
    intra_threshold = list()
    
    for(similarity in similarity_thresholds){
      
      intra_threshold_primer = bind_rows(lapply(intra_list, function(y) tibble(TAXA = names(sapply(split(y, y[[paste0(taxa, "1")]]), function(x) nrow(x))),
                                                                               NUMBER_DISCRIMINED_CASES = sapply(split(y, y[[paste0(taxa, "1")]]), function(x) 
                                                                                 ceiling(nrow(x[which(x$SIMILARITY < similarity), ]) / 2)))), .id = "PRIMERS") 
      # Divided by 2 since vsearch return duplicated comparisons (i.e. SEQ1 - SEQ2 & SEQ2 - SEQ1) 
      
      colnames(intra_threshold_primer)[2] = taxa
      
      intra_threshold_df = tibble(merge(reference_intra_no_zero, intra_threshold_primer))
      
      intra_threshold_df = tibble(cbind(intra_threshold_df[,1:3], intra_threshold_df[,5], intra_threshold_df[,4], 
                                        PERCENT_DISCRIMINED_CASES = round(intra_threshold_df$NUMBER_DISCRIMINED_CASES / 
                                                                            intra_threshold_df$TOTAL_NUMBER_CASES * 100, 2)))
      
      intra_threshold[[100 - similarity]] = intra_threshold_df
      
      names(intra_threshold)[100 - similarity] = similarity
      
    }
    
    end3 = Sys.time()
    
    duration3 = difftime(end3, start3)
    
    cat("DONE\n")
    
    cat("-------------------------------------------------------------------\n")
    
    cat("\n")
    
    cat(paste("Time taken:", round(duration3[[1]], 2), units(duration3), "\n"))
    
    cat("-------------------------------------------------------------------\n")
    
    cat("\n")
    
    cat("\n")
    
  }
  
  if(mode == "ANALYSIS" & taxa == "BIN") return(intra_threshold)
  
  else if(mode == "ANALYSIS" & taxa != "BIN") stop('The mode "ANALYSIS" can only be executed if "taxa" is set to "BIN".')
  
  else if(mode == "INTERVAL"){
    
    cat(paste0("COMPUTING INTERVALS: "))
    
    start4 = Sys.time()
    
    if(similarity_table) similarity_df = bind_rows(lapply(intra_list, function(x) subset(x, select = SIMILARITY)), .id = "PRIMERS")

    reference_number_comparisons = sapply(split(reference_intra_no_zero, reference_intra_no_zero$PRIMERS), function(x) sum(x$TOTAL_NUMBER_CASES))
    
    if(separate_highest_interval){
      
      intervals_names = data.frame(INTERVALS = c(paste0("[0,", min(similarity_thresholds), ")"), # Interval with omitted cases by vsearch
                                                 names(table(cut(intra_list[[1]]$SIMILARITY, (min(similarity_thresholds)):(max(similarity_thresholds)+1), right = F))),
                                                 as.character(max(similarity_thresholds)+1))) # right = F to exclude the rigth interval, in order to get the highest alone
      
      lowest_interval = t(data.frame((((sapply(split(reference_intra_no_zero, reference_intra_no_zero$PRIMERS), function(x) sum(x$TOTAL_NUMBER_CASES)) -
                                          sapply(intra_list, function(x) floor(nrow(x) / 2))) / # <90% -> total number of comparisons substracted by comparisons above 90
                                         reference_number_comparisons) * 100)))
      
      centre_intervals = (sapply(intra_list, function(x) cbind(floor(table(cut(x$SIMILARITY, (min(similarity_thresholds)):(max(similarity_thresholds)+1), right = F)) / 2))) /
                            reference_number_comparisons)*100 
      
      highest_interval = t(data.frame((sapply(intra_list, function(x) floor(length(which(x$SIMILARITY == (max(similarity_thresholds)+1))) / 2 )) / 
                                         reference_number_comparisons)*100))
      
      intra_intervals = tibble(cbind(intervals_names, rbind(lowest_interval, centre_intervals, highest_interval)))
      intra_intervals = intra_intervals %>% pivot_longer(!INTERVALS, names_to = "PRIMERS", values_to = "PERCENTAGE_COMPARISONS_IN_INTERVAL")
      
    }
    
    else{
      
      intervals_names = data.frame(INTERVALS = c(paste0("[0,", min(similarity_thresholds), "]"), # Name the interval with omitted cases by vsearch
                                                 names(table(cut(intra_list[[1]]$SIMILARITY, (min(similarity_thresholds)):(max(similarity_thresholds)+1), right = T)))))
      
      lowest_interval = t(data.frame((((sapply(split(reference_intra_no_zero, reference_intra_no_zero$PRIMERS), function(x) sum(x$TOTAL_NUMBER_CASES)) -
                                          sapply(intra_list, function(x) floor(nrow(x) / 2))) + # <90% -> total number of comparisons substracted by comparisons above 90
                                         sapply(intra_list, function(x) floor(length(which(x$SIMILARITY == min(similarity_thresholds))) / 2))) / # +90% because not included in interval
                                        reference_number_comparisons)*100))
      
      upper_intervals = (sapply(intra_list, function(x) cbind(floor(table(cut(x$SIMILARITY, (min(similarity_thresholds)):(max(similarity_thresholds)+1), right = T)) / 2))) /
                           reference_number_comparisons)*100 # right = TRUE to include the rigth interval till 100
      
      intra_intervals = tibble(cbind(intervals_names, rbind(lowest_interval, upper_intervals)))
      intra_intervals = intra_intervals %>% pivot_longer(!INTERVALS, names_to = "PRIMERS", values_to = "PERCENTAGE_COMPARISONS_IN_INTERVAL")
      
    }

    end4 = Sys.time()
    
    duration4 = difftime(end4, start4)
    
    cat("DONE\n")
    
    cat("-------------------------------------------------------------------\n")
    
    cat("\n")
    
    cat(paste("Time taken:", round(duration4[[1]], 2), units(duration4), "\n"))
    
    cat("-------------------------------------------------------------------\n")
    
    cat("\n")
    
    cat("\n")

    if(similarity_table) return(list(DATA = similarity_df, INTERVALS = intra_intervals))
    
    else return(intra_intervals)
    
  }
  
  else stop('The argument "mode" can only be set to "INTERVAL" or "ANALYSIS".')
  
}