R/evaluate_primers.R

Defines functions evaluate_primers

## Requirement: "tidyverse" 

evaluate_primers = function(specific_pcr_fish, unspecific_pcr_all, specific_pcr_fish_reference, 
                            metabarcode_data, intra_bin_list, inter_bin_list, 
                                 reference_data, taxo_bin_data, 
                                 similarity_threshold, plot = T, plot_name = NULL, main_title,
                                 order_genes_graph = c("COI", "12S", "16S", "CytB"), 
                                 colours_genes_graph = c("#AD7000", "#C500C1", "#00C4C5", "#CBD400"), 
                                 gene_name_space_major = c(1,10,21,22), gene_name_space_minor = c(0,0,15,98), 
                                 font_size_title = 22, padding_rect_title = c(25, 5)){
  
  ##### INTERNAL FUNCTIONS #####
  
  normalize_scores = function(x, order) {
    
    if(var(x, na.rm = T) != 0 && !all(is.na(c(x)))){
      
      initial = c(x)
      
      if(class(initial) == "list") initial = initial[[1]]
      
      pos_na = which(is.na(x))
      
      pos_no_na = which(!is.na(x))
      
      x = x[!is.na(x)]
      
      x = as.matrix(as.numeric(x))
      
      minAttr = apply(x, 2, min)
      
      maxAttr = apply(x, 2, max)
      
      x = sweep(x, 2, minAttr, FUN = "-")
      
      x = sweep(x, 2,  maxAttr-minAttr, "/")
      
      final = replace(initial, pos_no_na, x)
      
      if(order == "asc")  return(final)
      
      else if(order == "dsc") return(1 - final)
      
      else stop('The argument "order" can only be "asc" or "dsc".')
      
    }
    
    else{
      
      if(all(x == 0) && order == "dsc") return(rep(1, length(x)))
      
      else if(all(x == 0) && order == "asc") return(rep(0, length(x)))
      
      else if(all(x == 1) && order == "asc") return(rep(1, length(x)))
      
      else if(all(x == 1) && order == "dsc") return(rep(0, length(x)))
      
      else if(all(is.na(x))) return(rep(NA, length(x)))
      
      else { 
        
        warning("Impossible to normalize since all values are equal and they are not 0 or 1.")
        
        return(rep(NA, length(x)))
        
      }
      
    }
    
  }
  
  
  
  
  ##### INITIALISATION ####
  
  options(dplyr.summarise.inform = F)
  
  reference_primers_df = subset(reference_data, select = PRIMERS)
  
  reference_primers = reference_data$PRIMERS
  
  bin_selected = length(unique(taxo_bin_data$BIN))
  
  
  
  
  ##### STABILITY ANALYSIS #####
  
  cat("\n")
  cat("----------------STABILITY ANALYSIS---------------")
  cat("\n")
  start = Sys.time()
  
  
  ## Length of fragments
  
  mean_length = metabarcode_data %>% group_by(PRIMERS, .drop = FALSE) %>% 
    dplyr::summarise(MEAN_LENGTH = mean(LENGTH), SD_LENGTH = sd(LENGTH))
  
  
  ## Informativness (i.e. concentration of diagnostic nucleotides)
  
  aligned_sequences_splitted = split(metabarcode_data, metabarcode_data$PRIMERS)
  
  consensus_sequences_splitted = lapply(aligned_sequences_splitted, function(x)
    ConsensusSequence(DNAStringSet(x$FRAGMENT)))
  
  diagnostic_nucleotides_vector = sapply(consensus_sequences_splitted, function(x)
    sum(str_count(paste(x), pattern = names(IUPAC_CODE_MAP)[5:15])) / width(x)[1])
  
  diagnostic_nucleotides = tibble(data.frame(PRIMERS = names(diagnostic_nucleotides_vector),
                                             RATIO_DIAGNOSTIC_NUCLEOTIDES = diagnostic_nucleotides_vector))
  
  
  ## Combining all stability results together
  
  if(nrow(mean_length) != length(reference_primers)) mean_length = tibble(merge(reference_primers_df, mean_length, all = T))
  if(nrow(diagnostic_nucleotides) != length(reference_primers)) diagnostic_nucleotides = tibble(merge(reference_primers_df, diagnostic_nucleotides, all = T))
  
  stability_analysis = tibble(cbind(mean_length, diagnostic_nucleotides[,2]))
  
  end = Sys.time()
  
  duration = difftime(end, start)
  
  cat("\n")
  
  cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
  
  cat("\n")
  
  cat("-------------------------------------------------\n")
  
  cat("\n")
  
  
  
  
  ##### TAXONOMIC RESOLUTION ANALYSIS #####
  
  cat("\n")
  cat("----------------TAXONOMIC ANALYSIS---------------")
  cat("\n")
  
  ## Intra-BIN resolution
  
  intra_bin_list$SIMILARITY$NUMBER_ACCESSION = as.numeric(intra_bin_list$SIMILARITY$NUMBER_ACCESSION)
  
  intra_bin_list$SIMILARITY$MEAN_SIMILARITY = as.numeric(intra_bin_list$SIMILARITY$MEAN_SIMILARITY)
  
  intra_bin_list$SIMILARITY$SD_SIMILARITY = as.numeric(intra_bin_list$SIMILARITY$SD_SIMILARITY)
  
  intra_bin_list$THRESHOLD$PERCENT_DISCRIMINED_CASES = as.numeric(intra_bin_list$THRESHOLD$PERCENT_DISCRIMINED_CASES)
  
  intra_bin_list$THRESHOLD$MEAN_SIMILARITY_DISCRIMINED_CASES = as.numeric(intra_bin_list$THRESHOLD$MEAN_SIMILARITY_DISCRIMINED_CASES)
  
  intra_bin_list$THRESHOLD$SD_SIMILARITY_DISCRIMINED_CASES = as.numeric(intra_bin_list$THRESHOLD$SD_SIMILARITY_DISCRIMINED_CASES)
  
  intra_bin_similarity = split(intra_bin_list$SIMILARITY, intra_bin_list$SIMILARITY$PRIMERS)
  
  intra_bin_threshold = split(intra_bin_list$THRESHOLD, intra_bin_list$THRESHOLD$PRIMERS)
  
  number_wrong_discrimined_cases = sapply(intra_bin_threshold, function(x) sum(x$NUMBER_DISCRIMINED_CASES))
  
  percent_wrong_discrimined_cases = sapply(intra_bin_threshold, function(x) 
    sum(x$NUMBER_DISCRIMINED_CASES) / sum(x$TOTAL_NUMBER_CASES) * 100)
  
  number_bin_with_wrong_discrimined_cases = sapply(intra_bin_similarity, function(x) 
    nrow(subset(x, as.numeric(NUMBER_ACCESSION) > 1 & as.numeric(MEAN_SIMILARITY) < similarity_threshold)))
  
  percent_bin_with_wrong_discrimined_cases = sapply(intra_bin_similarity, function(x) 
    nrow(subset(x, as.numeric(NUMBER_ACCESSION) > 1 & as.numeric(MEAN_SIMILARITY) < similarity_threshold)) /
      nrow(subset(x, NUMBER_ACCESSION > 1)) * 100)
  
  mean_number_wrong_discrimined_cases_per_bin = sapply(intra_bin_threshold, function(x) 
    mean(x$NUMBER_DISCRIMINED_CASES, na.rm = T))
  
  percent_number_wrong_discrimined_cases_per_bin = sapply(intra_bin_threshold, function(x) 
    mean(x$PERCENT_DISCRIMINED_CASES, na.rm = T))
  
  mean_similarity_wrong_discrimined_cases_per_bin = sapply(intra_bin_threshold, function(x) 
    mean(x$MEAN_SIMILARITY_DISCRIMINED_CASES, na.rm = T))
  
  sd_similarity_wrong_discrimined_cases_per_bin = sapply(intra_bin_threshold, function(x) 
    mean(x$SD_SIMILARITY_DISCRIMINED_CASES, na.rm = T))
  
  overall_mean_similarity = sapply(intra_bin_similarity, function(x) mean(as.numeric(x$MEAN_SIMILARITY), na.rm = T))
  
  overall_sd_similarity = sapply(intra_bin_similarity, function(x) sd(as.numeric(x$MEAN_SIMILARITY)))
  
  intra_bin_resolution = tibble(PRIMERS = names(intra_bin_similarity),
                                NUMBER_WRONG_DISCRIMINED_CASES = number_wrong_discrimined_cases,
                                PERCENT_WRONG_DISCRIMINED_CASES = round(percent_wrong_discrimined_cases, 3),
                                NUMBER_BIN_WITH_WRONG_DISCRIMINED_CASES = number_bin_with_wrong_discrimined_cases,
                                PERCENT_BIN_WITH_WRONG_DISCRIMINED_CASES = round(percent_bin_with_wrong_discrimined_cases, 3),
                                MEAN_NUMBER_WRONG_DISCRIMINED_CASES_PER_BIN = round(mean_number_wrong_discrimined_cases_per_bin, 3),
                                MEAN_PERCENT_WRONG_DISCRIMINED_CASES_PER_BIN = round(percent_number_wrong_discrimined_cases_per_bin, 3),
                                MEAN_SIMILARITY_WRONG_DISCRIMINED_CASES_PER_BIN = round(mean_similarity_wrong_discrimined_cases_per_bin, 3),
                                SD_SIMILARITY_WRONG_DISCRIMINED_CASES_PER_BIN = round(sd_similarity_wrong_discrimined_cases_per_bin, 3),
                                OVERALL_MEAN_SIMILARITY = round(overall_mean_similarity, 3),
                                OVERALL_SD_SIMILARITY = round(overall_sd_similarity, 3))
  
  # percent_dissimilar_bin = round(sapply(intra_bin_similarity,
  #                                       function(x) nrow(subset(x, as.numeric(NUMBER_ACCESSION) > 1 & as.numeric(MEAN_SIMILARITY) < similarity_threshold)) /
  #                                         nrow(subset(x, NUMBER_ACCESSION > 1)) * 100), 3)
  # 
  # mean_similarity_similarity = round(sapply(intra_bin_similarity,
  #                                           function(x) mean(as.numeric(x$MEAN_SIMILARITY), na.rm = T)), 3)
  # 
  # se_similarity_similarity = round(sapply(intra_bin_similarity, function(x) mean(as.numeric(x$SE_SIMILARITY), na.rm = T)), 3)
  # 
  # intra_bin_similarity = tibble(data.frame(PRIMERS = names(mean_similarity_similarity),
  #                                          PERCENT_BIN_WITH_DISSIMILAR_SEQUENCES = percent_dissimilar_bin,
  #                                          MEAN_OVERALL_SIMILARITY = mean_similarity_similarity, 
  #                                          SE_OVERALL_SIMILARITY = se_similarity_similarity))
  # 
  # percent_dissimilar_cases = round(sapply(intra_bin_threshold,
  #                                         function(x) mean(x$PERCENT_DISCRIMINED_CASES, na.rm = T)), 3)
  # 
  # mean_similarity_cases = round(sapply(intra_bin_threshold, function(x) 
  #   mean(as.numeric(x$MEAN_SIMILARITY_DISCRIMINED_CASES), na.rm = T)), 3)
  # 
  # se_similarity_cases = round(sapply(intra_bin_threshold, function(x) 
  #   mean(as.numeric(x$SE_SIMILARITY_DISCRIMINED_CASES), na.rm = T)), 3)
  # 
  # intra_bin_threshold = tibble(data.frame(PRIMERS = names(mean_similarity_cases),
  #                                         MEAN_PERCENT_DISSIMILAR_CASES_PER_BIN = percent_dissimilar_cases,
  #                                         MEAN_DISCRIMINED_SIMILARITY = mean_similarity_cases, 
  #                                         SE_DISCRIMINED_SIMILARITY = se_similarity_cases))
  # 
  # intra_bin_resolution = tibble(merge(intra_bin_threshold, intra_bin_similarity))
  
  intra_bin_resolution = intra_bin_resolution[order(intra_bin_resolution$PRIMERS),]
  
  intra_bin_resolution$NUMBER_WRONG_DISCRIMINED_CASES[is.nan(intra_bin_resolution$NUMBER_WRONG_DISCRIMINED_CASES)] = 0
  intra_bin_resolution$PERCENT_WRONG_DISCRIMINED_CASES[is.nan(intra_bin_resolution$PERCENT_WRONG_DISCRIMINED_CASES)] = 0
  intra_bin_resolution$NUMBER_BIN_WITH_WRONG_DISCRIMINED_CASES[is.nan(intra_bin_resolution$NUMBER_BIN_WITH_WRONG_DISCRIMINED_CASES)] = 0
  intra_bin_resolution$PERCENT_BIN_WITH_WRONG_DISCRIMINED_CASES[is.nan(intra_bin_resolution$PERCENT_BIN_WITH_WRONG_DISCRIMINED_CASES)] = 0
  intra_bin_resolution$MEAN_NUMBER_WRONG_DISCRIMINED_CASES_PER_BIN[is.nan(intra_bin_resolution$MEAN_NUMBER_WRONG_DISCRIMINED_CASES_PER_BIN)] = 0
  intra_bin_resolution$MEAN_PERCENT_WRONG_DISCRIMINED_CASES_PER_BIN[is.nan(intra_bin_resolution$MEAN_PERCENT_WRONG_DISCRIMINED_CASES_PER_BIN)] = 0
  intra_bin_resolution$MEAN_SIMILARITY_WRONG_DISCRIMINED_CASES_PER_BIN[is.nan(intra_bin_resolution$MEAN_SIMILARITY_WRONG_DISCRIMINED_CASES_PER_BIN)] = 0
  intra_bin_resolution$SD_SIMILARITY_WRONG_DISCRIMINED_CASES_PER_BIN[is.nan(intra_bin_resolution$SD_SIMILARITY_WRONG_DISCRIMINED_CASES_PER_BIN)] = 0
  intra_bin_resolution$OVERALL_MEAN_SIMILARITY[is.nan(intra_bin_resolution$OVERALL_MEAN_SIMILARITY)] = 0
  intra_bin_resolution$OVERALL_SD_SIMILARITY[is.nan(intra_bin_resolution$OVERALL_SD_SIMILARITY)] = 0
  
  
  ## Inter-BIN resolution
  
  # if(nrow(inter_bin_list) == 0){
  # 
  #     na_details_inter_bin = data.frame(t(matrix(rep(NA, 13))))
  # 
  #     colnames(na_details_inter_bin) = c("SIMILARITY", "CLIMATE", "ENVIRONMENT", "ORDER", "PRIMERS", "MEAN_NUMBER_UNDISCRIMINED_CASES_PER_BIN",
  #                                        "MEAN_PERCENT_UNDISCRIMINED_CASES_PER_BIN", "MEAN_PERCENT_DISSIMILAR_CASES_PER_BIN", "MEAN_DISCRIMINED_SIMILARITY",
  #                                        "SE_DISCRIMINED_SIMILARITY", "PERCENT_BIN_WITH_DISSIMILAR_SEQUENCES", "MEAN_OVERALL_SIMILARITY", "SE_OVERALL_SIMILARITY")
  # 
  #     na_summary_inter_bin = data.frame(PRIMERS = reference_data$PRIMERS,
  #                                       MEAN_NUMBER_UNDISCRIMINED_CASES_PER_BIN = 0,
  #                                       MEAN_PERCENT_UNDISCRIMINED_CASES_PER_BIN = 0)
  # 
  #     inter_bin_output = list(SUMMARY = na_summary_inter_bin, DETAILS = na_details_inter_bin)
  # 
  #   }
  # 
  # else 
    #inter_bin_output = suppressWarnings(inter_bin_analysis(inter_bin_data_list = inter_bin_list, bin_data = taxo_bin_data, 
     #                                                         similarity_threshold = similarity_threshold))
  
  inter_bin_output = inter_bin_analysis(inter_bin_data_list = inter_bin_list, bin_data = taxo_bin_data, 
                                                           similarity_threshold = similarity_threshold)
  
  inter_bin_resolution = inter_bin_output[[1]][order(inter_bin_output[[1]]$PRIMERS),]
  
  
  ## Combining all taxonomic resolution results together
  
  if(nrow(inter_bin_resolution) != length(reference_primers)){
    
    inter_bin_resolution = tibble(merge(reference_primers_df, inter_bin_resolution, all = T))
    
    inter_bin_resolution[is.na(inter_bin_resolution)] = 0
    
  }
  
  if(nrow(intra_bin_resolution) != length(reference_primers))
    intra_bin_resolution = tibble(merge(reference_primers_df, intra_bin_resolution, all = T))
  
  taxonomic_resolution_analysis = tibble(cbind(inter_bin_resolution, intra_bin_resolution[,-1]))
  
  end = Sys.time()
  
  duration = difftime(end, start)
  
  cat("\n")
  
  cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
  
  cat("\n")
  
  cat("-------------------------------------------------\n")
  
  cat("\n")
  
  
  
  
  ##### AMPLIFICATION ANALYSIS #####
  
  cat("\n")
  cat("--------------AMPLIFICATION ANALYSIS-------------")
  cat("\n")
  start = Sys.time()
  
  
  ## Universality (i.e. number of species amplified compared to the maximum in the database)
  
  all_species_number = length(unique(specific_pcr_fish_reference$SPECIES))
  
  number_species = data.frame(specific_pcr_fish) %>% group_by(PRIMERS, .drop = FALSE) %>%
    dplyr::summarise(NUMBER_SPECIES = n_distinct(SPECIES), 
                     RATIO_SPECIES_AMPLIFIED = n_distinct(SPECIES) / all_species_number)
  
  
  ## Equitability (i.e. ratio of species amplified per order)
  
  species_per_order = specific_pcr_fish %>% group_by(PRIMERS, ORDER, .drop = FALSE) %>% dplyr::mutate(SPECIES_PER_ORDER = n_distinct(SPECIES))
  
  max_species_per_order = specific_pcr_fish_reference %>% group_by(ORDER, .drop = FALSE) %>% dplyr::mutate(MAX_SPECIES_PER_ORDER = n_distinct(SPECIES))
  
  max_species_per_order = tibble(data.frame(ORDER = max_species_per_order$ORDER,
                                        MAX_SPECIES_PER_ORDER = max_species_per_order$MAX_SPECIES_PER_ORDER))
  max_species_per_order = max_species_per_order[!duplicated(max_species_per_order),]
  
  species_per_order_ratio = tibble(merge(species_per_order, max_species_per_order)) %>% group_by(PRIMERS, .drop = FALSE) %>%
    dplyr::summarise(RATIO_SPECIES_AMPLIFIED_PER_ORDER = mean(SPECIES_PER_ORDER / MAX_SPECIES_PER_ORDER))
  
  
  ## Hybridation efficiency
  
  mean_hybridation_eff = specific_pcr_fish %>% group_by(PRIMERS, .drop = FALSE) %>%
    dplyr::summarise(MEAN_HYBRIDATION = mean(HYBRIDATION_EFFICIENCY), SD_HYBRIDATION = sd(HYBRIDATION_EFFICIENCY))
  
  
  ## Elongation efficiency
  
  mean_elongation_eff = specific_pcr_fish %>% group_by(PRIMERS, .drop = FALSE) %>%
    dplyr::summarise(MEAN_ELONGATION = mean(ELONGATION_EFFICIENCY), SD_ELONGATION = sd(ELONGATION_EFFICIENCY))
  
  
  ## Overall amplification efficiency
  
  mean_amplification_eff = specific_pcr_fish %>% group_by(PRIMERS, .drop = FALSE) %>%
    dplyr::summarise(MEAN_AMPLIFICATION = mean(AMPLIFICATION_EFFICIENCY), SD_AMPLIFICATION = sd(AMPLIFICATION_EFFICIENCY))
  
  
  # Removing potential NA lines

  number_species_na = apply(number_species[, "PRIMERS"], 1, is.na)
  number_species = number_species[!number_species_na,]
  
  species_per_order_ratio_na = apply(species_per_order_ratio[, "PRIMERS"], 1, is.na)
  species_per_order_ratio = species_per_order_ratio[!species_per_order_ratio_na,]
  
  mean_hybridation_eff_na = apply(mean_hybridation_eff[, "PRIMERS"], 1, is.na)
  mean_hybridation_eff = mean_hybridation_eff[!mean_hybridation_eff_na,]
  
  mean_elongation_eff_na = apply(mean_elongation_eff[, "PRIMERS"], 1, is.na)
  mean_elongation_eff = mean_elongation_eff[!mean_elongation_eff_na,]
  
  mean_amplification_eff_na = apply(mean_amplification_eff[, "PRIMERS"], 1, is.na) 
  mean_amplification_eff = mean_amplification_eff[!mean_amplification_eff_na,]
  

  ## Combining all amplification analysis results together
  
  if(nrow(number_species) != length(reference_primers)) number_species = tibble(merge(reference_primers_df, number_species, all = T))
  if(nrow(species_per_order_ratio) != length(reference_primers)) species_per_order_ratio = tibble(merge(reference_primers_df, species_per_order_ratio, all = T))
  if(nrow(mean_hybridation_eff) != length(reference_primers)) mean_hybridation_eff = tibble(merge(reference_primers_df, mean_hybridation_eff, all = T))
  if(nrow(mean_elongation_eff) != length(reference_primers)) mean_elongation_eff = tibble(merge(reference_primers_df, mean_elongation_eff, all = T))
  if(nrow(mean_amplification_eff) != length(reference_primers)) mean_amplification_eff = tibble(merge(reference_primers_df, mean_amplification_eff, all = T))
  
  amplification_analysis = tibble(cbind(number_species, tibble(species_per_order_ratio[,-1]),
                                        mean_hybridation_eff[,-1], mean_elongation_eff[,-1], mean_amplification_eff[,-1]))
  
  amplification_analysis[is.na(amplification_analysis)] = 0
  
  end = Sys.time()
  
  duration = difftime(end, start)
  
  cat("\n")
  
  cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
  
  cat("\n")
  
  cat("-------------------------------------------------\n")
  
  cat("\n")
  
  
  
  
  ##### SPECIFICITY ANALYSIS #####
  
  cat("\n")
  cat("----------------SPECIFITY ANALYSIS---------------")
  cat("\n")
  start = Sys.time()
  
  
  ## Calculating the ratio of sequences amplified belonging to Actinopterygii
  
  ratio_fish_seq = unspecific_pcr_all %>% group_by(PRIMERS, .drop = FALSE) %>% dplyr::mutate(NUMBER_SEQUENCES = n()) %>%
    filter(CLASS == "Actinopterygii") %>% dplyr::summarise(RATIO_SEQUENCES_FISH = n() / unique(NUMBER_SEQUENCES))
  
  
  ## Calculating the ratio of amplification of Actinopterygii sequences compared to the other taxa
  
  ratio_amplification_eff_fish = unspecific_pcr_all %>% group_by(PRIMERS, .drop = FALSE) %>% dplyr::mutate(MEAN_AMPLIFICATION = mean(AMPLIFICATION_EFFICIENCY)) %>%
    filter(CLASS == "Actinopterygii") %>% dplyr::summarise(RATIO_AMPLIFICATION_FISH = unique(MEAN_AMPLIFICATION) / mean(AMPLIFICATION_EFFICIENCY))
  
  mean_amplification_other = unspecific_pcr_all %>% group_by(PRIMERS, .drop = FALSE) %>% filter(CLASS != "Actinopterygii") %>%
    dplyr::summarise(MEAN_AMPLIFICATION_OTHER = mean(AMPLIFICATION_EFFICIENCY))
  
  mean_amplification_fish = unspecific_pcr_all %>% group_by(PRIMERS, .drop = FALSE) %>% filter(CLASS == "Actinopterygii") %>%
    dplyr::summarise(MEAN_AMPLIFICATION_FISH = mean(AMPLIFICATION_EFFICIENCY))
  
  ratio_amplification_fish_other = tibble(merge(mean_amplification_fish, mean_amplification_other)) %>%
    dplyr::mutate(RATIO_AMPLIFICATION_FISH_OTHER = MEAN_AMPLIFICATION_FISH / MEAN_AMPLIFICATION_OTHER)
  
  
  ## Combining all specificity analysis results together
  
  if(nrow(ratio_fish_seq) != length(reference_primers)) ratio_fish_seq = tibble(merge(reference_primers_df, ratio_fish_seq, all = T))
  if(nrow(ratio_amplification_fish_other) != length(reference_primers))
    ratio_amplification_fish_other = tibble(merge(reference_primers_df, ratio_amplification_fish_other, all = T))
  
  specificity_analysis = tibble(merge(ratio_fish_seq, ratio_amplification_fish_other[,c(1,4)]))
  
  specificity_analysis[is.na(specificity_analysis)] = 0
  
  end = Sys.time()
  
  duration = difftime(end, start)
  
  cat("\n")
  
  cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
  
  cat("\n")
  
  cat("-------------------------------------------------\n")
  
  cat("\n")
  
  
  
  
  ##### COMBINING ALL ANALYSIS RESULTS #####
  
  cat("\n")
  cat("----------------COMBINING RESULTS----------------")
  cat("\n")
  start = Sys.time()
  
  
  ## Ordering all tables
  
  reference_data = reference_data %>% arrange(PRIMERS)
  stability_analysis = stability_analysis %>% arrange(PRIMERS)
  amplification_analysis = amplification_analysis %>% arrange(PRIMERS)
  taxonomic_resolution_analysis = taxonomic_resolution_analysis %>% arrange(PRIMERS)
  specificity_analysis = specificity_analysis %>% arrange(PRIMERS)
  
  check_names = cbind(reference_data[,1], stability_analysis[,1], amplification_analysis[,1],
                      taxonomic_resolution_analysis[,1], specificity_analysis[,1])
  check_names = which(apply(check_names, 2, function(x) length(unique(x))) == 1)
  if(length(check_names) != 0) stop("Error during the binding of all results: analysis results tables are in different orders.")
  
  
  ## Binding all tables together
  
  primers_evaluation = tibble(cbind(subset(reference_data, select = c("PRIMERS", "GENE")), stability_analysis[,-1],
                                    amplification_analysis[,-1], taxonomic_resolution_analysis[,-1], specificity_analysis[,-1]))
  
  end = Sys.time()
  
  duration = difftime(end, start)
  
  cat("\n")
  
  cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
  
  cat("\n")
  
  cat("-------------------------------------------------\n")
  
  cat("\n")
  
  
  
  
  ##### PRIMER EVALUATION #####
  
  cat("\n")
  cat("-----------------COMPUTING SCORES----------------")
  cat("\n")
  start = Sys.time()
  
  ## Normalisation of each score
  
  primers_evaluation_normalized =
    tibble(cbind(primers_evaluation[,1:2], MEAN_LENGTH = normalize_scores(as.numeric(primers_evaluation$MEAN_LENGTH), order = "dsc"),
                 RATIO_DIAGNOSTIC_NUCLEOTIDES = normalize_scores(as.numeric(primers_evaluation$RATIO_DIAGNOSTIC_NUCLEOTIDES), order = "asc"),
                 INTRA_BIN = normalize_scores(as.numeric(primers_evaluation$NUMBER_WRONG_DISCRIMINED_CASES), order = "dsc"),
                 INTER_BIN = normalize_scores(as.numeric(primers_evaluation$NUMBER_WRONG_UNDISCRIMINED_CASES), order = "dsc"),
                 RATIO_SPECIES_AMPLIFIED = normalize_scores(as.numeric(primers_evaluation$RATIO_SPECIES_AMPLIFIED), order = "asc"),
                 RATIO_SPECIES_AMPLIFIED_PER_ORDER = normalize_scores(as.numeric(primers_evaluation$RATIO_SPECIES_AMPLIFIED_PER_ORDER), order = "asc"),
                 MEAN_HYBRIDATION = normalize_scores(as.numeric(primers_evaluation$MEAN_HYBRIDATION), order = "asc"),
                 MEAN_ELONGATION = normalize_scores(as.numeric(primers_evaluation$MEAN_ELONGATION), order = "asc"),
                 RATIO_SEQUENCES_FISH = normalize_scores(as.numeric(primers_evaluation$RATIO_SEQUENCES_FISH), order = "asc"),
                 RATIO_AMPLIFICATION_FISH_OTHER = normalize_scores(as.numeric(primers_evaluation$RATIO_AMPLIFICATION_FISH_OTHER), order = "asc")))
  
  
  ## Computing number of sequences amplified by each primer
  
  number_sequences = specific_pcr_fish %>% group_by(PRIMERS, .drop = FALSE) %>% dplyr::summarise(NUMBER_SEQUENCES = n())
  
  
  ## Commputing mean score on all criteria
  
  primers_mean_score = tibble(cbind(primers_evaluation_normalized[,1:2],
                                    MEAN_SCORE = apply(primers_evaluation_normalized[,-c(1:2)], 1, mean))) %>% arrange(-MEAN_SCORE)
  
  ## Pivoting table to wide format
  
  primers_evaluation_wide = data.frame(t(primers_evaluation_normalized))
  colnames(primers_evaluation_wide) = primers_evaluation_wide[2,]
  primers_evaluation_wide = tibble(data.frame(SCORES = row.names(primers_evaluation_wide[-c(1:2),]),
                                              apply(primers_evaluation_wide[-c(1:2),], 2, as.numeric)))
  colnames(primers_evaluation_wide) = c("SCORES", primers_evaluation$PRIMERS)
  primers_evaluation_wide = primers_evaluation_wide[c("SCORES", primers_mean_score$PRIMERS)]
  
  
  ## Pivoting table to long format
  
  primers_evaluation_long = primers_evaluation_normalized[,-2] %>% pivot_longer(!PRIMERS, names_to = "CRITERION", values_to = "SCORES")
  
  
  ## Assembling results in a list
  
  if(length(inter_bin_output) == 2) output_list = list(MEAN_SCORES = primers_mean_score,
                                                       SCORES_WIDE = primers_evaluation_wide,
                                                       SCORES_LONG = primers_evaluation_long,
                                                       STABILITY_ANALYSIS = stability_analysis,
                                                       INTRA_BIN_ANALYSIS = intra_bin_resolution,
                                                       INTER_BIN_ANALYSIS = inter_bin_resolution,
                                                       INTER_BIN_DETAILS = inter_bin_output[[2]],
                                                       AMPLIFICATION_ANALYSIS = amplification_analysis,
                                                       SPECIFICITY_ANALYSIS = specificity_analysis)
  
  else output_list = list(MEAN_SCORES = primers_mean_score,
                          SCORES_WIDE = primers_evaluation_wide,
                          SCORES_LONG = primers_evaluation_long,
                          STABILITY_ANALYSIS = stability_analysis,
                          INTRA_BIN_ANALYSIS = intra_bin_resolution,
                          INTER_BIN_ANALYSIS = inter_bin_resolution,
                          AMPLIFICATION_ANALYSIS = amplification_analysis,
                          SPECIFICITY_ANALYSIS = specificity_analysis)
  
  end = Sys.time()
  
  duration = difftime(end, start)
  
  cat("\n")
  
  cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
  
  cat("\n")
  
  cat("-------------------------------------------------\n")
  
  cat("\n")
  
  
  
  
  ##### PLOTTING RESULTS #####
  
  if(plot){
    
    cat("\n")
    cat("-----------------PLOTTING RESULTS----------------")
    cat("\n")
    start = Sys.time()
    
    if(is.null(plot_name)) plot_name = paste0(main_title, "_", similarity_threshold, "_evaluation.png")
    
    number_sequences = specific_pcr_fish %>% group_by(PRIMERS, .drop = FALSE) %>% dplyr::summarise(NUMBER_SEQUENCES = n())
    if(nrow(number_sequences) != length(reference_primers)) number_sequences = tibble(merge(reference_primers_df, number_sequences, all = T))
    
    number_sequences[is.na(number_sequences)] = 0
    
    evaluation_plot(evaluation_data = output_list, number_sequences = number_sequences, 
                    max_number_sequences = length(unique(specific_pcr_fish_reference$ACCESSION)), 
                    plot_name = plot_name, order_genes_graph = order_genes_graph, colours_genes_graph = colours_genes_graph, 
                    gene_name_space_major = gene_name_space_major, gene_name_space_minor = gene_name_space_minor,
                    main_title = main_title, font_size_title = font_size_title, padding_rect_title = padding_rect_title)
    
    end = Sys.time()
    
    duration = difftime(end, start)
    
    cat("\n")
    
    cat("The plot was successfully saved in the working directory!")
    
    cat("\n")
    
    cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
    
    cat("\n")
    
    cat("-------------------------------------------------\n")
    
    cat("\n")
    
  }
  
  
  
  ##### FINAL RESULTS #####
  
  return(output_list)
  
}
Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.