intra_taxa_similarity = function(data_metabarcodes, bin_data, taxo_data, taxa, search_whole_database = F, verbose = F){
options(dplyr.summarise.inform = F)
first = T
comparison_one_done = F
min_similarity_thresholds = c(0, 20, 50, 70, 75, 80, 85, 90, 95, 99)
max_similarity_thresholds = c(19.9, 39.9, 69.9, 74.9, 79.9, 84.9, 89.9, 94.9, 98.9, 100)
data_taxo = tibble(merge(data_metabarcodes, taxo_data))
if(var(sapply(split(data_taxo, data_taxo$PRIMERS), nrow)) != 0)
stop('The table filled in "data_metabarcodes" must contain equal number of rows accross primers (i.e. same accession).')
if(taxa == "BIN") {
number_sequences_compared = subset(data_taxo, PRIMERS == unique(data_taxo$PRIMERS)[1]) %>% group_by(BIN) %>% summarise(N = n())
number_comparisons = sapply(split(number_sequences_compared, number_sequences_compared$BIN),
function(x) length(matrix(ncol = x$N, nrow = x$N)[lower.tri(matrix(ncol = x$N, nrow = x$N), diag = F)]))
}
else if(taxa == "GENUS") {
number_sequences_compared = subset(data_taxo, PRIMERS == unique(data_taxo$PRIMERS)[1]) %>% group_by(GENUS, BIN) %>% summarise(N = n())
number_comparisons = sapply(split(number_sequences_compared, number_sequences_compared$GENUS),
function(x) sum(outer(x$N, x$N, "*")[lower.tri(outer(x$N, x$N, "*"), diag = F)]))
}
else if(taxa == "FAMILY"){
number_sequences_compared = subset(data_taxo, PRIMERS == unique(data_taxo$PRIMERS)[1]) %>% group_by(FAMILY, GENUS) %>% summarise(N = n())
number_comparisons = sapply(split(number_sequences_compared, number_sequences_compared$FAMILY),
function(x) sum(outer(x$N, x$N, "*")[lower.tri(outer(x$N, x$N, "*"), diag = F)]))
}
else if(taxa == "ORDER"){
number_sequences_compared = subset(data_taxo, PRIMERS == unique(data_taxo$PRIMERS)[1]) %>% group_by(ORDER, FAMILY) %>% summarise(N = n())
number_comparisons = sapply(split(number_sequences_compared, number_sequences_compared$ORDER),
function(x) sum(outer(x$N, x$N, "*")[lower.tri(outer(x$N, x$N, "*"), diag = F)]))
}
else stop('The argument "taxa" must be in "BIN", "GENUS", "FAMILY" or "ORDER".')
reference = tibble(TAXA = names(number_comparisons), TOTAL_NUMBER_CASES = c(number_comparisons))
reference_no_zero = subset(reference, TOTAL_NUMBER_CASES != 0)
reference_one = subset(reference, TOTAL_NUMBER_CASES == 1)
reference_higher_one = subset(reference, TOTAL_NUMBER_CASES > 1)
if(nrow(reference_one) != 0 && nrow(reference_one) < 100){
cat(paste0("Retrieving 1 row for ", nrow(reference_one), " taxons each\n"))
similarity_list_temporary = pairwise_similarity(data_taxo[which(data_taxo[[taxa]] %in% reference_one$TAXA),], bin_data, taxo_data,
min_similarity_threshold = 0, max_similarity_threshold = 100, verbose = verbose)
if(taxa == "BIN") similarity_list_temporary = lapply(similarity_list_temporary, function(x) x[which(x$BIN1 == x$BIN2),])
if(taxa == "GENUS") similarity_list_temporary = lapply(similarity_list_temporary, function(x)
x[which(x$GENUS1 == x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "FAMILY") similarity_list_temporary = lapply(similarity_list_temporary, function(x)
x[which(x$FAMILY1 == x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "ORDER") similarity_list_temporary = lapply(similarity_list_temporary, function(x)
x[which(x$ORDER1 == x$ORDER2 & x$FAMILY1 != x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
similarity_list = similarity_list_temporary
first = F
comparison_one_done = T
cat("-------------------------------------------------------------\n\n")
}
if(nrow(reference_higher_one) != 0 || nrow(reference_one) == 0 && !comparison_one_done){
if(!comparison_one_done) chosen_reference = reference_no_zero
else chosen_reference = reference_higher_one
for(i in 1:nrow(chosen_reference)){
cat(paste0("Retrieving ", chosen_reference$TOTAL_NUMBER_CASES[i], " rows for ",
chosen_reference$TAXA[i], " (taxon ", i, "/", length(chosen_reference$TAXA), ")\n"))
cat("-------------------------------------------------------------\n\n")
if(chosen_reference$TOTAL_NUMBER_CASES[i] > 10000){
cat("Large number of comparisons: spliting in smaller batches\n")
for(j in 1:10){
cat(paste0("Batch ", j, "/10: "))
similarity_list_temporary_subset = pairwise_similarity(data_taxo[which(data_taxo[[taxa]] == chosen_reference$TAXA[i]),], bin_data, taxo_data,
min_similarity_threshold = min_similarity_thresholds[j],
max_similarity_threshold = max_similarity_thresholds[j], verbose = verbose)
if(taxa == "BIN") similarity_list_temporary_subset = lapply(similarity_list_temporary_subset, function(x) x[which(x$BIN1 == x$BIN2),])
if(taxa == "GENUS") similarity_list_temporary_subset = lapply(similarity_list_temporary_subset, function(x)
x[which(x$GENUS1 == x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "FAMILY") similarity_list_temporary_subset = lapply(similarity_list_temporary_subset, function(x)
x[which(x$FAMILY1 == x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "ORDER") similarity_list_temporary_subset = lapply(similarity_list_temporary_subset, function(x)
x[which(x$ORDER1 == x$ORDER2 & x$FAMILY1 != x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(j == 1) similarity_list_temporary = similarity_list_temporary_subset
else {
old_names_subset = names(similarity_list_temporary)
similarity_list_temporary = lapply(names(similarity_list_temporary), function(x)
rbind(similarity_list_temporary[[x]], similarity_list_temporary_subset[[x]]))
names(similarity_list_temporary) = old_names_subset
}
cat("DONE\n")
}
}
else {
similarity_list_temporary = pairwise_similarity(data_taxo[which(data_taxo[[taxa]] == chosen_reference$TAXA[i]),], bin_data, taxo_data,
min_similarity_threshold = 0, max_similarity_threshold = 100, verbose = verbose)
if(taxa == "BIN") similarity_list_temporary = lapply(similarity_list_temporary, function(x) x[which(x$BIN1 == x$BIN2),])
if(taxa == "GENUS") similarity_list_temporary = lapply(similarity_list_temporary, function(x)
x[which(x$GENUS1 == x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "FAMILY") similarity_list_temporary = lapply(similarity_list_temporary, function(x)
x[which(x$FAMILY1 == x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "ORDER") similarity_list_temporary = lapply(similarity_list_temporary, function(x)
x[which(x$ORDER1 == x$ORDER2 & x$FAMILY1 != x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
}
if(any(sapply(similarity_list_temporary, function(x) nrow(x)/2) != chosen_reference$TOTAL_NUMBER_CASES[i])){
cat("Missing values detected during the 1st screening\n\n")
problematic_taxa = taxo_data[which(taxo_data[[taxa]] == chosen_reference$TAXA[i]),]
problematic_taxa1 = problematic_taxa
colnames(problematic_taxa1) = paste0(colnames(problematic_taxa1), "1")
problematic_taxa2 = problematic_taxa
colnames(problematic_taxa2) = paste0(colnames(problematic_taxa2), "2")
combined_problematic_taxa = tibble(merge(problematic_taxa1, problematic_taxa2, all = T))
if(taxa == "BIN") combined_problematic_taxa = combined_problematic_taxa[which(combined_problematic_taxa$BIN1 == combined_problematic_taxa$BIN2),]
if(taxa == "GENUS") combined_problematic_taxa = combined_problematic_taxa[which(combined_problematic_taxa$GENUS1 == combined_problematic_taxa$GENUS2 &
combined_problematic_taxa$BIN1 != combined_problematic_taxa$BIN2),]
if(taxa == "FAMILY") combined_problematic_taxa = combined_problematic_taxa[which(combined_problematic_taxa$FAMILY1 == combined_problematic_taxa$FAMILY2 &
combined_problematic_taxa$GENUS1 != combined_problematic_taxa$GENUS2 &
combined_problematic_taxa$BIN1 != combined_problematic_taxa$BIN2),]
if(taxa == "ORDER") combined_problematic_taxa = combined_problematic_taxa[which(combined_problematic_taxa$ORDER1 == combined_problematic_taxa$ORDER2 &
combined_problematic_taxa$FAMILY1 != combined_problematic_taxa$FAMILY2 &
combined_problematic_taxa$GENUS1 != combined_problematic_taxa$GENUS2 &
combined_problematic_taxa$BIN1 != combined_problematic_taxa$BIN2),]
if(nrow(combined_problematic_taxa)/2 != chosen_reference$TOTAL_NUMBER_CASES[i])
chosen_reference$TOTAL_NUMBER_CASES = replace(chosen_reference$TOTAL_NUMBER_CASES, i, nrow(combined_problematic_taxa)/2)
missing_accession = lapply(similarity_list_temporary, function(x) suppressMessages(unique(unlist(anti_join(select(combined_problematic_taxa,
c("ACCESSION1", "ACCESSION2")),
select(x, c("ACCESSION1", "ACCESSION2")))))))
missing_names = names(lengths(missing_accession)[lengths(missing_accession) != 0])
if(length(missing_names) != 0){
for(k in 1:length(missing_names)){
number_missing_comparisons = length(missing_accession[[missing_names[k]]])
cat(paste0("Processing ", number_missing_comparisons^2, " comparisons to retrieve missing rows for ", missing_names[k],"\n"))
if(number_missing_comparisons > 1000){
cat("Large number of comparisons: spliting in smaller batches\n")
for(m in 1:10){
cat(paste0("Batch ", m, "/10: "))
if(search_whole_database) hits = c(0,0)
else hits = c(0,32)
similarity_list_missing_subset = pairwise_similarity(subset(data_metabarcodes, PRIMERS == missing_names[k] &
ACCESSION %in% missing_accession[[missing_names[k]]]), bin_data, taxo_data,
min_similarity_threshold = min_similarity_thresholds[m],
max_similarity_threshold = max_similarity_thresholds[m],
number_hits_considered = hits, verbose = verbose)
if(taxa == "BIN") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x) x[which(x$BIN1 == x$BIN2),])
if(taxa == "GENUS") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x)
x[which(x$GENUS1 == x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "FAMILY") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x)
x[which(x$FAMILY1 == x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "ORDER") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x)
x[which(x$ORDER1 == x$ORDER2 & x$FAMILY1 != x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(m == 1) similarity_list_missing = similarity_list_missing_subset
else {
old_names_missing_subset = names(similarity_list_missing)
similarity_list_missing = lapply(names(similarity_list_missing), function(x)
rbind(similarity_list_missing[[x]], similarity_list_missing_subset[[x]]))
names(similarity_list_missing) = old_names_missing_subset
}
cat("DONE\n")
}
cat("\n")
}
else {
if(search_whole_database) hits = c(0,0)
else hits = c(0,32)
similarity_list_missing = pairwise_similarity(subset(data_metabarcodes, PRIMERS == missing_names[k] &
ACCESSION %in% missing_accession[[missing_names[k]]]), bin_data, taxo_data,
min_similarity_threshold = min_similarity_thresholds[m],
max_similarity_threshold = max_similarity_thresholds[m],
number_hits_considered = hits, verbose = verbose)
if(taxa == "BIN") similarity_list_missing = lapply(similarity_list_missing, function(x) x[which(x$BIN1 == x$BIN2),])
if(taxa == "GENUS") similarity_list_missing = lapply(similarity_list_missing, function(x)
x[which(x$GENUS1 == x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "FAMILY") similarity_list_missing = lapply(similarity_list_missing, function(x)
x[which(x$FAMILY1 == x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "ORDER") similarity_list_missing = lapply(similarity_list_missing, function(x)
x[which(x$ORDER1 == x$ORDER2 & x$FAMILY1 != x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
}
element_to_replace = which(names(similarity_list_temporary) == missing_names[k])
similarity_list_temporary[[element_to_replace]] = rbind(similarity_list_temporary[[element_to_replace]], similarity_list_missing[[1]])
similarity_list_temporary[[element_to_replace]] = similarity_list_temporary[[element_to_replace]][!duplicated(similarity_list_temporary[[element_to_replace]]),]
}
cat("\n")
}
else cat("wrongly estimated comparison number corrected!\n\n")
}
if(any(sapply(similarity_list_temporary, function(x) nrow(x)/2) != chosen_reference$TOTAL_NUMBER_CASES[i])){
cat("Missing values detected during the 2nd screening\n\n")
missing_accession = lapply(similarity_list_temporary, function(x) suppressMessages(unique(unlist(anti_join(select(combined_problematic_taxa,
c("ACCESSION1", "ACCESSION2")),
select(x, c("ACCESSION1", "ACCESSION2")))))))
missing_names = names(lengths(missing_accession)[lengths(missing_accession) != 0])
if(length(missing_names) != 0){
for(k in 1:length(missing_names)){
number_missing_comparisons = length(missing_accession[[missing_names[k]]])
cat(paste0("Processing ", number_missing_comparisons^2, " comparisons to retrieve missing rows for ", missing_names[k],"\n"))
if(number_missing_comparisons > 1000){
cat("Large number of comparisons: spliting in smaller batches\n")
for(m in 1:10){
cat(paste0("Batch ", m, "/10: "))
if(search_whole_database) hits = c(0,0)
else hits = c(0,1000)
similarity_list_missing_subset = pairwise_similarity(subset(data_metabarcodes, PRIMERS == missing_names[k] &
ACCESSION %in% missing_accession[[missing_names[k]]]), bin_data, taxo_data,
min_similarity_threshold = min_similarity_thresholds[m],
max_similarity_threshold = max_similarity_thresholds[m],
number_hits_considered = hits, verbose = verbose)
if(taxa == "BIN") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x) x[which(x$BIN1 == x$BIN2),])
if(taxa == "GENUS") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x)
x[which(x$GENUS1 == x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "FAMILY") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x)
x[which(x$FAMILY1 == x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "ORDER") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x)
x[which(x$ORDER1 == x$ORDER2 & x$FAMILY1 != x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(m == 1) similarity_list_missing = similarity_list_missing_subset
else {
old_names_missing_subset = names(similarity_list_missing)
similarity_list_missing = lapply(names(similarity_list_missing), function(x)
rbind(similarity_list_missing[[x]], similarity_list_missing_subset[[x]]))
names(similarity_list_missing) = old_names_missing_subset
}
cat("DONE\n")
}
cat("\n")
}
else {
if(search_whole_database) hits = c(0,0)
else hits = c(0,1000)
similarity_list_missing = pairwise_similarity(subset(data_metabarcodes, PRIMERS == missing_names[k] &
ACCESSION %in% missing_accession[[missing_names[k]]]), bin_data, taxo_data,
min_similarity_threshold = min_similarity_thresholds[m],
max_similarity_threshold = max_similarity_thresholds[m],
number_hits_considered = hits, verbose = verbose)
if(taxa == "BIN") similarity_list_missing = lapply(similarity_list_missing, function(x) x[which(x$BIN1 == x$BIN2),])
if(taxa == "GENUS") similarity_list_missing = lapply(similarity_list_missing, function(x)
x[which(x$GENUS1 == x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "FAMILY") similarity_list_missing = lapply(similarity_list_missing, function(x)
x[which(x$FAMILY1 == x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "ORDER") similarity_list_missing = lapply(similarity_list_missing, function(x)
x[which(x$ORDER1 == x$ORDER2 & x$FAMILY1 != x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
}
element_to_replace = which(names(similarity_list_temporary) == missing_names[k])
similarity_list_temporary[[element_to_replace]] = rbind(similarity_list_temporary[[element_to_replace]], similarity_list_missing[[1]])
similarity_list_temporary[[element_to_replace]] = similarity_list_temporary[[element_to_replace]][!duplicated(similarity_list_temporary[[element_to_replace]]),]
}
cat("\n")
}
}
if(any(sapply(similarity_list_temporary, function(x) nrow(x)/2) != chosen_reference$TOTAL_NUMBER_CASES[i])){
cat("Missing values detected during the 3rd screening\n\n")
missing_accession = lapply(similarity_list_temporary, function(x) suppressMessages(unique(unlist(anti_join(select(combined_problematic_taxa,
c("ACCESSION1", "ACCESSION2")),
select(x, c("ACCESSION1", "ACCESSION2")))))))
missing_names = names(lengths(missing_accession)[lengths(missing_accession) != 0])
if(length(missing_names) != 0){
for(k in 1:length(missing_names)){
number_missing_comparisons = length(missing_accession[[missing_names[k]]])
cat(paste0("Processing ", number_missing_comparisons^2, " comparisons to retrieve missing rows for ", missing_names[k],"\n"))
if(number_missing_comparisons > 1000){
cat("Large number of comparisons: spliting in smaller batches\n")
for(m in 1:10){
cat(paste0("Batch ", m, "/10: "))
if(search_whole_database) hits = c(0,0)
else hits = c(0,10000)
similarity_list_missing_subset = pairwise_similarity(subset(data_metabarcodes, PRIMERS == missing_names[k] &
ACCESSION %in% missing_accession[[missing_names[k]]]), bin_data, taxo_data,
min_similarity_threshold = min_similarity_thresholds[m],
max_similarity_threshold = max_similarity_thresholds[m],
number_hits_considered = hits, verbose = verbose)
if(taxa == "BIN") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x) x[which(x$BIN1 == x$BIN2),])
if(taxa == "GENUS") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x)
x[which(x$GENUS1 == x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "FAMILY") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x)
x[which(x$FAMILY1 == x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "ORDER") similarity_list_missing_subset = lapply(similarity_list_missing_subset, function(x)
x[which(x$ORDER1 == x$ORDER2 & x$FAMILY1 != x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(m == 1) similarity_list_missing = similarity_list_missing_subset
else {
old_names_missing_subset = names(similarity_list_missing)
similarity_list_missing = lapply(names(similarity_list_missing), function(x)
rbind(similarity_list_missing[[x]], similarity_list_missing_subset[[x]]))
names(similarity_list_missing) = old_names_missing_subset
}
cat("DONE\n")
}
cat("\n")
}
else {
if(search_whole_database) hits = c(0,0)
else hits = c(0,10000)
similarity_list_missing = pairwise_similarity(subset(data_metabarcodes, PRIMERS == missing_names[k] &
ACCESSION %in% missing_accession[[missing_names[k]]]), bin_data, taxo_data,
min_similarity_threshold = min_similarity_thresholds[m],
max_similarity_threshold = max_similarity_thresholds[m],
number_hits_considered = hits, verbose = verbose)
if(taxa == "BIN") similarity_list_missing = lapply(similarity_list_missing, function(x) x[which(x$BIN1 == x$BIN2),])
if(taxa == "GENUS") similarity_list_missing = lapply(similarity_list_missing, function(x)
x[which(x$GENUS1 == x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "FAMILY") similarity_list_missing = lapply(similarity_list_missing, function(x)
x[which(x$FAMILY1 == x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
if(taxa == "ORDER") similarity_list_missing = lapply(similarity_list_missing, function(x)
x[which(x$ORDER1 == x$ORDER2 & x$FAMILY1 != x$FAMILY2 & x$GENUS1 != x$GENUS2 & x$BIN1 != x$BIN2),])
}
element_to_replace = which(names(similarity_list_temporary) == missing_names[k])
similarity_list_temporary[[element_to_replace]] = rbind(similarity_list_temporary[[element_to_replace]], similarity_list_missing[[1]])
similarity_list_temporary[[element_to_replace]] = similarity_list_temporary[[element_to_replace]][!duplicated(similarity_list_temporary[[element_to_replace]]),]
}
cat("\n")
}
}
if(any(sapply(similarity_list_temporary, function(x) nrow(x)/2) != chosen_reference$TOTAL_NUMBER_CASES[i]))
warning(paste0("After 3 refinments on smaller subsets, and considering 1000 non-matching hits instead of 32 (default), not all comparisons were retrieved for", chosen_reference$TAXA[i]))
if(first) {
similarity_list = similarity_list_temporary
first = F
}
else{
old_names = names(similarity_list)
similarity_list = lapply(names(similarity_list), function(x) rbind(similarity_list[[x]], similarity_list_temporary[[x]]))
names(similarity_list) = old_names
}
cat("-------------------------------------------------------------\n\n")
}
return(similarity_list)
}
else if(nrow(reference_higher_one) == 0 && comparison_one_done) return(similarity_list)
else stop("No taxa to analyze!")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.