## Requirement: "tidyverse"
intra_taxa_analysis = function(data_comparisons, data_metabarcodes, data_taxo, mode, taxa = "BIN",
separate_highest_interval = T, similarity_table = T, similarity_thresholds = 90:99){
options(dplyr.summarise.inform = F)
data_taxo = tibble(merge(data_metabarcodes, data_taxo, all = T))
if(taxa == "BIN") max_reference = data_taxo %>% group_by(PRIMERS, BIN) %>% summarise(NUMBER_ACCESSION = n())
else if(taxa == "GENUS") max_reference = data_taxo %>% group_by(PRIMERS, GENUS) %>% summarise(NUMBER_ACCESSION = n())
else if(taxa == "FAMILY") max_reference = data_taxo %>% group_by(PRIMERS, FAMILY) %>% summarise(NUMBER_ACCESSION = n())
else if(taxa == "ORDER") max_reference = data_taxo %>% group_by(PRIMERS, ORDER) %>% summarise(NUMBER_ACCESSION = n())
else stop('Only "BIN", "GENUS", "FAMILY" & "ORDER" are supported in the argument "taxa".')
max_reference_primers = split(max_reference, max_reference$PRIMERS)
cat(paste0("CREATING A REFERENCE MATRIX: "))
start1 = Sys.time()
for(i in 1:length(max_reference_primers)){
max_reference_primers_list = split(max_reference_primers[[i]], max_reference_primers[[i]][[taxa]])
max_comparisons = list()
for(j in 1:length(max_reference_primers_list)){
reference_matrix = matrix(ncol = max_reference_primers_list[[j]]$NUMBER_ACCESSION,
nrow = max_reference_primers_list[[j]]$NUMBER_ACCESSION)
max_comparisons[j] = length(reference_matrix[lower.tri(reference_matrix, diag = F)])
}
if(i == 1) reference_intra = tibble(cbind(max_reference_primers[[i]], TOTAL_NUMBER_CASES = unlist(max_comparisons)))
else reference_intra = rbind(reference_intra, tibble(cbind(max_reference_primers[[i]], TOTAL_NUMBER_CASES = unlist(max_comparisons))))
}
end1 = Sys.time()
duration1 = difftime(end1, start1)
cat("DONE\n")
cat("-------------------------------------------------------------------\n")
cat("\n")
cat(paste("Time taken:", round(duration1[[1]], 2), units(duration1), "\n"))
cat("-------------------------------------------------------------------\n")
cat("\n")
cat("\n")
reference_intra_no_zero = subset(reference_intra, TOTAL_NUMBER_CASES != 0)
## Selecting the appropriate comparisons
cat("SELECTING THE APPROPRIATE COMPARISONS: ")
start2 = Sys.time()
intra_list = lapply(data_comparisons, function(x) x[which(x[[paste0(taxa, "1")]] == x[[paste0(taxa, "2")]]),])
end2 = Sys.time()
duration2 = difftime(end2, start2)
cat("DONE\n")
cat("-------------------------------------------------------------------\n")
cat("\n")
cat(paste("Time taken:", round(duration2[[1]], 2), units(duration2), "\n"))
cat("-------------------------------------------------------------------\n")
cat("\n")
cat("\n")
## Computing the intra-BIN analysis for each threshold
if(mode == "ANALYSIS" & taxa == "BIN"){
cat(paste0("COMPUTING THE INTRA-", taxa, " ANALYSIS: "))
start3 = Sys.time()
intra_threshold = list()
for(similarity in similarity_thresholds){
intra_threshold_primer = bind_rows(lapply(intra_list, function(y) tibble(TAXA = names(sapply(split(y, y[[paste0(taxa, "1")]]), function(x) nrow(x))),
NUMBER_DISCRIMINED_CASES = sapply(split(y, y[[paste0(taxa, "1")]]), function(x)
ceiling(nrow(x[which(x$SIMILARITY < similarity), ]) / 2)))), .id = "PRIMERS")
# Divided by 2 since vsearch return duplicated comparisons (i.e. SEQ1 - SEQ2 & SEQ2 - SEQ1)
colnames(intra_threshold_primer)[2] = taxa
intra_threshold_df = tibble(merge(reference_intra_no_zero, intra_threshold_primer))
intra_threshold_df = tibble(cbind(intra_threshold_df[,1:3], intra_threshold_df[,5], intra_threshold_df[,4],
PERCENT_DISCRIMINED_CASES = round(intra_threshold_df$NUMBER_DISCRIMINED_CASES /
intra_threshold_df$TOTAL_NUMBER_CASES * 100, 2)))
intra_threshold[[100 - similarity]] = intra_threshold_df
names(intra_threshold)[100 - similarity] = similarity
}
end3 = Sys.time()
duration3 = difftime(end3, start3)
cat("DONE\n")
cat("-------------------------------------------------------------------\n")
cat("\n")
cat(paste("Time taken:", round(duration3[[1]], 2), units(duration3), "\n"))
cat("-------------------------------------------------------------------\n")
cat("\n")
cat("\n")
}
if(mode == "ANALYSIS" & taxa == "BIN") return(intra_threshold)
else if(mode == "ANALYSIS" & taxa != "BIN") stop('The mode "ANALYSIS" can only be executed if "taxa" is set to "BIN".')
else if(mode == "INTERVAL"){
cat(paste0("COMPUTING INTERVALS: "))
start4 = Sys.time()
if(similarity_table) similarity_df = bind_rows(lapply(intra_list, function(x) subset(x, select = SIMILARITY)), .id = "PRIMERS")
reference_number_comparisons = sapply(split(reference_intra_no_zero, reference_intra_no_zero$PRIMERS), function(x) sum(x$TOTAL_NUMBER_CASES))
if(separate_highest_interval){
intervals_names = data.frame(INTERVALS = c(paste0("[0,", min(similarity_thresholds), ")"), # Interval with omitted cases by vsearch
names(table(cut(intra_list[[1]]$SIMILARITY, (min(similarity_thresholds)):(max(similarity_thresholds)+1), right = F))),
as.character(max(similarity_thresholds)+1))) # right = F to exclude the rigth interval, in order to get the highest alone
lowest_interval = t(data.frame((((sapply(split(reference_intra_no_zero, reference_intra_no_zero$PRIMERS), function(x) sum(x$TOTAL_NUMBER_CASES)) -
sapply(intra_list, function(x) floor(nrow(x) / 2))) / # <90% -> total number of comparisons substracted by comparisons above 90
reference_number_comparisons) * 100)))
centre_intervals = (sapply(intra_list, function(x) cbind(floor(table(cut(x$SIMILARITY, (min(similarity_thresholds)):(max(similarity_thresholds)+1), right = F)) / 2))) /
reference_number_comparisons)*100
highest_interval = t(data.frame((sapply(intra_list, function(x) floor(length(which(x$SIMILARITY == (max(similarity_thresholds)+1))) / 2 )) /
reference_number_comparisons)*100))
intra_intervals = tibble(cbind(intervals_names, rbind(lowest_interval, centre_intervals, highest_interval)))
intra_intervals = intra_intervals %>% pivot_longer(!INTERVALS, names_to = "PRIMERS", values_to = "PERCENTAGE_COMPARISONS_IN_INTERVAL")
}
else{
intervals_names = data.frame(INTERVALS = c(paste0("[0,", min(similarity_thresholds), "]"), # Name the interval with omitted cases by vsearch
names(table(cut(intra_list[[1]]$SIMILARITY, (min(similarity_thresholds)):(max(similarity_thresholds)+1), right = T)))))
lowest_interval = t(data.frame((((sapply(split(reference_intra_no_zero, reference_intra_no_zero$PRIMERS), function(x) sum(x$TOTAL_NUMBER_CASES)) -
sapply(intra_list, function(x) floor(nrow(x) / 2))) + # <90% -> total number of comparisons substracted by comparisons above 90
sapply(intra_list, function(x) floor(length(which(x$SIMILARITY == min(similarity_thresholds))) / 2))) / # +90% because not included in interval
reference_number_comparisons)*100))
upper_intervals = (sapply(intra_list, function(x) cbind(floor(table(cut(x$SIMILARITY, (min(similarity_thresholds)):(max(similarity_thresholds)+1), right = T)) / 2))) /
reference_number_comparisons)*100 # right = TRUE to include the rigth interval till 100
intra_intervals = tibble(cbind(intervals_names, rbind(lowest_interval, upper_intervals)))
intra_intervals = intra_intervals %>% pivot_longer(!INTERVALS, names_to = "PRIMERS", values_to = "PERCENTAGE_COMPARISONS_IN_INTERVAL")
}
end4 = Sys.time()
duration4 = difftime(end4, start4)
cat("DONE\n")
cat("-------------------------------------------------------------------\n")
cat("\n")
cat(paste("Time taken:", round(duration4[[1]], 2), units(duration4), "\n"))
cat("-------------------------------------------------------------------\n")
cat("\n")
cat("\n")
if(similarity_table) return(list(DATA = similarity_df, INTERVALS = intra_intervals))
else return(intra_intervals)
}
else stop('The argument "mode" can only be set to "INTERVAL" or "ANALYSIS".')
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.