## Requirement: 'tibble' + 'DECIPHER' + 'stringr'
metabarcode_extraction = function(data_primer, data_aligned, remove_primers = T, max_mismatch_whole_primer, indels, length_col_name,
name_col_alignement = "ALIGNED_SEQUENCES", name_col_accession = "ACCESSION"){
index_letter = function(splited_dna){
letter = 0
pos_list = list()
for(r in 1:length(splited_dna)){
if(splited_dna[r] == "-") pos_list[r] = 0
else {
if(letter == 0){
letter = 1
pos_list[r] = letter
}
else{
letter = letter + 1
pos_list[r] = letter
}
}
}
letter_positions = unlist(pos_list)
names(letter_positions) = splited_dna
return(letter_positions)
}
metabarcode_data = data.frame()
for(i in 1:nrow(data_primer)){
position_start_aligned_list = list()
position_end_aligned_list = list()
primer_found = list()
name_primer = data_primer[i,]$SHORT_NAME
number_loop = 0
start = Sys.time()
cat(paste0("EXTRACTING PRIMER: ", name_primer, "\n"))
cat("-------------------------------------------------------------------\n")
if(data_primer[i,]$PRIMER_USED == "2 x 1") {
forward_primer = data_primer$REVERSE[i]
reverse_primer = paste(reverseComplement(DNAStringSet(data_primer$FORWARD[i])))
}
else {
forward_primer = data_primer$FORWARD[i]
reverse_primer = paste(reverseComplement(DNAStringSet(data_primer$REVERSE[i])))
}
primer_length = nchar(forward_primer) + nchar(reverse_primer)
known_length_fragment = mean(c(data_primer[i,]$ARTICLE_LENGTH, data_primer[i,][[length_col_name]]), na.rm = T)
length_aligned = mean(c(data_primer[i,]$ARTICLE_LENGTH, data_primer[i,][[length_col_name]]), na.rm = T)
for(j in 1:nrow(data_aligned)){
if(number_loop == 0) number_loop = 1
else number_loop = number_loop + 1
if(number_loop > ceiling(nrow(data_aligned) / 100) && number_loop %% ceiling(nrow(data_aligned) / 100) == 0)
cat(paste0(round(number_loop / nrow(data_aligned) * 100), "% ... "))
nucleotide_index = index_letter(strsplit(data_aligned[[name_col_alignement]][j], "")[[1]])
unaligned_sequence = gsub("-", "", data_aligned[j, ][[name_col_alignement]])
if(!remove_primers){
position_start = start(matchPattern(forward_primer, unaligned_sequence,
max.mismatch = max_mismatch_whole_primer, with.indels = indels))[1]
position_end = end(matchPattern(reverse_primer, unaligned_sequence,
max.mismatch = max_mismatch_whole_primer, with.indels = indels))[1]
}
else{
position_start = end(matchPattern(forward_primer, unaligned_sequence,
max.mismatch = max_mismatch_whole_primer, with.indels = indels)) + 1
position_end = start(matchPattern(reverse_primer, unaligned_sequence,
max.mismatch = max_mismatch_whole_primer, with.indels = indels)) - 1
}
if(!is.na(position_start[1]) && !is.na(position_end[1])) primer_found[j] = "all"
else if(is.na(position_start[1]) && !is.na(position_end[1])) primer_found[j] = "no forward"
else if(!is.na(position_start[1]) && is.na(position_end[1])) primer_found[j] = "no reverse"
else if(is.na(position_start[1]) && is.na(position_end[1])) primer_found[j] = "none"
if(length(position_start) >= 2 && length(position_end) >= 2){
combination_position = levels(interaction(position_start, position_end, sep = ":"))
for(k in 1:length(combination_position )){
start_position = as.numeric(sub("\\:.*", "", combination_position[k]))
end_position = as.numeric(sub(".*:", "", combination_position[k]))
start_position_alignement = which(nucleotide_index == start_position)
end_position_alignement = which(nucleotide_index == end_position)
aligned_fragment = substr(data_aligned[[name_col_alignement]][j], start_position_alignement, end_position_alignement)
fragment = gsub("-", "", aligned_fragment)
if(k == 1) length_fragment = nchar(fragment)
else length_fragment = c(length_fragment, nchar(fragment))
}
best_combination = combination_position[which.min(abs(length_fragment - known_length_fragment))]
position_start_aligned_list[j] = as.numeric(sub("\\:.*", "", best_combination))
position_end_aligned_list[j] = as.numeric(sub(".*:", "", best_combination))
}
else {
if(!is.na(position_start[1])) position_start_aligned_list[j] = which(nucleotide_index == position_start[1])
else position_start_aligned_list[j] = NA
if(!is.na(position_end[1])) position_end_aligned_list[j] = which(nucleotide_index == position_end[1])
else position_end_aligned_list[j] = NA
}
}
position_start_aligned = unlist(position_start_aligned_list)
position_end_aligned = unlist(position_end_aligned_list)
position_start_aligned = position_start_aligned[!is.na(position_start_aligned)]
position_end_aligned = position_end_aligned[!is.na(position_end_aligned)]
best_position_start_aligned = as.numeric(names(sort(table(position_start_aligned), decreasing = T))[1])
best_position_end_aligned = as.numeric(names(sort(table(position_end_aligned), decreasing = T))[1])
if(length(best_position_start_aligned) == 0 || length(best_position_end_aligned) == 0)
stop("No primers found in alignement with such parameters.")
else{
all_metabarcodes = substr(data_aligned[[name_col_alignement]], best_position_start_aligned, best_position_end_aligned)
all_metabarcodes = gsub("-", "", all_metabarcodes)
best_positions = paste0(best_position_start_aligned, ":", best_position_end_aligned)
all_primer_found = unlist(primer_found)
if(length(all_metabarcodes) != 0){
if(remove_primers) length_with_primers = nchar(all_metabarcodes) + primer_length
else length_with_primers = nchar(all_metabarcodes)
if(nrow(metabarcode_data) == 0) metabarcode_data = tibble(data.frame(ACCESSION = data_aligned[[name_col_accession]],
PRIMERS = name_primer,
PRIMER_FOUND = all_primer_found ,
POSITION_IN_ALIGNMENT = best_positions,
LENGTH_WITH_PRIMERS = length_with_primers,
LENGTH = nchar(all_metabarcodes),
FRAGMENT = all_metabarcodes))
else metabarcode_data = rbind(metabarcode_data, tibble(data.frame(ACCESSION = data_aligned[[name_col_accession]],
PRIMERS = name_primer,
PRIMER_FOUND = all_primer_found,
POSITION_IN_ALIGNMENT = best_positions,
LENGTH_WITH_PRIMERS = length_with_primers,
LENGTH = nchar(all_metabarcodes),
FRAGMENT = all_metabarcodes)))
}
else stop("No metabarcodes could be extracted with the best positions found.")
}
end = Sys.time()
duration = difftime(end, start)
cat("\n")
cat("-------------------------------------------------------------------\n")
cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
cat("\n")
cat("\n")
}
return(metabarcode_data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.