## Requirement: "DECIPHER" + "stringr" + "tibble"
remove_primers_blindly = function(data, forward_primer, reverse_primer, orientation = "1 x 2", verbose = T, max_diff_length,
max_prop_mismatch, max_prop_indels, rename_duplicates = T, print_duplicates = T,
remove_unmatched = "both"){
duplicates_index = function(previous_names){
freq = data.frame(table(previous_names))
rep_names = as.data.frame(lapply(freq, rep, freq$Freq))
rep_names = cbind(rep_names, POS = sort(match(previous_names, freq$previous_names)))
pos_data = data.frame(POS = match(previous_names, freq$previous_names), INITIAL_POS = seq(1:length(previous_names)))
output = merge(pos_data, rep_names)
output = output[!duplicated(output),]
output = cbind(output, INDEX = unlist(lapply(freq$Freq, function(x) seq(1:x))))
output = cbind(output, NAME = paste0(output$previous_names, ":", output$INDEX))[,-1]
colnames(output) = c("POSITION", "PREVIOUS_NAMES", "FREQUENCY", "INDEX", "NEW_NAMES")
output = output[order(output$POSITION),]
tibble(output)
}
if(!(remove_unmatched %in% c("both", "forward", "reverse", "none")))
stop('The argument "remove_unmatched" must be in "both", "forward", "reverse" or "none".')
if(orientation == "1 x 2") reverse_primer = paste(reverseComplement(DNAStringSet(reverse_primer)))
else if(orientation == "2 x 1") {
forward_primer2 = reverse_primer
reverse_primer = paste(reverseComplement(DNAStringSet(forward_primer)))
forward_primer = forward_primer2
}
else stop('As only one forward and one reverse primers can be provided, "orientation" must be in "1 x 2" or "2 x 1". Please check if the most represented value in the column "PRIMERS_USED" is comprised in those values.')
if(max_prop_indels == 0) indels_present = F
else indels_present = T
initial_data = data
all_primers_list = list(forward_primer, reverse_primer)
number_primer = sum(unlist(lengths(all_primers_list)))
number_loop = 0
names_sequences = names(data)
for(i in 1:2){
for(j in 1:length(all_primers_list[[i]])){
number_loop = number_loop + 1
if(verbose) {
start = Sys.time()
if(i == 1) cat(paste0("Forward primer number ", j, " on ", length(all_primers_list[[i]]), ": ",
paste(all_primers_list[[i]][j]), "\n"))
else cat(paste0("Reverse primer number ", j, " on ", length(all_primers_list[[i]]), ": ",
paste(all_primers_list[[i]][j]), "\n"))
cat("------------------------------------------------------------\n")
}
length_primer = nchar(paste(all_primers_list[[i]][j]))
maximum_mismatch = round(max_prop_mismatch * length_primer)
maximum_position = round(length_primer * (1 + max_prop_indels))
for(k in 1:length(data)){
if(verbose && k > 100 && k %% 100 == 0) cat(paste0(round(k / length(data) * 100), "% ... "))
if(verbose && k == length(data)) cat("\n------------------------------------------------------------\n")
found_pattern = matchPattern(paste(all_primers_list[[i]][j]), subject = paste(data[k]), with.indels = indels_present,
max.mismatch = maximum_mismatch)
if(length(found_pattern) != 0){
matched_forward = which(end(found_pattern) <= maximum_position)
matched_reverse = which(start(found_pattern) >= nchar(data[k]) - maximum_position)
if(k == 1) {
number_time_found = length(found_pattern)
if(i == 1 && length(matched_forward) != 0)
removed_primer = gsub(paste0(substring(data[k], 1, end(found_pattern)[min(matched_forward)]),
collapse = "|"), "", data[k], perl = T)
else if(i == 2 && length(matched_reverse) != 0)
removed_primer = gsub(paste0(substring(data[k], start(found_pattern)[max(matched_reverse)], nchar(data[k])),
collapse = "|"), "", data[k], perl = T)
else {
number_time_found = 0
removed_primer = paste(data[k])
}
}
else {
number_time_found_temporary = length(found_pattern)
if(i == 1 && length(matched_forward) != 0)
removed_primer = c(removed_primer, gsub(paste0(substring(data[k], 1, end(found_pattern)[min(matched_forward)]),
collapse = "|"), "", data[k], perl = T))
else if(i == 2 && length(matched_reverse) != 0)
removed_primer = c(removed_primer, gsub(paste0(substring(data[k], start(found_pattern)[max(matched_reverse)],
nchar(data[k])), collapse = "|"), "", data[k], perl = T))
else {
number_time_found_temporary = 0
removed_primer = c(removed_primer, paste(data[k]))
}
number_time_found = c(number_time_found, number_time_found_temporary)
}
}
else{
if(k == 1) {
number_time_found = 0
removed_primer = paste(data[k])
}
else {
number_time_found = c(number_time_found, 0)
removed_primer = c(removed_primer, paste(data[k]))
}
}
}
data = DNAStringSet(removed_primer)
if(i == 1 && j == 1){
replacement = data.frame(number_time_found)
colnames(replacement)[number_loop] = ifelse(i == 1, paste0("F_", j), paste0("R_", j))
}
else {
replacement = data.frame(replacement, number_time_found)
colnames(replacement)[number_loop] = ifelse(i == 1, paste0("F_", j), paste0("R_", j))
}
if(verbose){
end = Sys.time()
duration = difftime(end, start)
cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
cat("\n")
}
}
}
names(data) = names_sequences
if(length(data) != 0){
data_df = data.frame(ACCESSION = names(data),
SEQUENCES = paste(data))
replacement = tibble(replacement)
f_corrected = apply(replacement[, grep("F", colnames(replacement))], 1, function(x) all(x == 0))
r_corrected = apply(replacement[, grep("R", colnames(replacement))], 1, function(x) all(x == 0))
if(remove_unmatched == "both" && any(apply(data.frame(f_corrected, r_corrected), 1, all))){
unmatched_df = data_df[which(apply(data.frame(f_corrected, r_corrected), 1, all)), ]
data_df = data_df[-which(apply(data.frame(f_corrected, r_corrected), 1, all)), ]
}
else if(remove_unmatched == "forward" && any(f_corrected)){
unmatched_df = data_df[which(f_corrected), ]
data_df = data_df[-which(f_corrected), ]
}
else if(remove_unmatched == "reverse" && any(r_corrected)){
unmatched_df = data_df[which(r_corrected), ]
data_df = data_df[-which(r_corrected), ]
}
else {
unmatched_df = data.frame(ACCESSION = NA, SEQUENCES = NA)
data_df = data_df
}
replacement = cbind(names_sequences, replacement)
colnames(replacement) = c("NAMES", paste(unlist(all_primers_list)))
duplicated = which(duplicated(data_df$ACCESSION) & duplicated(data_df$SEQUENCES))
if(length(duplicated) != 0) data_df = data_df[-duplicated, ]
duplicated_accession = data_df[which(duplicated(data_df$ACCESSION) | duplicated(data_df$ACCESSION, fromLast = T)),]
if(rename_duplicates && nrow(duplicated_accession)) data_df$ACCESSION = duplicates_index(data_df$ACCESSION)$NEW_NAMES
if(nrow(duplicated_accession) != 0){
initial_data_df = tibble(data.frame(ACCESSION = names(initial_data),
LENGTH = width(initial_data),
SEQUENCES = paste(initial_data)))
duplicated_initial = which(duplicated(initial_data_df$ACCESSION) & duplicated(initial_data_df$SEQUENCES))
if(length(duplicated_initial) != 0) initial_data_df = initial_data_df[-duplicated_initial, ]
duplicated_initial_accession = initial_data_df[which(duplicated(initial_data_df$ACCESSION) |
duplicated(initial_data_df$ACCESSION, fromLast = T)),]
duplicated_initial_accession_ordered = duplicated_initial_accession[order(duplicated_initial_accession$ACCESSION,
-duplicated_initial_accession$LENGTH),]
if(print_duplicates){
duplicated_initial_accession_ordered_dna = DNAStringSet(duplicated_initial_accession_ordered$SEQUENCES)
names(duplicated_initial_accession_ordered_dna) = paste0(duplicated_initial_accession_ordered$ACCESSION,
word(names(data)[[i]], 1, sep = "_"))
disambiguated_primers = paste(unlist(Disambiguate(DNAStringSet(unlist(all_primers_list)))))
BrowseSeqs(duplicated_initial_accession_ordered_dna, pattern = disambiguated_primers)
}
}
final_dna = DNAStringSet(data_df$SEQUENCES)
names(final_dna) = data_df$ACCESSION
if(nrow(duplicated_accession) != 0) return(list(DNA = final_dna, TABLE = tibble(data_df),
DUPLICATED = duplicated_initial_accession_ordered,
UNMATCHED = tibble(unmatched_df)))
else return(list(DNA = final_dna, TABLE = tibble(data_df),
DUPLICATED = "NO DUPLICATES PRESENT",
UNMATCHED = tibble(unmatched_df)))
}
else return(list(DNA = NULL, TABLE = data.frame(ACCESSION = NA, SEQUENCES = NA),
DUPLICATED = NULL,
UNMATCHED = tibble(data.frame(ACCESSION = names_sequences, SEQUENCES = paste(initial_data)))))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.