R/remove_primers_blindly.R

Defines functions remove_primers_blindly

## Requirement: "DECIPHER" + "stringr" + "tibble"

remove_primers_blindly = function(data, forward_primer, reverse_primer, orientation = "1 x 2", verbose = T, max_diff_length,
                          max_prop_mismatch, max_prop_indels, rename_duplicates = T, print_duplicates = T, 
                          remove_unmatched = "both"){
  
  duplicates_index = function(previous_names){
    
    freq = data.frame(table(previous_names))
    
    rep_names = as.data.frame(lapply(freq, rep, freq$Freq))
    
    rep_names = cbind(rep_names, POS = sort(match(previous_names, freq$previous_names)))
    
    pos_data = data.frame(POS = match(previous_names, freq$previous_names), INITIAL_POS = seq(1:length(previous_names)))
    
    output = merge(pos_data, rep_names)
    
    output = output[!duplicated(output),]
    
    output = cbind(output, INDEX = unlist(lapply(freq$Freq, function(x) seq(1:x))))
    
    output = cbind(output, NAME = paste0(output$previous_names, ":", output$INDEX))[,-1]
    
    colnames(output) = c("POSITION", "PREVIOUS_NAMES", "FREQUENCY", "INDEX", "NEW_NAMES")
    
    output = output[order(output$POSITION),]
    
    tibble(output)
    
  }
  
  if(!(remove_unmatched %in% c("both", "forward", "reverse", "none")))
    stop('The argument "remove_unmatched" must be in "both", "forward", "reverse" or "none".')
  
  if(orientation == "1 x 2") reverse_primer = paste(reverseComplement(DNAStringSet(reverse_primer)))
  
  else if(orientation == "2 x 1") {
    
    forward_primer2 = reverse_primer
    
    reverse_primer = paste(reverseComplement(DNAStringSet(forward_primer)))
    
    forward_primer = forward_primer2
    
  }
  
  else stop('As only one forward and one reverse primers can be provided, "orientation" must be in "1 x 2" or "2 x 1". Please check if the most represented value in the column "PRIMERS_USED" is comprised in those values.')
  
  if(max_prop_indels == 0) indels_present = F
  
  else indels_present = T
  
  initial_data = data
  
  all_primers_list = list(forward_primer, reverse_primer)
  
  number_primer = sum(unlist(lengths(all_primers_list)))
  
  number_loop = 0
  
  names_sequences = names(data)
  
  for(i in 1:2){ 
    
    for(j in 1:length(all_primers_list[[i]])){
      
      number_loop = number_loop + 1
      
      if(verbose) {
        
        start = Sys.time()
        
        if(i == 1) cat(paste0("Forward primer number ", j, " on ", length(all_primers_list[[i]]), ": ", 
                              paste(all_primers_list[[i]][j]), "\n"))
        
        else cat(paste0("Reverse primer number ", j, " on ", length(all_primers_list[[i]]), ": ", 
                        paste(all_primers_list[[i]][j]), "\n"))
        
        cat("------------------------------------------------------------\n")
        
      }
      
      length_primer = nchar(paste(all_primers_list[[i]][j]))
      
      maximum_mismatch = round(max_prop_mismatch * length_primer)
      
      maximum_position = round(length_primer * (1 + max_prop_indels))
      
      for(k in 1:length(data)){
        
        if(verbose && k > 100 && k %% 100 == 0) cat(paste0(round(k / length(data) * 100), "% ... "))
        
        if(verbose && k == length(data)) cat("\n------------------------------------------------------------\n")
        
        found_pattern = matchPattern(paste(all_primers_list[[i]][j]), subject = paste(data[k]), with.indels = indels_present,
                                     max.mismatch = maximum_mismatch)
        
        if(length(found_pattern) != 0){
          
          matched_forward = which(end(found_pattern) <= maximum_position)
          
          matched_reverse = which(start(found_pattern) >= nchar(data[k]) - maximum_position)
          
          if(k == 1) {
            
            number_time_found = length(found_pattern)
            
            if(i == 1 && length(matched_forward) != 0) 
              removed_primer = gsub(paste0(substring(data[k], 1, end(found_pattern)[min(matched_forward)]), 
                                           collapse = "|"), "", data[k], perl = T)
            
            else if(i == 2 && length(matched_reverse) != 0) 
              removed_primer = gsub(paste0(substring(data[k], start(found_pattern)[max(matched_reverse)], nchar(data[k])), 
                                           collapse = "|"), "", data[k], perl = T) 
            
            else {
              
              number_time_found = 0
              
              removed_primer = paste(data[k])
              
            }
            
          }
          
          else {
            
            number_time_found_temporary = length(found_pattern)
            
            if(i == 1 && length(matched_forward) != 0) 
              removed_primer = c(removed_primer, gsub(paste0(substring(data[k], 1, end(found_pattern)[min(matched_forward)]), 
                                                             collapse = "|"), "", data[k], perl = T))  
            
            else if(i == 2 && length(matched_reverse) != 0) 
              removed_primer = c(removed_primer, gsub(paste0(substring(data[k], start(found_pattern)[max(matched_reverse)], 
                                                                       nchar(data[k])), collapse = "|"), "", data[k], perl = T))  
            
            else {
              
              number_time_found_temporary = 0
              
              removed_primer = c(removed_primer, paste(data[k]))  
              
            }
            
            number_time_found = c(number_time_found, number_time_found_temporary)
            
          }
          
        }
        
        else{
          
          if(k == 1) {
            
            number_time_found = 0
            
            removed_primer = paste(data[k])  
            
          }
          
          else {
            
            number_time_found = c(number_time_found, 0)
            
            removed_primer = c(removed_primer, paste(data[k]))  
            
          }
          
        }
        
      }
      
      data = DNAStringSet(removed_primer)
      
      if(i == 1 && j == 1){
        
        replacement = data.frame(number_time_found)
        
        colnames(replacement)[number_loop] = ifelse(i == 1, paste0("F_", j), paste0("R_", j))
      }
      
      else {
        
        replacement = data.frame(replacement, number_time_found)
        
        colnames(replacement)[number_loop] = ifelse(i == 1, paste0("F_", j), paste0("R_", j))
        
      }
      
      if(verbose){
        
        end = Sys.time()
        
        duration = difftime(end, start)
        
        cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
        
        cat("\n")
        
      }
      
    }
    
  }
  
  names(data) = names_sequences
  
  if(length(data) != 0){
    
    data_df = data.frame(ACCESSION = names(data),
                         SEQUENCES = paste(data))
    
    replacement = tibble(replacement)
    
    f_corrected = apply(replacement[, grep("F", colnames(replacement))], 1, function(x) all(x == 0))
    
    r_corrected = apply(replacement[, grep("R", colnames(replacement))], 1, function(x) all(x == 0))
    
    if(remove_unmatched == "both" && any(apply(data.frame(f_corrected, r_corrected), 1, all))){
      
      unmatched_df = data_df[which(apply(data.frame(f_corrected, r_corrected), 1, all)), ]
      
      data_df = data_df[-which(apply(data.frame(f_corrected, r_corrected), 1, all)), ]
      
    }
    
    
    else if(remove_unmatched == "forward" && any(f_corrected)){
      
      unmatched_df = data_df[which(f_corrected), ]
      
      data_df = data_df[-which(f_corrected), ]
      
    }
    
    
    else if(remove_unmatched == "reverse" && any(r_corrected)){
      
      unmatched_df = data_df[which(r_corrected), ]
      
      data_df = data_df[-which(r_corrected), ]
      
    }
    
    
    else {
      
      unmatched_df = data.frame(ACCESSION = NA, SEQUENCES = NA)
      
      data_df = data_df
      
    }
    
    replacement = cbind(names_sequences, replacement)
    
    colnames(replacement) = c("NAMES", paste(unlist(all_primers_list)))
    
    duplicated = which(duplicated(data_df$ACCESSION) & duplicated(data_df$SEQUENCES))
    
    if(length(duplicated) != 0) data_df = data_df[-duplicated, ]
    
    duplicated_accession = data_df[which(duplicated(data_df$ACCESSION) | duplicated(data_df$ACCESSION, fromLast = T)),]
    
    if(rename_duplicates && nrow(duplicated_accession)) data_df$ACCESSION = duplicates_index(data_df$ACCESSION)$NEW_NAMES
    
    if(nrow(duplicated_accession) != 0){
      
      initial_data_df = tibble(data.frame(ACCESSION = names(initial_data), 
                                          LENGTH = width(initial_data),
                                          SEQUENCES = paste(initial_data)))
      
      duplicated_initial = which(duplicated(initial_data_df$ACCESSION) & duplicated(initial_data_df$SEQUENCES))
      
      if(length(duplicated_initial) != 0) initial_data_df = initial_data_df[-duplicated_initial, ]
      
      duplicated_initial_accession = initial_data_df[which(duplicated(initial_data_df$ACCESSION) | 
                                                             duplicated(initial_data_df$ACCESSION, fromLast = T)),]
      
      duplicated_initial_accession_ordered = duplicated_initial_accession[order(duplicated_initial_accession$ACCESSION, 
                                                                                -duplicated_initial_accession$LENGTH),]
      
      if(print_duplicates){
        
        duplicated_initial_accession_ordered_dna = DNAStringSet(duplicated_initial_accession_ordered$SEQUENCES)
        
        names(duplicated_initial_accession_ordered_dna) = paste0(duplicated_initial_accession_ordered$ACCESSION,
                                                                 word(names(data)[[i]], 1, sep = "_"))
        
        disambiguated_primers = paste(unlist(Disambiguate(DNAStringSet(unlist(all_primers_list)))))
        
        BrowseSeqs(duplicated_initial_accession_ordered_dna, pattern = disambiguated_primers)
        
      }
      
    }
    
    final_dna = DNAStringSet(data_df$SEQUENCES)
    
    names(final_dna) = data_df$ACCESSION
    
    if(nrow(duplicated_accession) != 0) return(list(DNA = final_dna, TABLE = tibble(data_df), 
                                                    DUPLICATED = duplicated_initial_accession_ordered,
                                                    UNMATCHED = tibble(unmatched_df)))
    
    else return(list(DNA = final_dna, TABLE = tibble(data_df), 
                     DUPLICATED = "NO DUPLICATES PRESENT",
                     UNMATCHED = tibble(unmatched_df)))
    
  }
  
  else return(list(DNA = NULL, TABLE = data.frame(ACCESSION = NA, SEQUENCES = NA), 
                   DUPLICATED = NULL,
                   UNMATCHED = tibble(data.frame(ACCESSION = names_sequences, SEQUENCES = paste(initial_data)))))
  
}
Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.