R/metabarcode_extraction.R

Defines functions metabarcode_extraction

## Requirement: 'tibble' + 'DECIPHER' + 'stringr'

metabarcode_extraction = function(data_primer, data_aligned, remove_primers = T, max_mismatch_whole_primer, indels, length_col_name,
                                  name_col_alignement = "ALIGNED_SEQUENCES", name_col_accession = "ACCESSION"){
  
  index_letter = function(splited_dna){
    
    letter = 0
    
    pos_list = list()
    
    for(r in 1:length(splited_dna)){
      
      if(splited_dna[r] == "-") pos_list[r] = 0
      
      else {
        
        if(letter == 0){
          
          letter = 1
          
          pos_list[r] = letter
          
        }
        
        else{
          
          letter = letter + 1
          
          pos_list[r] = letter
          
        }
        
      }
      
    }
    
    letter_positions = unlist(pos_list)
    
    names(letter_positions) = splited_dna
    
    return(letter_positions)
    
  }
  
  metabarcode_data = data.frame()
  
  for(i in 1:nrow(data_primer)){
    
    position_start_aligned_list = list()
    
    position_end_aligned_list = list()
    
    primer_found = list()
    
    name_primer = data_primer[i,]$SHORT_NAME
    
    number_loop = 0
    
    start = Sys.time()
    
    cat(paste0("EXTRACTING PRIMER: ", name_primer, "\n"))
    
    cat("-------------------------------------------------------------------\n")
    
    if(data_primer[i,]$PRIMER_USED == "2 x 1") {
      
      forward_primer = data_primer$REVERSE[i]
      
      reverse_primer = paste(reverseComplement(DNAStringSet(data_primer$FORWARD[i])))
      
    }
    
    else {
      
      forward_primer = data_primer$FORWARD[i]
      
      reverse_primer = paste(reverseComplement(DNAStringSet(data_primer$REVERSE[i])))
      
    }
    
    primer_length = nchar(forward_primer) + nchar(reverse_primer)
    
    known_length_fragment = mean(c(data_primer[i,]$ARTICLE_LENGTH, data_primer[i,][[length_col_name]]), na.rm = T) 
    
    length_aligned = mean(c(data_primer[i,]$ARTICLE_LENGTH, data_primer[i,][[length_col_name]]), na.rm = T) 
    
    for(j in 1:nrow(data_aligned)){
      
      if(number_loop == 0) number_loop = 1
      
      else number_loop = number_loop + 1
      
      if(number_loop > ceiling(nrow(data_aligned) / 100) && number_loop %% ceiling(nrow(data_aligned) / 100) == 0) 
        cat(paste0(round(number_loop / nrow(data_aligned) * 100), "% ... "))
      
      nucleotide_index = index_letter(strsplit(data_aligned[[name_col_alignement]][j], "")[[1]])
      
      unaligned_sequence = gsub("-", "", data_aligned[j, ][[name_col_alignement]])
      
      if(!remove_primers){
        
        position_start = start(matchPattern(forward_primer, unaligned_sequence, 
                                            max.mismatch = max_mismatch_whole_primer, with.indels = indels))[1]
        
        position_end = end(matchPattern(reverse_primer, unaligned_sequence, 
                                        max.mismatch = max_mismatch_whole_primer, with.indels = indels))[1]
        
      }
      
      else{
        
        position_start = end(matchPattern(forward_primer, unaligned_sequence, 
                                          max.mismatch = max_mismatch_whole_primer, with.indels = indels)) + 1
        
        position_end = start(matchPattern(reverse_primer, unaligned_sequence, 
                                          max.mismatch = max_mismatch_whole_primer, with.indels = indels)) - 1
        
      }
      
      if(!is.na(position_start[1]) && !is.na(position_end[1])) primer_found[j] = "all"
      
      else if(is.na(position_start[1]) && !is.na(position_end[1])) primer_found[j] = "no forward"
      
      else if(!is.na(position_start[1]) && is.na(position_end[1])) primer_found[j] = "no reverse"
      
      else if(is.na(position_start[1]) && is.na(position_end[1])) primer_found[j] = "none"
      
      if(length(position_start) >= 2 && length(position_end) >= 2){
        
        combination_position = levels(interaction(position_start, position_end, sep = ":"))
        
        for(k in 1:length(combination_position )){
          
          start_position = as.numeric(sub("\\:.*", "", combination_position[k]))
          
          end_position = as.numeric(sub(".*:", "", combination_position[k]))
          
          start_position_alignement = which(nucleotide_index ==  start_position)
          
          end_position_alignement = which(nucleotide_index ==  end_position)
          
          aligned_fragment = substr(data_aligned[[name_col_alignement]][j], start_position_alignement, end_position_alignement)
          
          fragment = gsub("-", "", aligned_fragment)
          
          if(k == 1) length_fragment = nchar(fragment)
          
          else length_fragment = c(length_fragment, nchar(fragment))
          
        }
        
        best_combination = combination_position[which.min(abs(length_fragment - known_length_fragment))]
        
        position_start_aligned_list[j] = as.numeric(sub("\\:.*", "", best_combination))
        
        position_end_aligned_list[j] = as.numeric(sub(".*:", "", best_combination))
        
      }
      
      else {
        
        if(!is.na(position_start[1])) position_start_aligned_list[j] = which(nucleotide_index == position_start[1])
        
        else position_start_aligned_list[j] = NA
        
        if(!is.na(position_end[1])) position_end_aligned_list[j] = which(nucleotide_index == position_end[1])
        
        else position_end_aligned_list[j] = NA
        
        
      }
      
    }
    
    position_start_aligned = unlist(position_start_aligned_list)
    
    position_end_aligned = unlist(position_end_aligned_list)
    
    position_start_aligned = position_start_aligned[!is.na(position_start_aligned)]
    
    position_end_aligned = position_end_aligned[!is.na(position_end_aligned)]
    
    best_position_start_aligned = as.numeric(names(sort(table(position_start_aligned), decreasing = T))[1])
    
    best_position_end_aligned = as.numeric(names(sort(table(position_end_aligned), decreasing = T))[1])
    
    if(length(best_position_start_aligned) == 0 || length(best_position_end_aligned) == 0) 
      stop("No primers found in alignement with such parameters.")
    
    else{
      
      all_metabarcodes = substr(data_aligned[[name_col_alignement]], best_position_start_aligned, best_position_end_aligned)
      
      all_metabarcodes = gsub("-", "", all_metabarcodes)
      
      best_positions = paste0(best_position_start_aligned, ":", best_position_end_aligned)
      
      all_primer_found = unlist(primer_found)
      
      if(length(all_metabarcodes) != 0){
        
        if(remove_primers) length_with_primers = nchar(all_metabarcodes) + primer_length
        
        else length_with_primers = nchar(all_metabarcodes)
        
        if(nrow(metabarcode_data) == 0) metabarcode_data = tibble(data.frame(ACCESSION = data_aligned[[name_col_accession]], 
                                                                             PRIMERS = name_primer,
                                                                             PRIMER_FOUND = all_primer_found ,
                                                                             POSITION_IN_ALIGNMENT = best_positions,
                                                                             LENGTH_WITH_PRIMERS = length_with_primers,
                                                                             LENGTH = nchar(all_metabarcodes), 
                                                                             FRAGMENT = all_metabarcodes))
        
        else metabarcode_data = rbind(metabarcode_data, tibble(data.frame(ACCESSION = data_aligned[[name_col_accession]], 
                                                                          PRIMERS = name_primer,
                                                                          PRIMER_FOUND = all_primer_found,
                                                                          POSITION_IN_ALIGNMENT = best_positions,
                                                                          LENGTH_WITH_PRIMERS = length_with_primers,
                                                                          LENGTH = nchar(all_metabarcodes), 
                                                                          FRAGMENT = all_metabarcodes)))
      }
      
      else stop("No metabarcodes could be extracted with the best positions found.")
      
    }
    
    end = Sys.time()
    
    duration = difftime(end, start)
    
    cat("\n")
    
    cat("-------------------------------------------------------------------\n")
    
    cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
    
    cat("\n")
    
    cat("\n")
    
  }
  
  return(metabarcode_data)
  
}
Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.