# File: R/decipher_rapid_alignment.R

# Defines function: decipher_rapid_alignment

## Requirements: the "stringr", "tibble" and "DECIPHER" packages must be loaded.

#' Rapidly align DNA sequences, one primer set at a time, with DECIPHER
#'
#' Each primer set is aligned with `AlignSeqs()` using a flat, hand-built guide
#' tree (all sequences are leaves of a single root, ordered by decreasing
#' length) and with `iterations = 0` and `refinements = 0`. A primer set
#' holding a single sequence is returned unaligned.
#'
#' Side effects: CSV files are written to the current working directory.
#' * More than one primer set: `all_<global_name>_aligned.csv` is always
#'   written; with `progressive_saving = TRUE` each set is additionally written
#'   to `<set>_<global_name>_aligned.csv` as soon as it is aligned.
#' * Exactly one primer set: `<set><global_name>_aligned.csv` is written only
#'   when `progressive_saving = FALSE`.
#'
#' @param data A data frame (split internally on `primer_col_name`) or a named
#'   list of data frames, one per primer set.
#' @param progressive_saving Logical; see "Side effects" above.
#' @param global_name Character string inserted in the output file names.
#' @param sequence_col_name Name of the column holding the DNA sequences.
#' @param primer_col_name Name of the column holding the primer set.
#' @param accession_col_name Name of the column holding the accession numbers.
#' @param ... Further arguments forwarded to `AlignSeqs()`. `guideTree`,
#'   `iterations`, `refinements`, `verbose` and `processors` are already set
#'   here, so passing them again raises an error.
#'
#' @return A tibble with columns `ACCESSION`, `PRIMER_SET` (text before the
#'   first "_" in the list element name) and `ALIGNED_SEQUENCES`.
decipher_rapid_alignment <- function(data, progressive_saving = TRUE, global_name,
                                     sequence_col_name = "SEQUENCES",
                                     primer_col_name = "PRIMER_SET",
                                     accession_col_name = "ACCESSION", ...) {

  required_cols <- c(sequence_col_name, primer_col_name, accession_col_name)

  # Stops with an explicit message when any required column is absent.
  check_columns <- function(tables) {
    absent <- unique(unlist(lapply(
      tables, function(tbl) setdiff(required_cols, colnames(tbl))
    )))
    if (length(absent) > 0L) {
      stop('Each data frame must contain the columns named by the ',
           '"sequence_col_name", "primer_col_name" and "accession_col_name" ',
           'arguments (defaults: "SEQUENCES", "PRIMER_SET" & "ACCESSION"). ',
           "Missing: ", paste(absent, collapse = ", "), ".", call. = FALSE)
    }
    invisible(TRUE)
  }

  if (inherits(data, c("data.frame", "tbl", "tbl_df"))) {
    # Validate before splitting: split() itself needs the primer column.
    check_columns(list(data))
    # drop = TRUE discards unused factor levels, which would otherwise yield
    # empty primer sets that cannot be aligned.
    data <- split(data, data[[primer_col_name]], drop = TRUE)
  }

  if (!is.list(data) || length(data) == 0L) {
    stop("`data` contains no sequences to align.", call. = FALSE)
  }
  if (is.null(names(data))) {
    stop("When `data` is a list, its elements must be named after their ",
         "primer set.", call. = FALSE)
  }
  # Validate every primer set up front so nothing is aligned or written
  # before a malformed element is detected.
  check_columns(data)

  # Label of a primer set: text before the first "_" of the element name.
  set_labels <- word(names(data), 1, sep = fixed("_"))

  aligned_sets <- vector("list", length(data))

  for (i in seq_along(data)) {

    dna_sequence <- DNAStringSet(data[[i]][[sequence_col_name]])
    names(dna_sequence) <- as.character(data[[i]][[accession_col_name]])

    if (length(dna_sequence) == 1L) {

      # Nothing to align against: keep the lone sequence as is.
      aligned_text <- paste(dna_sequence)

    } else {

      # Flat guide tree: every sequence is a leaf (height 0) directly under a
      # single root (height 0.5), leaves ordered from longest to shortest.
      # NOTE(review): presumably supplied so AlignSeqs() does not have to
      # build its own guide tree -- confirm against the DECIPHER docs.
      guide_tree <- lapply(order(width(dna_sequence), decreasing = TRUE), function(x) {
        attr(x, "height") <- 0
        attr(x, "label") <- names(dna_sequence)[x]
        attr(x, "members") <- 1L
        attr(x, "leaf") <- TRUE
        x
      })
      attr(guide_tree, "height") <- 0.5
      attr(guide_tree, "members") <- length(dna_sequence)
      class(guide_tree) <- "dendrogram"

      aligned_sequences <- AlignSeqs(dna_sequence, guideTree = guide_tree,
                                     iterations = 0, refinements = 0,
                                     verbose = TRUE, processors = NULL, ...)

      cat(paste0("Alignment for ", set_labels[i], " DONE\n"))
      cat("\n")

      aligned_text <- paste(aligned_sequences)

    }

    aligned_sets[[i]] <- tibble(ACCESSION = names(dna_sequence),
                                PRIMER_SET = set_labels[i],
                                ALIGNED_SEQUENCES = aligned_text)

    if (progressive_saving && length(data) > 1L) {
      write.csv(aligned_sets[[i]],
                paste0(set_labels[i], "_", global_name, "_aligned.csv"),
                row.names = FALSE)
    }

  }

  # Bind once instead of growing the result inside the loop.
  final_output <- do.call(rbind, aligned_sets)

  if (length(data) > 1L) {
    write.csv(final_output, paste0("all_", global_name, "_aligned.csv"),
              row.names = FALSE)
  } else if (!progressive_saving) {
    # NOTE(review): unlike the per-set files above, this name has no "_"
    # between the primer set and `global_name`, and nothing is written for a
    # single set when `progressive_saving` is TRUE. Both are kept unchanged
    # for backward compatibility -- confirm whether they are intended.
    write.csv(final_output, paste0(set_labels[1L], global_name, "_aligned.csv"),
              row.names = FALSE)
  }

  tibble(final_output)

}
# Source: Eliot-RUIZ/eDNAevaluation (documentation built on Dec. 17, 2021, 6:25 p.m.)