R/sequence_selector_and_aligner.R

Defines functions sequence_selector_and_aligner

Documented in sequence_selector_and_aligner

#' @title Quality check graphics for DNA Barcodes
#'
#' @name  sequence_selector_and_aligner
#'
#' @description This function performs an alignment using DECIPHER
#'
#' @param input_fasta_files_user fasta file with sequences to be align
#'
#' @param input_fasta_reference_file_user an optional fasta file with a sequence
#'                                        to be used as reference for a pairwise
#'                                        alignment. All parameter related with
#'                                        pairwise alignment are aligment against
#'                                        a reference sequence(s).
#'                                        do_pairwiseAlignment_user  must be TRUE
#' @param reference_taxa_name_user a optional sequence name(s) to be selected
#'                                 as reference reference sequence in
#'                                 input_fasta_reference_file_user
#' @param seqinr_seqtype_user Charater indicating if sequence data are "DNA
#'                            or "AA"
#' @param do_pairwiseAlignment_user Logical. If it is TRUE performs a pairwise
#'                                  alignment with reference sequence in
#'                                  input_fasta_reference_file_user
#' @param pairwiseAlignment_type_user Character indicating the type pairwise alignment
#'                   (see Biostrings::pairwiseAlignment function), example "global"
#' @param nucleotideSubstitutionMatrix_match_user Numeric. indicating the
#'                   parameter match in pairwise alignment (see
#'                   Biostrings::nucleotideSubstitutionMatrix).
#' @param nucleotideSubstitutionMatrix_mismatch_user Numeric indicating the
#'                   parameter mismatch in pairwise alignment (see
#'                    Biostrings::nucleotideSubstitutionMatrix).
#' @param nucleotideSubstitutionMatrix_baseOnly_user Numeric indicating the
#'                   parameter baseOnly in pairwise alignment (See
#'                   Biostrings::nucleotideSubstitutionMatrix).
#' @param nucleotideSubstitutionMatrix_type_user Numeric indicating the
#'                   parameter type in pairwise alignment (See
#'                   Biostrings::nucleotideSubstitutionMatrix).
#' @param pairwiseAlignment_gapOpening_user Numeric. indicating the
#'                   parameter gapOpening in  pairwise alignment (See
#'                    Biostrings::pairwiseAlignment)
#' @param pairwiseAlignment_gapExtension_user Numeric indicating the
#'                   parameter gapExtension in pairwise alignment (See
#'                    Biostrings::pairwiseAlignment)
#' @param order_by_pairwiseAlignment_user Character indicating the criterion to
#'                   order the sequence in the pairwise alignment. Example:
#'                   "nmatch" or based in "dissimilarity_distance"
#' @param pairwiseAlignment_filter_condition_user Character indicating the
#'                   Criterion to filter the sequence in the pairwise alignment.
#'                   Example: "nmatch" or based in "dissimilarity_distance"
#' @param filter_condition_value_user Numeric indicating the filter value in
#'                   pairwiseAlignment_filter_condition_user
#' @param filter_condition_operator_user Character indicating if the filter is
#'                   performed to values higher "more" than
#'                   filter_condition_value_user or "less" than
#'                   filter_condition_value_user
#' @param retain_top_n_seq_user Number of sequence to keep based filter condiction
#'                              against reference sequence in pairwise alignment
#' @param degap_before_multialign_user Logical. True remove all gaps in sequence
#'                              in pairwise alignment
#'
#' @param do_multipleAligment_user Logical. True perform a multiple alignment
#'                                 with differents aligners
#'                                 ("clustalo", "mafft", "DECIPHER")
#' @param aligner_name_user Character indicating the aligner to use if
#'                           do_multipleAligment_user is TRUE
#'                          ("clustalo", "mafft", "DECIPHER")
#' @param aligner_bin_location_user path directory where clustal and mafft
#'                                  bin folder are localetd
#' @param aligner_quiet_user Logical indicating whether to display  alignment
#'                           progress
#' @param clustal_n_threads_user Numeric indicating number of cores to be used
#'                               in clustal alignment
#'                               (if proper parameters are activated)
#'
#' @param muscle_n_iter_user Numeric indicating Maximum number of iterations in
#'                           muscle alignment (if proper parameters are activated)
#' @param mafft_accuracy_method_user Character indicating Accuracy-oriented
#'                                   methods in maftt alignment.
#'                                   (if proper parameters are activated)
#' @param mafft_retree_user Numeric indicating the alignment strategies in mafft
#'                          alignment. FFT-NS-1 (1) or FFT-NS-2 (2).
#' @param mafft_maxiterate_user Numeric indicating the Iterative refinement method
#'                              (max. 1000) in maftt alignment.
#' @param mafft_gap_opening_penalty_user Numeric indicating Gap opening penalty
#'                                       at group-to-group alignment in maftt
#'                                       alignment.
#' @param mafft_gap_extension_penalty_user Numeric indicating gap extension
#'                                         penalty maftt alignment.
#' @param decipher_iterations_user Number of iteration to perform in DECIPHER
#'                                 alignment. See iterations parameter in
#'                                 DECIPHER::AlignSeqs
#' @param decipher_refinements_user Number of refinement to perform in DECIPHER
#'                                  See refinements parameter in
#'                                  DECIPHER::AlignSeqs
#' @param decipher_gapOpening_user Numeric indicating cost for opening a gap in
#'                                 the DECIPHER alignment. See gapOpening
#'                                 parameter in DECIPHER::AlignSeqs
#' @param decipher_gapExtension_user Numeric indicating cost for extending an
#'                                   open gap in the  DECIPHER alignment. See
#'                                   gapExtension parameter in DECIPHER::AlignSeqs.
#' @param decipher_useStructures_user Logical indicating whether to use
#'                                    secondary structure predictions during
#'                                    DECIPHER alignment. See UseStructures
#'                                    parameter in DECIPHER::AlignSeqs.
#' @param decipher_structures_user Character indicating list of secondary
#'                                 structure probabilities for
#'                                 decipher_useStructures_user. see DECIPHER::AlignSeqs
#'                                 for more details.
#' @param decipher_levels_user Numeric with eight elements specifying the levels
#'                             at which to trigger events. see DECIPHER::AlignSeqs
#'                                 for more details.
#' @param file_out_user Character to be added in the output file
#' @param out_directory_user output directory
#'
#' @return
#' @export
#'
#' @importFrom seqinr "read.fasta"
#' @importFrom Biostrings "readAAMultipleAlignment"
#' @importFrom Biostrings "writeXStringSet"
#' @importFrom Biostrings "DNAStringSet"
#' @importFrom Biostrings "reverseComplement"
#' @importFrom Biostrings "AAStringSet"
#' @importFrom Biostrings "pairwiseAlignment"
#' @importFrom Biostrings "nindel"
#' @importFrom Biostrings "pid"
#' @importFrom Biostrings "nmatch"
#' @importFrom Biostrings "nmismatch"
#' @importFrom Biostrings "nedit"
#' @importFrom Biostrings "alignedSubject"
#' @importFrom Biostrings "xscat"
#' @importFrom Biostrings "pairwiseAlignment"
#' @importFrom Biostrings "DNAStringSet"
#' @importFrom Biostrings "writeXStringSet"
#' @importFrom Biostrings "writeXStringSet"
#' @importFrom Biostrings "DNAStringSet"
#' @importFrom Biostrings "AAStringSet"
#' @importFrom DECIPHER "AlignSeqs"
#' @importFrom DECIPHER "DistanceMatrix"
#' @importFrom DECIPHER "RemoveGaps"
#' @importFrom seqinr "read.fasta"
#' @importFrom utils "write.table"

#' @examples
#'
#'
#'
#'
#'
#'
#'
#'
sequence_selector_and_aligner <- function( input_fasta_files_user,
                                  input_fasta_reference_file_user = NULL,
                                        reference_taxa_name_user = NULL,
                                        seqinr_seqtype_user = c("DNA","AA"),
                                   do_pairwiseAlignment_user = TRUE,
                                   pairwiseAlignment_type_user = "global",
             nucleotideSubstitutionMatrix_match_user = 1,
                    nucleotideSubstitutionMatrix_mismatch_user = -3,
                    nucleotideSubstitutionMatrix_baseOnly_user = FALSE,
nucleotideSubstitutionMatrix_type_user = "DNA",
pairwiseAlignment_gapOpening_user = 5,
pairwiseAlignment_gapExtension_user = 2,
order_by_pairwiseAlignment_user = c("nmatch", "dissimilarity_distance"),
pairwiseAlignment_filter_condition_user = "nmatch",
filter_condition_value_user = 300,
filter_condition_operator_user = "more",
retain_top_n_seq_user = NULL,
degap_before_multialign_user = TRUE,
do_multipleAligment_user = TRUE,
aligner_name_user = c("clustalo", "mafft", "DECIPHER"),
aligner_bin_location_user = NULL,
aligner_quiet_user = FALSE,
clustal_n_threads_user = 1,
muscle_n_iter_user = 16,
mafft_accuracy_method_user = NULL,
mafft_retree_user = 2,
mafft_maxiterate_user = 1000,
mafft_gap_opening_penalty_user = 1.53,
mafft_gap_extension_penalty_user = 0.123,
decipher_iterations_user = 3,
decipher_refinements_user = 2,
decipher_gapOpening_user = c(-18,-16),
decipher_gapExtension_user = c(-2, -1),
decipher_useStructures_user = TRUE,
decipher_structures_user = NULL,
decipher_levels_user = c(0.9, 0.7, 0.7, 0.4, 10, 5, 5, 2),
file_out_user = NULL,
out_directory_user = NULL ) {


# user input

sequence_fasta_file <- input_fasta_files_user
ref_fasta_file <- input_fasta_reference_file_user
ref_taxa_name_pattern <- reference_taxa_name_user

is_pairwiseAlignment <- do_pairwiseAlignment_user
pairwiseAlignment_type <- pairwiseAlignment_type_user
mat_pairwiseAlignment <- Biostrings::nucleotideSubstitutionMatrix(match = nucleotideSubstitutionMatrix_match_user,
                                                             mismatch = nucleotideSubstitutionMatrix_mismatch_user,
                                                             baseOnly = nucleotideSubstitutionMatrix_baseOnly_user,
                                                                 type = nucleotideSubstitutionMatrix_type_user)
pairwiseAlignment_gapOpening <- pairwiseAlignment_gapOpening_user
pairwiseAlignment_gapExtension <- pairwiseAlignment_gapExtension_user
order_by <- order_by_pairwiseAlignment_user
filter_condition <- pairwiseAlignment_filter_condition_user
condition_value <- filter_condition_value_user
condition_operator <- filter_condition_operator_user
retain_top_n_seq <- retain_top_n_seq_user
degap_before_multialign <- degap_before_multialign_user

do_multipleAligment <- do_multipleAligment_user
aligner_name <- aligner_name_user
aligner_bin_location <- aligner_bin_location_user
aligner_quiet <- aligner_quiet_user

clustal_n_threads <- clustal_n_threads_user
muscle_n_iter <- muscle_n_iter_user

mafft_accuracy_method <- mafft_accuracy_method_user
mafft_retree <- mafft_retree_user
mafft_maxiterate <- mafft_maxiterate_user
mafft_gap_opening_penalty <- mafft_gap_opening_penalty_user
mafft_gap_extension_penalty <- mafft_gap_extension_penalty_user

decipher_iterations <- decipher_iterations_user
decipher_refinements <- decipher_refinements_user
decipher_gapOpening <- decipher_gapOpening_user
decipher_gapExtension <- decipher_gapExtension_user
decipher_useStructures <- decipher_useStructures_user
decipher_structures <- decipher_structures_user
decipher_levels <- decipher_levels_user

seqinr_seqtype <- seqinr_seqtype_user

file_out <- file_out_user

## outdir arguments

if(is.null(out_directory_user)) {
                 out_directory <- getwd()
                                } else {
                 setwd(out_directory_user)
                 out_directory <- getwd()
                                }

#### one or combine many fasta

if(length(sequence_fasta_file) > 1) {

one_file_name_1 <- sub(".*/", "", sequence_fasta_file[1])
one_file_name <- gsub("(.*)[.].*","\\1",one_file_name_1)
one_file_name <- paste0(one_file_name, "_combination_of_", length(sequence_fasta_file), ".fasta")

if (file.exists(one_file_name)) { file.remove(one_file_name) }

for(i in 1:length(sequence_fasta_file)) {
                              one_file <- readLines(sequence_fasta_file[i])
                              one_file <- c(one_file, "\n")
                         write(one_file,file=one_file_name,append=TRUE)
                                        }
sequence_fasta_file <- one_file_name
                                       }


########  get input sequence name

sequence_file_for_naming_long <- sub(".*/", "", sequence_fasta_file)
sequence_file_for_naming <- gsub("(.*)[.].*","\\1",sequence_file_for_naming_long)

############ read fasta file

## source sequence

source_sequence_list <- seqinr::read.fasta(file = sequence_fasta_file,
                                                    seqtype = seqinr_seqtype,
                                             set.attributes = FALSE,
                                                 apply.mask = FALSE,
                                               whole.header = TRUE)

source_sequence_vector <- lapply(source_sequence_list, paste, collapse = "")
source_sequence_vector <- toupper(source_sequence_vector)
names(source_sequence_vector) <- names(source_sequence_list)

######################################################
############ pairwise alignment evaluation

if(is_pairwiseAlignment == TRUE) {

## ref sequence

ref_sequence_list <- seqinr::read.fasta(file = ref_fasta_file,
                                                    seqtype = seqinr_seqtype,
                                             set.attributes = FALSE,
                                                 apply.mask = FALSE,
                                               whole.header = TRUE)

## select references

if(!is.null(ref_taxa_name_pattern)) {
                select_ref_sequence <- ref_sequence_list[grep(ref_taxa_name_pattern,names(ref_sequence_list))]
                                    } else {
                select_ref_sequence <- ref_sequence_list
                                    }

## iterate on references

constructed_best_DNAstringset_sets <- list()
constructed_best_DNAstringset_sets_df <- list()

for(uu in 1:length(select_ref_sequence)) {

# uu <- 2

one_ref_sequence <- select_ref_sequence[uu]
one_ref_vec <- as.character(lapply(one_ref_sequence, paste, collapse = ""))
one_ref_vec <- toupper(one_ref_vec)
names(one_ref_vec) <- names(one_ref_sequence)

# combine ro single string

ref_source_vector <- c(one_ref_vec, source_sequence_vector)

if(seqinr_seqtype == "DNA") {
      ref_source_string_for <- Biostrings::DNAStringSet(ref_source_vector, use.names=TRUE)
      ref_source_string_rev <- Biostrings::reverseComplement(ref_source_string_for)
                            }

if(seqinr_seqtype == "AA") {
         ref_source_string <- Biostrings::AAStringSet(ref_source_vector, use.names=TRUE)
                            }

# function to move from XstringSet to data.frame

dss2df <- function(dss) data.frame(width=width(dss), seq=as.character(dss), names=names(dss), stringsAsFactors = FALSE)

######################################## pairwise aligment -- Biostrings

# iterate sequences to find

counter_ref <- length(ref_source_vector)

best_DNAstring_df_list <- list()
counter <- 0

ref_one_seq <- ref_source_string_for[1]

cat("\n***** reference sequence used is: \n")
print(ref_one_seq)

cat("\n******* processing sequence -- \n")

for(i in 2:counter_ref) {
                    # i <- 8
                  counter <- counter + 1

       # forward
             target_one_seq_for <- ref_source_string_for[i]
          aligned_ref_one_seq_f <- try(Biostrings::pairwiseAlignment(ref_one_seq, target_one_seq_for, type = pairwiseAlignment_type,
                                                                         substitutionMatrix = mat_pairwiseAlignment,
                                                                                 gapOpening = pairwiseAlignment_gapOpening,
                                                                               gapExtension = pairwiseAlignment_gapExtension), silent = TRUE)
          if(class(aligned_ref_one_seq_f) == "try-error") {
                   cat("********* SEQUENCE HAS AMBIGUITIES THAT CANNOT BE HANDLED BY THE Biostrings::nucleotideSubstitutionMatrix -- check sequence ********* ")
               print(target_one_seq_for)
               cat("\n")
               problematic_seq <- as.character(target_one_seq_for)
               names(problematic_seq) <- NULL
               best_seq_ref <- data.frame(ref_name = names(one_ref_sequence),
                                      dissimilarity_distance = 1,
                                                      nmatch = NA,
                                                  seq_name = names(target_one_seq_for),
                                                   align_score = NA,
                                     levenshtein_edit_distance = NA,
                                     percent_sequence_identity = NA,
                                                     nmismatch = NA,
                                              insertion_length = NA,
                                            insertion_widthsum = NA,
                                               deletion_length = NA,
                                             deletion_widthsum = NA,
                                                   seq_reverse = NA,
                                                      sequence = problematic_seq,
                                              stringsAsFactors = FALSE)

           best_DNAstring_df_list[[counter]] <- best_seq_ref
                                       next
                                                        }

                  forward_score <- aligned_ref_one_seq_f@score
                 forward_nindel <- Biostrings::nindel(aligned_ref_one_seq_f)
       forward_insertion_length <- forward_nindel@insertion[1]
     forward_insertion_widthsum <- forward_nindel@insertion[2]
        forward_deletion_length <- forward_nindel@deletion[1]
      forward_deletion_widthsum <- forward_nindel@deletion[2]
                    forward_pid <- Biostrings::pid(aligned_ref_one_seq_f)
                 forward_nmatch <- Biostrings::nmatch(aligned_ref_one_seq_f)
              forward_nmismatch <- Biostrings::nmismatch(aligned_ref_one_seq_f)
                  forward_nedit <- Biostrings::nedit(aligned_ref_one_seq_f)
                forward_aligned <- Biostrings::alignedSubject(aligned_ref_one_seq_f)
           forward_aligned_char <- as.character(forward_aligned)
    names(forward_aligned_char) <- NULL
        pattern_forward_aligned <- Biostrings::xscat(c(Biostrings::alignedPattern(aligned_ref_one_seq_f), Biostrings::alignedSubject(aligned_ref_one_seq_f)))
     forward_aligned_distance_m <- DECIPHER::DistanceMatrix(pattern_forward_aligned,
                                                            includeTerminalGaps = FALSE,
                                                          penalizeGapGapMatches = TRUE,
                                                                     processors = NULL,
                                                                        verbose = FALSE)
     forward_aligned_distance_m <- round(forward_aligned_distance_m, digits = 5)
       forward_aligned_distance <- forward_aligned_distance_m[1,2]

cat(names(forward_aligned), "\n")

       # reverse
               target_one_seq_rev <- ref_source_string_rev[i]
          aligned_ref_one_seq_r <- Biostrings::pairwiseAlignment(ref_one_seq, target_one_seq_rev, type = pairwiseAlignment_type,
                                                                         substitutionMatrix = mat_pairwiseAlignment,
                                                                                 gapOpening = pairwiseAlignment_gapOpening,
                                                                               gapExtension = pairwiseAlignment_gapExtension)
                  reverse_score <- aligned_ref_one_seq_r@score
                 reverse_nindel <- Biostrings::nindel(aligned_ref_one_seq_r)
       reverse_insertion_length <- reverse_nindel@insertion[1]
     reverse_insertion_widthsum <- reverse_nindel@insertion[2]
        reverse_deletion_length <- reverse_nindel@deletion[1]
      reverse_deletion_widthsum <- reverse_nindel@deletion[2]
                    reverse_pid <- Biostrings::pid(aligned_ref_one_seq_r)
                 reverse_nmatch <- Biostrings::nmatch(aligned_ref_one_seq_r)
              reverse_nmismatch <- Biostrings::nmismatch(aligned_ref_one_seq_r)
                  reverse_nedit <- Biostrings::nedit(aligned_ref_one_seq_r)
                reverse_aligned <- Biostrings::alignedSubject(aligned_ref_one_seq_r)
           reverse_aligned_char <- as.character(reverse_aligned)
    names(reverse_aligned_char) <- NULL
          pattern_reverse_aligned <- Biostrings::xscat(c(Biostrings::alignedPattern(aligned_ref_one_seq_r), Biostrings::alignedSubject(aligned_ref_one_seq_r)))
     reverse_aligned_distance_m <- DECIPHER::DistanceMatrix(pattern_reverse_aligned,
                                                            includeTerminalGaps = FALSE,
                                                          penalizeGapGapMatches = TRUE,
                                                                     processors = NULL,
                                                                        verbose = FALSE)
     reverse_aligned_distance_m <- round(reverse_aligned_distance_m, digits = 5)
       reverse_aligned_distance <- reverse_aligned_distance_m[1,2]

        # which choose forward or reverse

        if(forward_nmatch >= reverse_nmatch) {

                             best_seq_ref <- data.frame(ref_name = names(one_ref_sequence),
                                        dissimilarity_distance = forward_aligned_distance,
                                                        nmatch = forward_nmatch,
                                                      seq_name = names(forward_aligned),
                                                   align_score = forward_score,
                                     levenshtein_edit_distance = forward_nedit,
                                     percent_sequence_identity = forward_pid,
                                                     nmismatch = forward_nmismatch,
                                              insertion_length = forward_insertion_length,
                                            insertion_widthsum = forward_insertion_widthsum,
                                               deletion_length = forward_deletion_length,
                                             deletion_widthsum = forward_deletion_widthsum,
                                                   seq_reverse = "no",
                                                      sequence = forward_aligned_char,
                                              stringsAsFactors = FALSE)

           best_DNAstring_df_list[[counter]] <- best_seq_ref

                                          } else {

                    best_seq_ref <- data.frame(ref_name = names(one_ref_sequence),
                                        dissimilarity_distance = reverse_aligned_distance,
                                                        nmatch = reverse_nmatch,
                                                      seq_name = names(reverse_aligned),
                                                   align_score = reverse_score,
                                     levenshtein_edit_distance = reverse_nedit,
                                     percent_sequence_identity = reverse_pid,
                                                     nmismatch = reverse_nmismatch,
                                              insertion_length = reverse_insertion_length,
                                            insertion_widthsum = reverse_insertion_widthsum,
                                               deletion_length = reverse_deletion_length,
                                             deletion_widthsum = reverse_deletion_widthsum,
                                                   seq_reverse = "yes",
                                                      sequence = reverse_aligned_char,
                                              stringsAsFactors = FALSE)

           best_DNAstring_df_list[[counter]] <- best_seq_ref
                                         }
                         }
                      rm(i)
                      rm(counter)
                      cat("\n *** DONE *** \n")

## create df of sequences

best_DNAstring_df <- do.call(rbind, best_DNAstring_df_list)

## order by nmatch or dissimilarity_distance

if(order_by == "nmatch") {

best_DNAstring_df <- best_DNAstring_df[order(best_DNAstring_df$nmatch, decreasing = TRUE),]

                          }

 if(order_by == "dissimilarity_distance") {

best_DNAstring_df <- best_DNAstring_df[order(best_DNAstring_df$dissimilarity_distance, decreasing = FALSE),]
                          }


# filter if requested based on condition


if(!is.null(filter_condition)) {

  if(condition_operator == "more") {
            best_DNAstring_filter <- subset(best_DNAstring_df, best_DNAstring_df[,filter_condition] >= condition_value)
                                   } else {
            best_DNAstring_filter <- subset(best_DNAstring_df, best_DNAstring_df[,filter_condition] <= condition_value)
                                   }
                               } else {
              best_DNAstring_filter <- best_DNAstring_df
                               }

rownames(best_DNAstring_filter) <- NULL

# retain_top_n_seq

if(!is.null(retain_top_n_seq)) {
            retain_top_n_seq <- ifelse(nrow(best_DNAstring_filter) >= retain_top_n_seq, retain_top_n_seq,  nrow(best_DNAstring_filter))
       best_DNAstring_filter <- best_DNAstring_filter[1:retain_top_n_seq,]
                               }

# construct DNAstring

              best_DNAstring_seq <- best_DNAstring_filter$sequence
              names(best_DNAstring_seq) <- best_DNAstring_filter$seq_name
              best_DNAstring_seq <- c(one_ref_vec, best_DNAstring_seq)

# remove na rows
             best_DNAstring_seq <- best_DNAstring_seq [!is.na(best_DNAstring_seq)]

             best_DNAstringset <- Biostrings::DNAStringSet(best_DNAstring_seq, use.names=TRUE)

cat("\n****** Total number of sequences retained are -- ", length(best_DNAstring_seq) - 1, "\n")

# write data.frame for information

file_fasta_df_name <- paste0(sequence_file_for_naming,"_ref_", names(one_ref_sequence),"_df_info_seq.txt")

utils::write.table(best_DNAstring_df, file = file_fasta_df_name, sep = "\t", row.names = FALSE)

# write fasta file

file_fasta_name <- paste0(sequence_file_for_naming,"_ref_", names(one_ref_sequence), "_biostrings_out.fasta")

if(degap_before_multialign == TRUE) {
                  best_DNAstringset <- DECIPHER::RemoveGaps(best_DNAstringset, removeGaps = "all", processors = NULL)
                                    }

Biostrings::writeXStringSet(best_DNAstringset, filepath = file_fasta_name, append=FALSE, compress=FALSE, compression_level=NA, format="fasta")


cat("\n******  fasta file written as -- ", file_fasta_name, " -- \n")

constructed_best_DNAstringset_sets[[uu]] <- best_DNAstringset
constructed_best_DNAstringset_sets_df[[uu]] <- best_DNAstring_df

                                      }
                                      rm(uu)

# need to combine all in a single fasta file for multialigner

master_DNAstringset <- do.call(c, constructed_best_DNAstringset_sets)
file_fasta_name <- paste0(sequence_file_for_naming,"_master_", length(master_DNAstringset@ranges@group), "_seq_biostrings_out.fasta")
Biostrings::writeXStringSet(master_DNAstringset, filepath = file_fasta_name, append=FALSE, compress=FALSE, compression_level=NA, format="fasta")

best_DNAstring_df <- do.call(rbind, constructed_best_DNAstringset_sets_df)
file_fasta_name_df <- paste0(sequence_file_for_naming,"_master_", nrow(best_DNAstring_df), "_df_info_seq.txt")
write.table(best_DNAstring_df, file = file_fasta_name_df, sep = "\t", row.names = FALSE)

# for consistency for decipher

best_DNAstringset <- master_DNAstringset

# collecting things to return

seq_items_return <- list(best_DNAstring_df, master_DNAstringset)


                                    } else { # else if not pairwaise alignment

 if(seqinr_seqtype == "DNA") {
         best_DNAstringset <- Biostrings::DNAStringSet(source_sequence_vector, use.names=TRUE)
                            }

if(seqinr_seqtype == "AA") {
         best_DNAstringset <-  Biostrings::AAStringSet(source_sequence_vector, use.names=TRUE)
                            }

cat("\n****** Total number of sequences retained are -- ", length(source_sequence_vector), "\n")

# write fasta file

file_fasta_name <- paste0(sequence_file_for_naming, "_biostrings_out.fasta")

if(degap_before_multialign == TRUE) {
                  best_DNAstringset <- DECIPHER::RemoveGaps(best_DNAstringset, removeGaps = "all", processors = NULL)
                                    }

Biostrings::writeXStringSet(best_DNAstringset, filepath = file_fasta_name, append=FALSE, compress=FALSE, compression_level=NA, format="fasta")

cat("\n******  fasta file written as -- ", file_fasta_name, " -- \n")


                                    }

############ END: pairwise alignment evaluation
####################################################################################

####################################################################################
############ multiple alignment evaluation

if(do_multipleAligment == TRUE) {

list_aligner_scripts <- list()
name_of_aligner_analyses <- character()
name_of_align_file <- character()
counter <- 0

#############          do clustalo
   if("clustalo" %in% aligner_name) {

          counter <- counter + 1

         clustalo_out_file_fasta <- paste0(sequence_file_for_naming, "_clustalo_out.fasta")

# prepare clustalo commands

                            line_clustalo <- paste0("clustalo   -i ", file_fasta_name, "   -o ", clustalo_out_file_fasta,
                              " --threads=", clustal_n_threads," --auto --force -v")

if(aligner_quiet) { line_clustalo <- paste0("clustalo   -i ", file_fasta_name, "   -o ", clustalo_out_file_fasta,
                              " --threads=", clustal_n_threads," --auto --force")}

        name_of_aligner_analyses[counter] <- paste0("clustalo_", counter)
             name_of_align_file [counter] <- clustalo_out_file_fasta

# all lines together
        list_aligner_scripts[[counter]] <- paste0(line_clustalo, collapse = " ")

                                  }

#############          do muscle -- DOES NOT WORK

   if("muscle" %in% aligner_name) {

          counter <- counter + 1

         muscle_out_file_fasta <- paste0(sequence_file_for_naming, "_muscle_out.fasta")

# prepare muscle commands

                            line_muscle <- paste0("muscle   -in ", file_fasta_name, " -out ", muscle_out_file_fasta,
                             " -maxiters ", muscle_n_iter)

if(aligner_quiet) { line_muscle <- paste0("muscle   -in ", file_fasta_name,  " -out ", muscle_out_file_fasta,
                             " -maxiters ", muscle_n_iter, " -quiet")}

        name_of_aligner_analyses[counter] <- paste0("muscle_", counter)
             name_of_align_file [counter] <- muscle_out_file_fasta

# all lines together
        list_aligner_scripts[[counter]] <- paste0(line_muscle, collapse = " ")

                                  }

#############          do mafft

   if("mafft" %in% aligner_name) {

          counter <- counter + 1
         mafft_out_file_fasta <- paste0(sequence_file_for_naming, "_mafft_out.fasta")

# prepare muscle commands

list_of_mafft <- character()
list_of_mafft[1] <- paste0("mafft.bat ")
counter_mafft <- 1

# mafft_accuracy_method
if(!is.null(mafft_accuracy_method)) {counter_mafft <- counter_mafft + 1; out_line <- paste0(" --", mafft_accuracy_method, " "); list_of_mafft[counter_mafft] <- out_line}

# common mafft parameters

if(!is.null(mafft_retree)) {counter_mafft <- counter_mafft + 1; out_line <- paste0(" --retree ", mafft_retree, " "); list_of_mafft[counter_mafft] <- out_line}
if(!is.null(mafft_maxiterate)) {counter_mafft <- counter_mafft + 1; out_line <- paste0(" --maxiterate ", mafft_maxiterate, " "); list_of_mafft[counter_mafft] <- out_line}

# penalty parameters:

counter_mafft <- counter_mafft + 1; out_line <- paste0(" --op ", mafft_gap_opening_penalty, " "); list_of_mafft[counter_mafft] <- out_line
counter_mafft <- counter_mafft + 1; out_line <- paste0(" --ep ", mafft_gap_extension_penalty, " "); list_of_mafft[counter_mafft] <- out_line

# quiet

if(aligner_quiet) {counter_mafft <- counter_mafft + 1; out_line <- paste0(" --quiet "); list_of_mafft[counter_mafft] <- out_line}

# end

counter_mafft <- counter_mafft + 1; out_line <- paste0(file_fasta_name, " > ", mafft_out_file_fasta); list_of_mafft[counter_mafft] <- out_line

# name of run

name_of_aligner_analyses[counter] <- paste0("mafft_", counter)
name_of_align_file [counter] <- mafft_out_file_fasta

# all lines together

list_aligner_scripts[[counter]] <- paste0(list_of_mafft, collapse = " ")

                                  }


#####################################################################################
#### open and run aligner from R

## add to return list

if(exists("seq_items_return")) {
                       counter_3 <- length(seq_items_return)
                               } else {
                  seq_items_return <- list()
                         counter_3 <- 0
                               }

# create vector to export path -- export PATH=/opt/local/bin:/opt/local/sbin:$PATH

export_path_align_bins <- normalizePath(aligner_bin_location)
export_path_align_vector <- paste0("export PATH=",export_path_align_bins,":$PATH")

## prank execute function

execute_aligner_terminal <- function(export_path_section = export_path_align_vector,
                                   aligner_all_commands,
                                   intern = FALSE,
                                    wait = FALSE){

system(paste0(export_path_section, "\n",
                    aligner_all_commands, "\n"),
                                intern = intern,
                                  wait = wait)
                                                 }


## run list elements with aligner commands


cat("\n****** started serial analisis of: ", length(list_aligner_scripts), "\n" )

if(any(c("clustalo","mafft", "muscle") %in% aligner_name)) {

for(j in 1:length(list_aligner_scripts)) {

cat("***** aligner analysis name: ",name_of_aligner_analyses[j], "\n")
cat("***** aligner commands: ", list_aligner_scripts[[j]], "\n")

execute_aligner_terminal(export_path_section = export_path_align_vector ,
                             aligner_all_commands = list_aligner_scripts[[j]],
                                    intern = FALSE,
                                      wait = TRUE)

cat("\n***** DONE ***** \n")

Sys.sleep(5)

counter_3 <- counter_3 + 1

if(seqinr_seqtype == "AA") {
  seq_items_return[[counter_3]] <- Biostrings::readAAMultipleAlignment(name_of_align_file[j], format = "fasta")
                            }

if(seqinr_seqtype == "DNA") {
  seq_items_return[[counter_3]] <- Biostrings::readDNAMultipleAlignment(name_of_align_file[j], format = "fasta")
                            }
                                        }
                                        rm(j)

                                                        }

#############          do DECIPHER

   if("DECIPHER" %in% aligner_name) {

# decipher requires to remove gaps ("-")

best_DNAstringset_decipher <- DECIPHER::RemoveGaps(best_DNAstringset, removeGaps = "all", processors = NULL)

cat("***** aligner analysis name: DECIPHER \n\n")

decipher_alignment <- DECIPHER::AlignSeqs(best_DNAstringset_decipher,
                                                guideTree = NULL,
                                               iterations = decipher_iterations,
                                              refinements = decipher_refinements,
                                               gapOpening = decipher_gapOpening,
                                             gapExtension = decipher_gapExtension,
                                            useStructures = decipher_useStructures,
                                               structures = decipher_structures,
                                                      FUN = AdjustAlignment,
                                                   levels = decipher_levels,
                                                 alphabet = AA_REDUCED[[1]],
                                               processors = NULL,
                                                  verbose = !aligner_quiet)

# write fasta file

file_fasta_name <- paste0(sequence_file_for_naming, "_decipher_out.fasta")

Biostrings::writeXStringSet(decipher_alignment, filepath = file_fasta_name, append=FALSE, compress=FALSE, compression_level=NA, format="fasta")

cat("\n******  DECIPHER alignment fasta file written as -- ", file_fasta_name, " -- \n")

# add stringset to list

counter_3 <- counter_3 + 1

seq_items_return[[counter_3]] <- decipher_alignment


                                       }


                         }

############ END: multiple alignment evaluation
####################################################################################


setwd(out_directory)

cat("\n\n *************      alignments and pairwise analyses DONE        ************* \n")

if(exists("seq_items_return")) {
                       return(seq_items_return)
                               }


                                           }
carvajalc/BarcoR documentation built on Dec. 19, 2021, 1:54 p.m.