#' @title Quality check graphics for DNA Barcodes
#'
#' @name sequence_selector_and_aligner
#'
#' @description This function performs an alignment using DECIPHER
#'
#' @param input_fasta_files_user fasta file with sequences to be align
#'
#' @param input_fasta_reference_file_user an optional fasta file with a sequence
#' to be used as reference for a pairwise
#' alignment. All parameter related with
#' pairwise alignment are aligment against
#' a reference sequence(s).
#' do_pairwiseAlignment_user must be TRUE
#' @param reference_taxa_name_user a optional sequence name(s) to be selected
#' as reference reference sequence in
#' input_fasta_reference_file_user
#' @param seqinr_seqtype_user Charater indicating if sequence data are "DNA
#' or "AA"
#' @param do_pairwiseAlignment_user Logical. If it is TRUE performs a pairwise
#' alignment with reference sequence in
#' input_fasta_reference_file_user
#' @param pairwiseAlignment_type_user Character indicating the type pairwise alignment
#' (see Biostrings::pairwiseAlignment function), example "global"
#' @param nucleotideSubstitutionMatrix_match_user Numeric. indicating the
#' parameter match in pairwise alignment (see
#' Biostrings::nucleotideSubstitutionMatrix).
#' @param nucleotideSubstitutionMatrix_mismatch_user Numeric indicating the
#' parameter mismatch in pairwise alignment (see
#' Biostrings::nucleotideSubstitutionMatrix).
#' @param nucleotideSubstitutionMatrix_baseOnly_user Numeric indicating the
#' parameter baseOnly in pairwise alignment (See
#' Biostrings::nucleotideSubstitutionMatrix).
#' @param nucleotideSubstitutionMatrix_type_user Numeric indicating the
#' parameter type in pairwise alignment (See
#' Biostrings::nucleotideSubstitutionMatrix).
#' @param pairwiseAlignment_gapOpening_user Numeric. indicating the
#' parameter gapOpening in pairwise alignment (See
#' Biostrings::pairwiseAlignment)
#' @param pairwiseAlignment_gapExtension_user Numeric indicating the
#' parameter gapExtension in pairwise alignment (See
#' Biostrings::pairwiseAlignment)
#' @param order_by_pairwiseAlignment_user Character indicating the criterion to
#' order the sequence in the pairwise alignment. Example:
#' "nmatch" or based in "dissimilarity_distance"
#' @param pairwiseAlignment_filter_condition_user Character indicating the
#' Criterion to filter the sequence in the pairwise alignment.
#' Example: "nmatch" or based in "dissimilarity_distance"
#' @param filter_condition_value_user Numeric indicating the filter value in
#' pairwiseAlignment_filter_condition_user
#' @param filter_condition_operator_user Character indicating if the filter is
#' performed to values higher "more" than
#' filter_condition_value_user or "less" than
#' filter_condition_value_user
#' @param retain_top_n_seq_user Number of sequence to keep based filter condiction
#' against reference sequence in pairwise alignment
#' @param degap_before_multialign_user Logical. True remove all gaps in sequence
#' in pairwise alignment
#'
#' @param do_multipleAligment_user Logical. True perform a multiple alignment
#' with differents aligners
#' ("clustalo", "mafft", "DECIPHER")
#' @param aligner_name_user Character indicating the aligner to use if
#' do_multipleAligment_user is TRUE
#' ("clustalo", "mafft", "DECIPHER")
#' @param aligner_bin_location_user path directory where clustal and mafft
#' bin folder are localetd
#' @param aligner_quiet_user Logical indicating whether to display alignment
#' progress
#' @param clustal_n_threads_user Numeric indicating number of cores to be used
#' in clustal alignment
#' (if proper parameters are activated)
#'
#' @param muscle_n_iter_user Numeric indicating Maximum number of iterations in
#' muscle alignment (if proper parameters are activated)
#' @param mafft_accuracy_method_user Character indicating Accuracy-oriented
#' methods in maftt alignment.
#' (if proper parameters are activated)
#' @param mafft_retree_user Numeric indicating the alignment strategies in mafft
#' alignment. FFT-NS-1 (1) or FFT-NS-2 (2).
#' @param mafft_maxiterate_user Numeric indicating the Iterative refinement method
#' (max. 1000) in maftt alignment.
#' @param mafft_gap_opening_penalty_user Numeric indicating Gap opening penalty
#' at group-to-group alignment in maftt
#' alignment.
#' @param mafft_gap_extension_penalty_user Numeric indicating gap extension
#' penalty maftt alignment.
#' @param decipher_iterations_user Number of iteration to perform in DECIPHER
#' alignment. See iterations parameter in
#' DECIPHER::AlignSeqs
#' @param decipher_refinements_user Number of refinement to perform in DECIPHER
#' See refinements parameter in
#' DECIPHER::AlignSeqs
#' @param decipher_gapOpening_user Numeric indicating cost for opening a gap in
#' the DECIPHER alignment. See gapOpening
#' parameter in DECIPHER::AlignSeqs
#' @param decipher_gapExtension_user Numeric indicating cost for extending an
#' open gap in the DECIPHER alignment. See
#' gapExtension parameter in DECIPHER::AlignSeqs.
#' @param decipher_useStructures_user Logical indicating whether to use
#' secondary structure predictions during
#' DECIPHER alignment. See UseStructures
#' parameter in DECIPHER::AlignSeqs.
#' @param decipher_structures_user Character indicating list of secondary
#' structure probabilities for
#' decipher_useStructures_user. see DECIPHER::AlignSeqs
#' for more details.
#' @param decipher_levels_user Numeric with eight elements specifying the levels
#' at which to trigger events. see DECIPHER::AlignSeqs
#' for more details.
#' @param file_out_user Character to be added in the output file
#' @param out_directory_user output directory
#'
#' @return
#' @export
#'
#' @importFrom seqinr "read.fasta"
#' @importFrom Biostrings "readAAMultipleAlignment"
#' @importFrom Biostrings "writeXStringSet"
#' @importFrom Biostrings "DNAStringSet"
#' @importFrom Biostrings "reverseComplement"
#' @importFrom Biostrings "AAStringSet"
#' @importFrom Biostrings "pairwiseAlignment"
#' @importFrom Biostrings "nindel"
#' @importFrom Biostrings "pid"
#' @importFrom Biostrings "nmatch"
#' @importFrom Biostrings "nmismatch"
#' @importFrom Biostrings "nedit"
#' @importFrom Biostrings "alignedSubject"
#' @importFrom Biostrings "xscat"
#' @importFrom Biostrings "pairwiseAlignment"
#' @importFrom Biostrings "DNAStringSet"
#' @importFrom Biostrings "writeXStringSet"
#' @importFrom Biostrings "writeXStringSet"
#' @importFrom Biostrings "DNAStringSet"
#' @importFrom Biostrings "AAStringSet"
#' @importFrom DECIPHER "AlignSeqs"
#' @importFrom DECIPHER "DistanceMatrix"
#' @importFrom DECIPHER "RemoveGaps"
#' @importFrom seqinr "read.fasta"
#' @importFrom utils "write.table"
#' @examples
#'
#'
#'
#'
#'
#'
#'
#'
sequence_selector_and_aligner <- function( input_fasta_files_user,
input_fasta_reference_file_user = NULL,
reference_taxa_name_user = NULL,
seqinr_seqtype_user = c("DNA","AA"),
do_pairwiseAlignment_user = TRUE,
pairwiseAlignment_type_user = "global",
nucleotideSubstitutionMatrix_match_user = 1,
nucleotideSubstitutionMatrix_mismatch_user = -3,
nucleotideSubstitutionMatrix_baseOnly_user = FALSE,
nucleotideSubstitutionMatrix_type_user = "DNA",
pairwiseAlignment_gapOpening_user = 5,
pairwiseAlignment_gapExtension_user = 2,
order_by_pairwiseAlignment_user = c("nmatch", "dissimilarity_distance"),
pairwiseAlignment_filter_condition_user = "nmatch",
filter_condition_value_user = 300,
filter_condition_operator_user = "more",
retain_top_n_seq_user = NULL,
degap_before_multialign_user = TRUE,
do_multipleAligment_user = TRUE,
aligner_name_user = c("clustalo", "mafft", "DECIPHER"),
aligner_bin_location_user = NULL,
aligner_quiet_user = FALSE,
clustal_n_threads_user = 1,
muscle_n_iter_user = 16,
mafft_accuracy_method_user = NULL,
mafft_retree_user = 2,
mafft_maxiterate_user = 1000,
mafft_gap_opening_penalty_user = 1.53,
mafft_gap_extension_penalty_user = 0.123,
decipher_iterations_user = 3,
decipher_refinements_user = 2,
decipher_gapOpening_user = c(-18,-16),
decipher_gapExtension_user = c(-2, -1),
decipher_useStructures_user = TRUE,
decipher_structures_user = NULL,
decipher_levels_user = c(0.9, 0.7, 0.7, 0.4, 10, 5, 5, 2),
file_out_user = NULL,
out_directory_user = NULL ) {
# user input
sequence_fasta_file <- input_fasta_files_user
ref_fasta_file <- input_fasta_reference_file_user
ref_taxa_name_pattern <- reference_taxa_name_user
is_pairwiseAlignment <- do_pairwiseAlignment_user
pairwiseAlignment_type <- pairwiseAlignment_type_user
mat_pairwiseAlignment <- Biostrings::nucleotideSubstitutionMatrix(match = nucleotideSubstitutionMatrix_match_user,
mismatch = nucleotideSubstitutionMatrix_mismatch_user,
baseOnly = nucleotideSubstitutionMatrix_baseOnly_user,
type = nucleotideSubstitutionMatrix_type_user)
pairwiseAlignment_gapOpening <- pairwiseAlignment_gapOpening_user
pairwiseAlignment_gapExtension <- pairwiseAlignment_gapExtension_user
order_by <- order_by_pairwiseAlignment_user
filter_condition <- pairwiseAlignment_filter_condition_user
condition_value <- filter_condition_value_user
condition_operator <- filter_condition_operator_user
retain_top_n_seq <- retain_top_n_seq_user
degap_before_multialign <- degap_before_multialign_user
do_multipleAligment <- do_multipleAligment_user
aligner_name <- aligner_name_user
aligner_bin_location <- aligner_bin_location_user
aligner_quiet <- aligner_quiet_user
clustal_n_threads <- clustal_n_threads_user
muscle_n_iter <- muscle_n_iter_user
mafft_accuracy_method <- mafft_accuracy_method_user
mafft_retree <- mafft_retree_user
mafft_maxiterate <- mafft_maxiterate_user
mafft_gap_opening_penalty <- mafft_gap_opening_penalty_user
mafft_gap_extension_penalty <- mafft_gap_extension_penalty_user
decipher_iterations <- decipher_iterations_user
decipher_refinements <- decipher_refinements_user
decipher_gapOpening <- decipher_gapOpening_user
decipher_gapExtension <- decipher_gapExtension_user
decipher_useStructures <- decipher_useStructures_user
decipher_structures <- decipher_structures_user
decipher_levels <- decipher_levels_user
seqinr_seqtype <- seqinr_seqtype_user
file_out <- file_out_user
## outdir arguments
if(is.null(out_directory_user)) {
out_directory <- getwd()
} else {
setwd(out_directory_user)
out_directory <- getwd()
}
#### one or combine many fasta
if(length(sequence_fasta_file) > 1) {
one_file_name_1 <- sub(".*/", "", sequence_fasta_file[1])
one_file_name <- gsub("(.*)[.].*","\\1",one_file_name_1)
one_file_name <- paste0(one_file_name, "_combination_of_", length(sequence_fasta_file), ".fasta")
if (file.exists(one_file_name)) { file.remove(one_file_name) }
for(i in 1:length(sequence_fasta_file)) {
one_file <- readLines(sequence_fasta_file[i])
one_file <- c(one_file, "\n")
write(one_file,file=one_file_name,append=TRUE)
}
sequence_fasta_file <- one_file_name
}
######## get input sequence name
sequence_file_for_naming_long <- sub(".*/", "", sequence_fasta_file)
sequence_file_for_naming <- gsub("(.*)[.].*","\\1",sequence_file_for_naming_long)
############ read fasta file
## source sequence
source_sequence_list <- seqinr::read.fasta(file = sequence_fasta_file,
seqtype = seqinr_seqtype,
set.attributes = FALSE,
apply.mask = FALSE,
whole.header = TRUE)
source_sequence_vector <- lapply(source_sequence_list, paste, collapse = "")
source_sequence_vector <- toupper(source_sequence_vector)
names(source_sequence_vector) <- names(source_sequence_list)
######################################################
############ pairwise alignment evaluation
if(is_pairwiseAlignment == TRUE) {
## ref sequence
ref_sequence_list <- seqinr::read.fasta(file = ref_fasta_file,
seqtype = seqinr_seqtype,
set.attributes = FALSE,
apply.mask = FALSE,
whole.header = TRUE)
## select references
if(!is.null(ref_taxa_name_pattern)) {
select_ref_sequence <- ref_sequence_list[grep(ref_taxa_name_pattern,names(ref_sequence_list))]
} else {
select_ref_sequence <- ref_sequence_list
}
## iterate on references
constructed_best_DNAstringset_sets <- list()
constructed_best_DNAstringset_sets_df <- list()
for(uu in 1:length(select_ref_sequence)) {
# uu <- 2
one_ref_sequence <- select_ref_sequence[uu]
one_ref_vec <- as.character(lapply(one_ref_sequence, paste, collapse = ""))
one_ref_vec <- toupper(one_ref_vec)
names(one_ref_vec) <- names(one_ref_sequence)
# combine ro single string
ref_source_vector <- c(one_ref_vec, source_sequence_vector)
if(seqinr_seqtype == "DNA") {
ref_source_string_for <- Biostrings::DNAStringSet(ref_source_vector, use.names=TRUE)
ref_source_string_rev <- Biostrings::reverseComplement(ref_source_string_for)
}
if(seqinr_seqtype == "AA") {
ref_source_string <- Biostrings::AAStringSet(ref_source_vector, use.names=TRUE)
}
# function to move from XstringSet to data.frame
dss2df <- function(dss) data.frame(width=width(dss), seq=as.character(dss), names=names(dss), stringsAsFactors = FALSE)
######################################## pairwise aligment -- Biostrings
# iterate sequences to find
counter_ref <- length(ref_source_vector)
best_DNAstring_df_list <- list()
counter <- 0
ref_one_seq <- ref_source_string_for[1]
cat("\n***** reference sequence used is: \n")
print(ref_one_seq)
cat("\n******* processing sequence -- \n")
for(i in 2:counter_ref) {
# i <- 8
counter <- counter + 1
# forward
target_one_seq_for <- ref_source_string_for[i]
aligned_ref_one_seq_f <- try(Biostrings::pairwiseAlignment(ref_one_seq, target_one_seq_for, type = pairwiseAlignment_type,
substitutionMatrix = mat_pairwiseAlignment,
gapOpening = pairwiseAlignment_gapOpening,
gapExtension = pairwiseAlignment_gapExtension), silent = TRUE)
if(class(aligned_ref_one_seq_f) == "try-error") {
cat("********* SEQUENCE HAS AMBIGUITIES THAT CANNOT BE HANDLED BY THE Biostrings::nucleotideSubstitutionMatrix -- check sequence ********* ")
print(target_one_seq_for)
cat("\n")
problematic_seq <- as.character(target_one_seq_for)
names(problematic_seq) <- NULL
best_seq_ref <- data.frame(ref_name = names(one_ref_sequence),
dissimilarity_distance = 1,
nmatch = NA,
seq_name = names(target_one_seq_for),
align_score = NA,
levenshtein_edit_distance = NA,
percent_sequence_identity = NA,
nmismatch = NA,
insertion_length = NA,
insertion_widthsum = NA,
deletion_length = NA,
deletion_widthsum = NA,
seq_reverse = NA,
sequence = problematic_seq,
stringsAsFactors = FALSE)
best_DNAstring_df_list[[counter]] <- best_seq_ref
next
}
forward_score <- aligned_ref_one_seq_f@score
forward_nindel <- Biostrings::nindel(aligned_ref_one_seq_f)
forward_insertion_length <- forward_nindel@insertion[1]
forward_insertion_widthsum <- forward_nindel@insertion[2]
forward_deletion_length <- forward_nindel@deletion[1]
forward_deletion_widthsum <- forward_nindel@deletion[2]
forward_pid <- Biostrings::pid(aligned_ref_one_seq_f)
forward_nmatch <- Biostrings::nmatch(aligned_ref_one_seq_f)
forward_nmismatch <- Biostrings::nmismatch(aligned_ref_one_seq_f)
forward_nedit <- Biostrings::nedit(aligned_ref_one_seq_f)
forward_aligned <- Biostrings::alignedSubject(aligned_ref_one_seq_f)
forward_aligned_char <- as.character(forward_aligned)
names(forward_aligned_char) <- NULL
pattern_forward_aligned <- Biostrings::xscat(c(Biostrings::alignedPattern(aligned_ref_one_seq_f), Biostrings::alignedSubject(aligned_ref_one_seq_f)))
forward_aligned_distance_m <- DECIPHER::DistanceMatrix(pattern_forward_aligned,
includeTerminalGaps = FALSE,
penalizeGapGapMatches = TRUE,
processors = NULL,
verbose = FALSE)
forward_aligned_distance_m <- round(forward_aligned_distance_m, digits = 5)
forward_aligned_distance <- forward_aligned_distance_m[1,2]
cat(names(forward_aligned), "\n")
# reverse
target_one_seq_rev <- ref_source_string_rev[i]
aligned_ref_one_seq_r <- Biostrings::pairwiseAlignment(ref_one_seq, target_one_seq_rev, type = pairwiseAlignment_type,
substitutionMatrix = mat_pairwiseAlignment,
gapOpening = pairwiseAlignment_gapOpening,
gapExtension = pairwiseAlignment_gapExtension)
reverse_score <- aligned_ref_one_seq_r@score
reverse_nindel <- Biostrings::nindel(aligned_ref_one_seq_r)
reverse_insertion_length <- reverse_nindel@insertion[1]
reverse_insertion_widthsum <- reverse_nindel@insertion[2]
reverse_deletion_length <- reverse_nindel@deletion[1]
reverse_deletion_widthsum <- reverse_nindel@deletion[2]
reverse_pid <- Biostrings::pid(aligned_ref_one_seq_r)
reverse_nmatch <- Biostrings::nmatch(aligned_ref_one_seq_r)
reverse_nmismatch <- Biostrings::nmismatch(aligned_ref_one_seq_r)
reverse_nedit <- Biostrings::nedit(aligned_ref_one_seq_r)
reverse_aligned <- Biostrings::alignedSubject(aligned_ref_one_seq_r)
reverse_aligned_char <- as.character(reverse_aligned)
names(reverse_aligned_char) <- NULL
pattern_reverse_aligned <- Biostrings::xscat(c(Biostrings::alignedPattern(aligned_ref_one_seq_r), Biostrings::alignedSubject(aligned_ref_one_seq_r)))
reverse_aligned_distance_m <- DECIPHER::DistanceMatrix(pattern_reverse_aligned,
includeTerminalGaps = FALSE,
penalizeGapGapMatches = TRUE,
processors = NULL,
verbose = FALSE)
reverse_aligned_distance_m <- round(reverse_aligned_distance_m, digits = 5)
reverse_aligned_distance <- reverse_aligned_distance_m[1,2]
# which choose forward or reverse
if(forward_nmatch >= reverse_nmatch) {
best_seq_ref <- data.frame(ref_name = names(one_ref_sequence),
dissimilarity_distance = forward_aligned_distance,
nmatch = forward_nmatch,
seq_name = names(forward_aligned),
align_score = forward_score,
levenshtein_edit_distance = forward_nedit,
percent_sequence_identity = forward_pid,
nmismatch = forward_nmismatch,
insertion_length = forward_insertion_length,
insertion_widthsum = forward_insertion_widthsum,
deletion_length = forward_deletion_length,
deletion_widthsum = forward_deletion_widthsum,
seq_reverse = "no",
sequence = forward_aligned_char,
stringsAsFactors = FALSE)
best_DNAstring_df_list[[counter]] <- best_seq_ref
} else {
best_seq_ref <- data.frame(ref_name = names(one_ref_sequence),
dissimilarity_distance = reverse_aligned_distance,
nmatch = reverse_nmatch,
seq_name = names(reverse_aligned),
align_score = reverse_score,
levenshtein_edit_distance = reverse_nedit,
percent_sequence_identity = reverse_pid,
nmismatch = reverse_nmismatch,
insertion_length = reverse_insertion_length,
insertion_widthsum = reverse_insertion_widthsum,
deletion_length = reverse_deletion_length,
deletion_widthsum = reverse_deletion_widthsum,
seq_reverse = "yes",
sequence = reverse_aligned_char,
stringsAsFactors = FALSE)
best_DNAstring_df_list[[counter]] <- best_seq_ref
}
}
rm(i)
rm(counter)
cat("\n *** DONE *** \n")
## create df of sequences
best_DNAstring_df <- do.call(rbind, best_DNAstring_df_list)
## order by nmatch or dissimilarity_distance
if(order_by == "nmatch") {
best_DNAstring_df <- best_DNAstring_df[order(best_DNAstring_df$nmatch, decreasing = TRUE),]
}
if(order_by == "dissimilarity_distance") {
best_DNAstring_df <- best_DNAstring_df[order(best_DNAstring_df$dissimilarity_distance, decreasing = FALSE),]
}
# filter if requested based on condition
if(!is.null(filter_condition)) {
if(condition_operator == "more") {
best_DNAstring_filter <- subset(best_DNAstring_df, best_DNAstring_df[,filter_condition] >= condition_value)
} else {
best_DNAstring_filter <- subset(best_DNAstring_df, best_DNAstring_df[,filter_condition] <= condition_value)
}
} else {
best_DNAstring_filter <- best_DNAstring_df
}
rownames(best_DNAstring_filter) <- NULL
# retain_top_n_seq
if(!is.null(retain_top_n_seq)) {
retain_top_n_seq <- ifelse(nrow(best_DNAstring_filter) >= retain_top_n_seq, retain_top_n_seq, nrow(best_DNAstring_filter))
best_DNAstring_filter <- best_DNAstring_filter[1:retain_top_n_seq,]
}
# construct DNAstring
best_DNAstring_seq <- best_DNAstring_filter$sequence
names(best_DNAstring_seq) <- best_DNAstring_filter$seq_name
best_DNAstring_seq <- c(one_ref_vec, best_DNAstring_seq)
# remove na rows
best_DNAstring_seq <- best_DNAstring_seq [!is.na(best_DNAstring_seq)]
best_DNAstringset <- Biostrings::DNAStringSet(best_DNAstring_seq, use.names=TRUE)
cat("\n****** Total number of sequences retained are -- ", length(best_DNAstring_seq) - 1, "\n")
# write data.frame for information
file_fasta_df_name <- paste0(sequence_file_for_naming,"_ref_", names(one_ref_sequence),"_df_info_seq.txt")
utils::write.table(best_DNAstring_df, file = file_fasta_df_name, sep = "\t", row.names = FALSE)
# write fasta file
file_fasta_name <- paste0(sequence_file_for_naming,"_ref_", names(one_ref_sequence), "_biostrings_out.fasta")
if(degap_before_multialign == TRUE) {
best_DNAstringset <- DECIPHER::RemoveGaps(best_DNAstringset, removeGaps = "all", processors = NULL)
}
Biostrings::writeXStringSet(best_DNAstringset, filepath = file_fasta_name, append=FALSE, compress=FALSE, compression_level=NA, format="fasta")
cat("\n****** fasta file written as -- ", file_fasta_name, " -- \n")
constructed_best_DNAstringset_sets[[uu]] <- best_DNAstringset
constructed_best_DNAstringset_sets_df[[uu]] <- best_DNAstring_df
}
rm(uu)
# need to combine all in a single fasta file for multialigner
master_DNAstringset <- do.call(c, constructed_best_DNAstringset_sets)
file_fasta_name <- paste0(sequence_file_for_naming,"_master_", length(master_DNAstringset@ranges@group), "_seq_biostrings_out.fasta")
Biostrings::writeXStringSet(master_DNAstringset, filepath = file_fasta_name, append=FALSE, compress=FALSE, compression_level=NA, format="fasta")
best_DNAstring_df <- do.call(rbind, constructed_best_DNAstringset_sets_df)
file_fasta_name_df <- paste0(sequence_file_for_naming,"_master_", nrow(best_DNAstring_df), "_df_info_seq.txt")
write.table(best_DNAstring_df, file = file_fasta_name_df, sep = "\t", row.names = FALSE)
# for consistency for decipher
best_DNAstringset <- master_DNAstringset
# collecting things to return
seq_items_return <- list(best_DNAstring_df, master_DNAstringset)
} else { # else if not pairwaise alignment
if(seqinr_seqtype == "DNA") {
best_DNAstringset <- Biostrings::DNAStringSet(source_sequence_vector, use.names=TRUE)
}
if(seqinr_seqtype == "AA") {
best_DNAstringset <- Biostrings::AAStringSet(source_sequence_vector, use.names=TRUE)
}
cat("\n****** Total number of sequences retained are -- ", length(source_sequence_vector), "\n")
# write fasta file
file_fasta_name <- paste0(sequence_file_for_naming, "_biostrings_out.fasta")
if(degap_before_multialign == TRUE) {
best_DNAstringset <- DECIPHER::RemoveGaps(best_DNAstringset, removeGaps = "all", processors = NULL)
}
Biostrings::writeXStringSet(best_DNAstringset, filepath = file_fasta_name, append=FALSE, compress=FALSE, compression_level=NA, format="fasta")
cat("\n****** fasta file written as -- ", file_fasta_name, " -- \n")
}
############ END: pairwise alignment evaluation
####################################################################################
####################################################################################
############ multiple alignment evaluation
if(do_multipleAligment == TRUE) {
list_aligner_scripts <- list()
name_of_aligner_analyses <- character()
name_of_align_file <- character()
counter <- 0
############# do clustalo
if("clustalo" %in% aligner_name) {
counter <- counter + 1
clustalo_out_file_fasta <- paste0(sequence_file_for_naming, "_clustalo_out.fasta")
# prepare clustalo commands
line_clustalo <- paste0("clustalo -i ", file_fasta_name, " -o ", clustalo_out_file_fasta,
" --threads=", clustal_n_threads," --auto --force -v")
if(aligner_quiet) { line_clustalo <- paste0("clustalo -i ", file_fasta_name, " -o ", clustalo_out_file_fasta,
" --threads=", clustal_n_threads," --auto --force")}
name_of_aligner_analyses[counter] <- paste0("clustalo_", counter)
name_of_align_file [counter] <- clustalo_out_file_fasta
# all lines together
list_aligner_scripts[[counter]] <- paste0(line_clustalo, collapse = " ")
}
############# do muscle -- DOES NOT WORK
if("muscle" %in% aligner_name) {
counter <- counter + 1
muscle_out_file_fasta <- paste0(sequence_file_for_naming, "_muscle_out.fasta")
# prepare muscle commands
line_muscle <- paste0("muscle -in ", file_fasta_name, " -out ", muscle_out_file_fasta,
" -maxiters ", muscle_n_iter)
if(aligner_quiet) { line_muscle <- paste0("muscle -in ", file_fasta_name, " -out ", muscle_out_file_fasta,
" -maxiters ", muscle_n_iter, " -quiet")}
name_of_aligner_analyses[counter] <- paste0("muscle_", counter)
name_of_align_file [counter] <- muscle_out_file_fasta
# all lines together
list_aligner_scripts[[counter]] <- paste0(line_muscle, collapse = " ")
}
############# do mafft
if("mafft" %in% aligner_name) {
counter <- counter + 1
mafft_out_file_fasta <- paste0(sequence_file_for_naming, "_mafft_out.fasta")
# prepare muscle commands
list_of_mafft <- character()
list_of_mafft[1] <- paste0("mafft.bat ")
counter_mafft <- 1
# mafft_accuracy_method
if(!is.null(mafft_accuracy_method)) {counter_mafft <- counter_mafft + 1; out_line <- paste0(" --", mafft_accuracy_method, " "); list_of_mafft[counter_mafft] <- out_line}
# common mafft parameters
if(!is.null(mafft_retree)) {counter_mafft <- counter_mafft + 1; out_line <- paste0(" --retree ", mafft_retree, " "); list_of_mafft[counter_mafft] <- out_line}
if(!is.null(mafft_maxiterate)) {counter_mafft <- counter_mafft + 1; out_line <- paste0(" --maxiterate ", mafft_maxiterate, " "); list_of_mafft[counter_mafft] <- out_line}
# penalty parameters:
counter_mafft <- counter_mafft + 1; out_line <- paste0(" --op ", mafft_gap_opening_penalty, " "); list_of_mafft[counter_mafft] <- out_line
counter_mafft <- counter_mafft + 1; out_line <- paste0(" --ep ", mafft_gap_extension_penalty, " "); list_of_mafft[counter_mafft] <- out_line
# quiet
if(aligner_quiet) {counter_mafft <- counter_mafft + 1; out_line <- paste0(" --quiet "); list_of_mafft[counter_mafft] <- out_line}
# end
counter_mafft <- counter_mafft + 1; out_line <- paste0(file_fasta_name, " > ", mafft_out_file_fasta); list_of_mafft[counter_mafft] <- out_line
# name of run
name_of_aligner_analyses[counter] <- paste0("mafft_", counter)
name_of_align_file [counter] <- mafft_out_file_fasta
# all lines together
list_aligner_scripts[[counter]] <- paste0(list_of_mafft, collapse = " ")
}
#####################################################################################
#### open and run aligner from R
## add to return list
if(exists("seq_items_return")) {
counter_3 <- length(seq_items_return)
} else {
seq_items_return <- list()
counter_3 <- 0
}
# create vector to export path -- export PATH=/opt/local/bin:/opt/local/sbin:$PATH
export_path_align_bins <- normalizePath(aligner_bin_location)
export_path_align_vector <- paste0("export PATH=",export_path_align_bins,":$PATH")
## prank execute function
execute_aligner_terminal <- function(export_path_section = export_path_align_vector,
aligner_all_commands,
intern = FALSE,
wait = FALSE){
system(paste0(export_path_section, "\n",
aligner_all_commands, "\n"),
intern = intern,
wait = wait)
}
## run list elements with aligner commands
cat("\n****** started serial analisis of: ", length(list_aligner_scripts), "\n" )
if(any(c("clustalo","mafft", "muscle") %in% aligner_name)) {
for(j in 1:length(list_aligner_scripts)) {
cat("***** aligner analysis name: ",name_of_aligner_analyses[j], "\n")
cat("***** aligner commands: ", list_aligner_scripts[[j]], "\n")
execute_aligner_terminal(export_path_section = export_path_align_vector ,
aligner_all_commands = list_aligner_scripts[[j]],
intern = FALSE,
wait = TRUE)
cat("\n***** DONE ***** \n")
Sys.sleep(5)
counter_3 <- counter_3 + 1
if(seqinr_seqtype == "AA") {
seq_items_return[[counter_3]] <- Biostrings::readAAMultipleAlignment(name_of_align_file[j], format = "fasta")
}
if(seqinr_seqtype == "DNA") {
seq_items_return[[counter_3]] <- Biostrings::readDNAMultipleAlignment(name_of_align_file[j], format = "fasta")
}
}
rm(j)
}
############# do DECIPHER
if("DECIPHER" %in% aligner_name) {
# decipher requires to remove gaps ("-")
best_DNAstringset_decipher <- DECIPHER::RemoveGaps(best_DNAstringset, removeGaps = "all", processors = NULL)
cat("***** aligner analysis name: DECIPHER \n\n")
decipher_alignment <- DECIPHER::AlignSeqs(best_DNAstringset_decipher,
guideTree = NULL,
iterations = decipher_iterations,
refinements = decipher_refinements,
gapOpening = decipher_gapOpening,
gapExtension = decipher_gapExtension,
useStructures = decipher_useStructures,
structures = decipher_structures,
FUN = AdjustAlignment,
levels = decipher_levels,
alphabet = AA_REDUCED[[1]],
processors = NULL,
verbose = !aligner_quiet)
# write fasta file
file_fasta_name <- paste0(sequence_file_for_naming, "_decipher_out.fasta")
Biostrings::writeXStringSet(decipher_alignment, filepath = file_fasta_name, append=FALSE, compress=FALSE, compression_level=NA, format="fasta")
cat("\n****** DECIPHER alignment fasta file written as -- ", file_fasta_name, " -- \n")
# add stringset to list
counter_3 <- counter_3 + 1
seq_items_return[[counter_3]] <- decipher_alignment
}
}
############ END: multiple alignment evaluation
####################################################################################
setwd(out_directory)
cat("\n\n ************* alignments and pairwise analyses DONE ************* \n")
if(exists("seq_items_return")) {
return(seq_items_return)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.