#' Align sequences
#'
#' Perform multiple sequence alignment on all the fasta files
#' saved in a given folder. Alignment is conducted using
#' \code{"BiocManager::install("DECIPHER")"}. Note that this function first
#' optimizes the direction of the sequences, then aligns
#' using \code{"BDECIPHER"}, and finally masks the resulting
#' alignment (optionally but does it per default). The
#' masking step includes removing common gaps across all the species and removing
#' highly ambiguous positions. The resulting aligned sequences
#' are stored to a new folder
#' \code{"2.Alignments"}.'
#'
#' @param folder Name of the folder where the sequences to align
#' are stored (character).
#' @param FilePatterns A string that is common to all the target
#' files in the relevant folder (character). Note that
#' this argument can be set to \code{"NULL"} if no specific
#' pattern wants to be analyzed.
#' @param sqs.object A list of sequences generated from sq.curate. Only use if you're
#' not interested in download sequences locally.
#' @param mask Removes ambiguous sites (Logical, TRUE or FALSE).
#' @param maxFractionGapsSpecies Maximum fraction of gaps per species (when masked)
#' @param ... Arguments passed to \code{"DECIPHER::AlignSeqs"}.
#'
#' @importFrom methods as
#' @importFrom Biostrings writeXStringSet
#' @importFrom Biostrings readDNAStringSet
#' @import DECIPHER
#' @import msa
#'
#' @return This function will return an object of class \code{list} including the
#' original and renamed sequence alignments. Optionally, this object will
#' also include the masked alignments.
#'
#' @examples
#' \dontrun{
#' sq.retrieve.direct(
#' clades = c("Felis", "Vulpes", "Phoca"),
#' species = "Manis_pentadactyla",
#' genes = c("ADORA3", "CYTB")
#' )
#' sq.curate(
#' filterTaxonomicCriteria = "Felis|Vulpes|Phoca|Manis",
#' kingdom = "animals", folder = "0.Sequences"
#' )
#' sq.aln(folder = "1.CuratedSequences")
#' }
#' @export
sq.aln <- function(folder = "1.CuratedSequences",
FilePatterns = "renamed",
sqs.object = NULL,
mask = TRUE,
maxFractionGapsSpecies = 0.01,
...) {
if (is.null(folder))
stop("Folder where curated sequences are saved must be provided")
if (!is.logical(mask))
stop("The mask argument must be TRUE or FALSE")
if(is.null(sqs.object)){
##Over-writing?
if( !isTRUE(pkg.env$.testMode) ) {
UI <- readline(paste0("This function might overwrite ",
"2.Alignments", ". Are you sure you want to continue? (y/n) "))
if(UI != 'y') stop('Exiting since you did not press y')
}
files <- list.files(folder, FilePatterns)
files <- sub("renamed_", "", files)
filesComplete <- list.files(folder, FilePatterns, full.names = TRUE)
unlink("2.Alignments", recursive = TRUE)
dir.create("2.Alignments")
invisible(
lapply(seq_along(filesComplete), function(x) {
seqs <- Biostrings::readDNAStringSet(filesComplete[x])
seqs <- DECIPHER::OrientNucleotides(seqs)
aligned <- DECIPHER::AlignSeqs(seqs, ...)
if (mask) {
alignedNoGaps <- DECIPHER::RemoveGaps(aligned, removeGaps = "common")
alignedMasked <- DECIPHER::MaskAlignment(alignedNoGaps,
correction = if (length(alignedNoGaps) < 200) {
TRUE
} else {
FALSE
})
DNAStr <- as(alignedMasked, "DNAStringSet")
if (max(nchar(as.character(DNAStr))) != 0) {
##Species removed while masking the aln
RemMasking <- !names(aligned) %in% names(DNAStr)
##Remove species with not enough data
NonGaps <- nchar(gsub("-", "", as.character(DNAStr)))
MaxNumberNonGaps <- as.vector(max(nchar(as.character(DNAStr)))*maxFractionGapsSpecies)
#if(MaxNumberNonGaps > 100){
rem <- NonGaps < MaxNumberNonGaps
SumRem <- cbind.data.frame(Species = names(NonGaps),
NonGaps,
#MaxNumberNonGaps,
removedPerGaps = rem,
removedMasking = RemMasking)
DNAStr <- DNAStr[!rem]
DNAStr <- DNAStr[!duplicated(names(DNAStr))]
#Export everything
write.csv(SumRem, paste0("2.Alignments/0.Masked.Information_", files[x], ".csv"))
writeXStringSet(DNAStr,
filepath = paste0("2.Alignments/Masked_",
files[x]))
}else{
write.csv("Masking failed", paste0("2.Alignments/0.Masked.Information_", files[x], ".csv"))
}
}
writeXStringSet(aligned, filepath = paste0("2.Alignments/Raw_", files[x]))
})
)
} else{
alns <- lapply(sqs.object$Sequences, function(x) {
if(!is.null(x)){
seqs <- Biostrings::DNAStringSet(unlist(lapply(as.character(x$Renamed),paste0, collapse="")))
seqs <- DECIPHER::OrientNucleotides(seqs)
aligned <- DECIPHER::AlignSeqs(seqs)
#if (mask) {
alignedNoGaps <- DECIPHER::RemoveGaps(aligned, removeGaps = "common")
alignedMasked <- DECIPHER::MaskAlignment(alignedNoGaps,
correction = if (length(alignedNoGaps) < 200) {
TRUE
} else {
FALSE
})
DNAStr <- as(alignedMasked, "DNAStringSet")
no.masking <- max(nchar(as.character(DNAStr)))
if (no.masking != 0) {
##Species removed while masking the aln
RemMasking <- !names(aligned) %in% names(DNAStr)
##Remove species with not enough data
NonGaps <- nchar(gsub("-", "", as.character(DNAStr)))
MaxNumberNonGaps <- as.vector(max(nchar(as.character(DNAStr)))*maxFractionGapsSpecies)
#if(MaxNumberNonGaps > 100){
rem <- NonGaps < MaxNumberNonGaps
SumRem <- cbind.data.frame(Species = names(NonGaps),
NonGaps,
#MaxNumberNonGaps,
removedPerGaps = rem,
removedMasking = RemMasking)
DNAStr <- DNAStr[!rem]
DNAStr <- DNAStr[!duplicated(names(DNAStr))]
}
#}
toRet <- if(no.masking != 0) {
list("Aln.Original" = as.DNAbin(aligned), "Info.Masked" = SumRem, "Aln.Masked" = as.DNAbin(DNAStr))
}else{
list("Aln.Original" = as.DNAbin(aligned))
}
return(toRet)
}
})
names(alns) <- names(sqs.object$Sequences)
return(alns)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.