#' @title MSA Reliability Assessment with GUIDANCE2
#' @description Calculate MSA reliability scores with GUIDANCE2 (Sela et al.
#' 2015).
#' @param sequences An object of class \code{\link{DNAbin}} or
#' \code{\link{AAbin}} containing unaligned sequences of DNA or amino acids.
#' @param bootstrap An integer giving the number of alternative MSAs to be
#' computed.
#' @param method A character string containing further arguments passed to
#' MAFFT; default is \code{"auto"}.
#' @param n.coopt An integer giving the number of sampled co-optimal MSAa.
#' @param msa.exec A character string giving the path to the executable of the
#' alignment program (e.g. \code{/usr/local/bin/mafft}); possible programs are
#' \code{MAFFT}, \code{MUSCLE}, and \code{ClustalW}.
#' For details see \code{\link{clustal}}, \code{\link{mafft}}
#' @param ncore An integer specifying the number of cores; default = 1 (i.e.
#' serial execution); \code{"auto"} can be used for automated usage of all
#' detected cores.
#' @param zip.file A character string giving the name of zip-compressed file,
#' which contains the alternative MSAs. If left empty (default), the
#' alternative MSA will not be stored and cannot be assessed by the user.
#' @return An object of class \code{\linkS4class{guidanceDNA}} or
#' \code{\linkS4class{guidanceAA}}.
#' @details Calculates column confidence (and other scors) by comparing
#' alternative MSAs generated by the GUIDANCE with varying gap opening panelty
#' and the HoT methodology. First 100 alternative MSAs (with BP guide trees)
#' with varying gap opening panelty are produced, then for each n (default =
#' 4) co-optimal alignments are produced using HoT. The basic comparison
#' between the BP MSAs and a reference MSA is if column residue pairs are
#' identically aligned in all alternative MSAs compared with the base MSA (see
#' \code{compareMSAs}).
#' @details For an example workflow see Vignette
#' @references Felsenstein, J. 1985. Confidence limits on phylogenies: an
#' approach using the bootstrap. \emph{Evolution} \strong{39}:783--791.
#' @references Landan and Graur. 2008. Local reliability measures from
#' sets of co-optimal multiple sequence alignments. \emph{Pacific Symposium on
#' Biocomputing} \strong{13}:15--24.
#' @references Penn et al. 2010. An
#' alignment confidence score capturing robustness to guide tree uncertainty.
#' \emph{Molecular Biology and Evolution} \strong{27}:1759--1767.
#' @references Sela et al. 2015. GUIDANCE2: accurate detection of unreliable
#' alignment regions accounting for the uncertainty of multiple parameters.
#' \emph{Nucleic Acids Research} \strong{43}:W7--W14
#'
#' @examples
#' \dontrun{
#' # run GUIDANCE on example data using MAFFT
#' fpath <- system.file("extdata", "BB30015.fasta", package="rGUIDANCE") # random example from BALiBASE
#' fas <- ape::read.FASTA(fpath)
#' g <- guidance2(sequences = fas, msa.exec= "/usr/local/bin/mafft")
#' scores <- scores(g, score = "column")
#' plot(scores$column$score, xlab = "Site",
#' ylab = "Column score",
#' main = "GUIDANCE2", type ="l")
#' }
#' @author Franz-Sebastian Krah
#' @seealso \code{\link{guidance}}, \code{\link{HoT}}
#' @importFrom ips mafft
#' @import doSNOW
#' @import foreach
#' @importFrom parallel makeCluster
#' @import pbmcapply
#' @import plyr
#' @importFrom phangorn as.phyDat dist.ml
#' @importFrom stringr str_extract
#' @importFrom stats runif
#' @export
guidance2 <- function(sequences,
bootstrap = 100,
method = "auto",
n.coopt = 4,
msa.exec = "/usr/local/bin/mafft",
ncore = 1,
zip.file){
##############################################
## SOME CHECKS
##############################################
if (!inherits(sequences, c("DNAbin", "AAbin")))
stop("sequences not of class DNAbin or AAbin (ape)")
## Look up MSA program specified
msa.program <- str_extract(msa.exec, "mafft|muscle|clustalo|clustalw|prank")
if(!msa.program %in% c("mafft", "muscle", "clustalw"))
stop("Currently only MAFFT, MUSCLE or ClustalW")
nseq <- ifelse(is.matrix(sequences), nrow(sequences), length(sequences))
if (nseq > 199)
warning("alignments with more than 200 sequences may run into computional problems")
## Check for MSA program
## ---------------------
out <- system(paste(msa.exec, "--v"), ignore.stdout = TRUE, ignore.stderr = TRUE)
if (out == 127)
stop("please provide exec path or install MSA program in root \n
i.e. in Unix: '/usr/local/bin/mafft'")
## Number of cores
if (ncore == "auto") ncore <- detectCores(all.tests = FALSE, logical = TRUE)
##############################################
## PART I
##############################################
## BASE and PERTUBATED MSAs
##############################################
## Generate base alignment
## -----------------------
cat("Generating the base alignment\n")
if (msa.program == "mafft") {
base_msa <- mafft(x = sequences,
exec = msa.exec, method = method,
maxiterate = 0, op = 1.53, ep = 0,
thread = -1)
}
if (msa.program == "clustalw") {
base_msa <- clustal(x = sequences,
exec = msa.exec,
pw.gapopen = 10, pw.gapext = 0.1,
gapopen = 10, gapext = 0.2,
MoreArgs = "")
}
if (msa.program == "muscle") {
base_msa <- muscle(x = sequences,
exec = msa.exec,
MoreArgs = "")
}
## Compute NJ guide trees
## ----------------------
cat("Generating NJ guide trees\n")
pb <- txtProgressBar(max = bootstrap, style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
cl <- makeCluster(ncore)
registerDoSNOW(cl)
nj_guidetrees <- foreach(i = 1:bootstrap,
.options.snow = opts,
.packages = "phangorn",
.export = 'msaBP_nj_tree') %dopar% {
msaBP_nj_tree(base_msa, outgroup = "auto")
}
stopCluster(cl)
close(pb)
## Alignment of MSA with NJ guide trees bases on pertubated base MSA
## -----------------------------------------------------------------
cat("Alignment of alternative MSAs using NJ guide trees (GUIDANCE)\n")
## Align perturbated MSAs (GUIDANCE)
pb <- txtProgressBar(max = bootstrap, style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
cl <- makeCluster(ncore)
registerDoSNOW(cl)
if (msa.program == "mafft") {
alt_msa <- foreach(i = 1:bootstrap,
.packages = c('ips'),
# .export = c("sequences", "nj_guidetrees", "msa.exec", "method"),
.options.snow = opts) %dopar% {
mafft(x = sequences, gt = nj_guidetrees[[i]],
exec = msa.exec, method = method,
op = runif(1,0,5))
}
}
if (msa.program == "clustalw") {
alt_msa <- foreach(i = 1:bootstrap,
# .packages = c('ips', 'ape'),
# .export = c("sequences", "nj_guidetrees", "msa.exec", "method"),
.options.snow = opts) %dopar% {
clustal(x = sequences, guide.tree = nj_guidetrees[[i]],
exec = msa.exec,
MoreArgs = paste('-PAIRGAP=', format(runif(1, 1, 9), digits = 0)))
}
}
if (msa.program == "muscle") {
alt_msa <- foreach(i = 1:bootstrap,
# .packages = c('ips', 'ape'),
# .export = c("sequences", "nj_guidetrees", "msa.exec", "method"),
.options.snow = opts) %dopar% {
muscle(x = sequences, guide.tree = nj_guidetrees[[i]],
exec = msa.exec,
MoreArgs = paste('-gapopen ', format(runif(1, -400, -10), digits = 0)))
}
}
stopCluster(cl)
close(pb)
mafft_created <- list.files(getwd(),
full.names = TRUE)[grep("tree.mafft", list.files(getwd()))]
if (length(mafft_created)){
file.remove(mafft_created)
}
##############################################
## PART III
##############################################
## Generate co-optimal alignments with HoT
##############################################
cat(paste("Sampling", n.coopt, "co-optimal MSAs (HoT) for each alternative MSA\n"))
# predifined file allocation
start <- seq(1, n.coopt * bootstrap, n.coopt)
end <- seq(n.coopt, n.coopt * bootstrap, n.coopt)
stend <- data.frame(start, end)
stend <- stend + bootstrap
## run HoT
## -------
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
cl <- makeCluster(ncore)
registerDoSNOW(cl)
alt_msa <- foreach(i = 1:bootstrap,
.options.snow = opts,
# .packages = c('ips', 'adephylo', 'foreach', 'phangorn'),
# .export = c("n.coopt", "sequences", "msa.program",
# "method", "msa.exec", "msa_out")
.export = "Hot_GUIDANCE2"
) %dopar% {
Hot_GUIDANCE2(alt_msa[[i]], n.coopt = n.coopt,
raw_seq = sequences,
method = method, msa.exec = msa.exec)
}
stopCluster(cl)
close(pb)
## Unlist nested list
alt_msa <- foreach(i = seq_along(alt_msa), .combine = c) %do% {
alt_msa[[1]]
}
##############################################
## PART IV
##############################################
## Computation of GUIDANCE 2 scores
##############################################
cat("\nCalculating GUIDANCE2 scores\n")
score <- msa_set_score(ref = base_msa,
alt = alt_msa)
## Store alternative MSAs in a zip file (optional)
## -----------------------------------------------
if (!missing(zip.file)){
for (i in seq_along(alt_msa)){
write.FASTA(alt_msa[[i]], file = paste0(zip.file, "alt_msa_", i, ".fas"))
}
files <- list.files(zip.file, full.names = TRUE)
files <- files[grep("alt_msa", files)]
zip(zipfile = paste0(zip.file, "guidance2_alt_msas_", Sys.Date(), ".zip"), files = files)
file.remove(files)
}
## Prepare and return output
## -------------------------
if (inherits(sequences, "AAbin")){
guidanceAA(base_msa, score, "guidance2", msa.program)
} else {
guidanceDNA(base_msa, score, "guidance2", msa.program)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.