#' GUIDetree-based AligNment ConficencE
#'
#' @description MSA reliability assessment GUIDANCE (Penn et al. 2010)
#'
#' @param sequences An object of class \code{\link{DNAbin}} or
#' \code{\link{AAbin}} containing unaligned sequences of DNA or amino acids.
#' @param parallel logical, if TRUE, specify the number of cores
#' @param ncore number of cores (default is maxinum of local threads)
#' @param bootstrap An integer giving the number of perturbated MSAs.
#' @param msa.program A charcter string giving the name of the MSA program,
#' currelty one of c("mafft", "muscle", "clustalo", "clustalw2"); MAFFT is
#' default
#' @param exec A character string giving the path to the executable of the
#' alignment program.
#' @param method further arguments passed to mafft, default is \code{"auto"}
#' @param col.cutoff numberic between 0 and 1; specifies a cutoff to remove
#' unreliable columns below the cutoff; either user supplied or \code{"auto"}
#' (0.73)
#' @param seq.cutoff numberic between 0 and 1; specifies a cutoff to remove
#' unreliable sequences below the cutoff; either user supplied of
#' \code{"auto"} (0.5)
#' @param mask.cutoff specific residues below a certain cutoff are masked ('N'
#' for DNA, 'X' for AA); either user supplied or \code{"auto"} (0.5)
#' @param nj.program specify if R functions or SEMPHY should be used for guide-tree estimation
#' @return list containing following scores and alignments:
#' @return mean_scores residue pair score and mean column score
#' @return column_score
#' @return residue_column_score GUIDANCE score
#' @return residue_pair_residue_score
#' @return residual_pair_sequence_pair_score
#' @return residual_pair_sequence_score
#' @return residue_pair_score
#' @return base_msa
#' @return guidance_msa is the base_MSA removed from unreliable
#' residues/columns/sequences below cutoffs
#'
#' @details Calculates column confidence (and other scors) by comparing
#' alternative MSAs generated by alternative guide trees derived from
#' bootstrap MSAs (Felsenstein 1985). The basic comparison between the BP MSAs
#' and a reference MSA is if column residue pairs are identically aligned in
#' all alternative MSAs compared with the base MSA (see \code{compareMSAs}).
#' @references Felsenstein J. 1985. Confidence limits on phylogenies: an
#' approach using the bootstrap. Evolution 39:783–791
#' @references Penn et al. (2010). An alignment confidence score capturing
#' robustness to guide tree uncertainty. Molecular Biology and Evolution
#' 27:1759--1767
#' @references G. Landan and D. Graur (2008). Local reliability measures from
#' sets of co-optimal multiple sequencesuence alignments. 13:15--24
#'
#' @import ips
#' @import doSNOW
#' @import foreach
#' @import parallel
#' @import pbmcapply
#' @import plyr
#' @importFrom phangorn as.phyDat dist.ml
#'
#' @seealso \code{\link{compareMSAs}}, \code{\link{guidance2}},
#' \code{\link{HoT}}
#'
#' @examples
#' \dontrun{
#' # first run GUIDANCE on example data using MAFFT
#' file <- system.file("extdata", "BB50009.fasta", package = "rpg")
#' aa_seq<- read.fas(file)
#' g_msa <- guidance(sequences = aa_seq,
#' msa.program = "mafft",
#' exec = "/usr/local/bin/mafft",
#' bootstrap = 100,
#' parallel = FALSE,
#' method = "retree 1")
#' h.p <- confidence.heatmap(g_msa, title = "GUIDANCE BB50009 benchmark",
#' legend = TRUE,guidance_score = TRUE)
#' h.p
#' # again with Muscle
#' g_msa_m <- guidance(sequences = aa_seq,
#' msa.program = "muscle",
#' exec = "/Applications/muscle",
#' bootstrap = 100,
#' parallel = FALSE,
#' method = "retree 1")
#' h.p <- confidence.heatmap(g_msa_m, title = "GUIDANCE BB50009 benchmark",
#' legend = TRUE,guidance_score = TRUE)
#' h.p
#'
#' ## Plot both for comparison
#' h.p.mafft <- confidence.heatmap(g_msa, title = "MAFFT",
#' legend = FALSE, guidance_score = FALSE)
#' h.p.muscle <- confidence.heatmap(g_msa_m, title = "MUSCLE",
#' legend = FALSE, guidance_score = FALSE)
#' library(cowplot)
#' plot_grid(h.p.mafft, h.p.muscle, ncol = 1, nrow = 2)
#' }
#'
#' @author Franz-Sebastian Krah
#' @author Christoph Heibl
#' @export
guidance <- function(sequences,
msa.program = "mafft", exec,
bootstrap = 100,
col.cutoff = "auto",
seq.cutoff = "auto",
mask.cutoff = "auto",
parallel = FALSE, ncore ="auto",
method = "auto",
alt.msas.file,
nj.program){
##############################################
## SOME CHECKS
##############################################
if (!inherits(sequences, c("DNAbin", "AAbin")))
stop("sequences not of class DNAbin or AAbin (ape)")
if (length(labels(sequences)) < 8)
warning("GUIDANCE is not suitable for alignments of very few sequences.\n
As a rule of thumb, use GUIDANCE2 or HoT for less than 8 sequences.")
## Check for MSA program
if (missing(exec)){
os <- Sys.info()
os <- os[grep("sysname", names(os))]
if (msa.program == "mafft") {
exec <- switch(os, Linux = "mafft", Darwin = "mafft",
Windows = "mafft.bat")
}
if (msa.program == "muscle") {
exec <- switch(os, Linux = "muscle", Darwin = "muscle",
Windows = "muscle3.8.31_i86win32.exe")
}
if (msa.program == "clustalo") {
exec <- switch(os, Linux = "clustalo", Darwin = "clustalo",
Windows = "clustalo.exe")
}
if (msa.program == "clustalw2") {
exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
Windows = "clustalw2.exe")
}
}
out <- system(paste(exec, "--v"), ignore.stdout = TRUE, ignore.stderr = TRUE)
if (out == 127)
stop("please provide exec path or install MSA program in root \n
i.e. in Unix: '/usr/local/bin/mafft'")
## generate some parameters if not specified
## -----------------------------------------
## number of cores
if (ncore == "auto"){
ncore <- detectCores(all.tests = FALSE, logical = TRUE)
}
## Sequence type
# type <- class(sequences)
# type <- gsub("bin", "", type)
##############################################
## PART I
##############################################
## BASE and ALTERNATIVE MSAs
##############################################
## Generate BASE MSA
#---------------------------------------------
cat("Generating the base alignment")
if (msa.program == "mafft"){
base.msa <- mafft(x = sequences, method = method, exec = exec)
}
if (msa.program == "muscle"){
base.msa <- muscle2(x = sequences, exec = exec)
}
if (msa.program == "clustalo"){
base.msa <- clustalo(x = sequences, exec = exec)
}
if (msa.program == "clustalw2"){
base.msa <- clustalw2(x = sequences, exec = exec)
}
cat("... done \n")
## Constructing BP guide-trees for the pertubated MSAs
#--------------------------------------------------------
cat("Generating NJ guide trees \n")
## Compute NJ guide trees
if (nj.program =="semphy"){
#### SEMPHY ###
if(nrow(base.msa) > 150){
if (type == "DNA")
model <- "--nucjc"
if (type == "AA")
model <- "--aaJC"
} else {
if (tpye == "DNA")
model <- "--hky"
if (type == "AA")
model <- "--jtt"
}
nj.guide.trees <- semphy(msa = base.msa,
dist.table<- "-J",
model <- model,
arsv <- "-H",
bootstrap = bootstrap,
verbose = verbose)
# lapply(nj.guide.trees, root, outgroup = )
nj.guide.trees <- lapply(nj.guide.trees, compute.brlen)
}
pb <- txtProgressBar(max = bootstrap, style = 3)
if (nj.program == "R"){
if (parallel){
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
cl <- makeCluster(ncore)
registerDoSNOW(cl)
nj.guide.trees <- foreach(i = 1:bootstrap,
.options.snow = opts,
.packages = "phangorn", .export = 'msaBP_nj_tree') %dopar% {
msaBP_nj_tree(base.msa, outgroup = "auto")
}
stopCluster(cl)
}
if (!parallel){
nj.guide.trees <- foreach(i = 1:bootstrap, .packages = "phangorn") %do% {
setTxtProgressBar(pb, i)
msaBP_nj_tree(base.msa, outgroup = "auto")
}
}
close(pb)
}
## Alignment of MSA BP times with new NJ guide trees
## -------------------------------------------------
cat("Alignment of sequences using NJ guide trees \n")
msa_out <- vector(length = bootstrap)
for (i in seq_along(msa_out))
msa_out[i] <- tempfile(pattern = "mafft", tmpdir = tempdir(), fileext = ".fas")
unlink(msa_out[file.exists(msa_out)])
pb <- txtProgressBar(max = bootstrap, style = 3)
if (parallel){
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
cl <- makeCluster(ncore)
registerDoSNOW(cl)
if (msa.program == "mafft"){
foreach(i = 1:bootstrap, .packages = c('ips', 'ape'),
.options.snow = opts) %dopar% {
mafft(x = sequences, gt = nj.guide.trees[[i]],
exec = exec, file = msa_out[i], method = method)
}
}
if (msa.program == "muscle"){
foreach(i = 1:bootstrap, .packages = c('ips', 'ape'),
.options.snow = opts) %dopar% {
muscle2(x = sequences, gt = nj.guide.trees[[i]],
exec = exec, file = msa_out[i])
}
}
if (msa.program == "clustalo"){
foreach(i = 1:bootstrap, .packages = c('ips', 'ape'),
.options.snow = opts) %dopar% {
clustalo(x = sequences, gt = nj.guide.trees[[i]],
exec = exec, file = msa_out[i])
}
}
if (msa.program == "clustalw2"){
foreach(i = 1:bootstrap, .packages = c('ips', 'ape'),
.options.snow = opts) %dopar% {
clustalw2(x = sequences, gt = nj.guide.trees[[i]],
exec = exec, file = msa_out[i])
}
}
stopCluster(cl)
}
if (!parallel){
if (msa.program == "mafft"){
foreach(i = 1:bootstrap, .packages = c('ips', 'ape')) %do% {
setTxtProgressBar(pb, i)
mafft2(x = sequences, gt = nj.guide.trees[[i]],
method = method, exec = exec, file = msa_out[i])
}
}
if (msa.program == "muscle"){
foreach(i = 1:bootstrap, .packages = c('ips', 'ape')) %do% {
setTxtProgressBar(pb, i)
muscle2(x = sequences, gt = nj.guide.trees[[i]],
exec = exec, file = msa_out[i])
}
}
if (msa.program == "clustalo"){
foreach(i = 1:bootstrap, .packages = c('ips', 'ape')) %do% {
setTxtProgressBar(pb, i)
clustalo(x = sequences, gt = nj.guide.trees[[i]],
exec = exec, file = msa_out[i])
}
}
if (msa.program == "clustalw2"){
foreach(i = 1:bootstrap, .packages = c('ips', 'ape')) %do% {
setTxtProgressBar(pb, i)
clustalw2(x = sequences, gt = nj.guide.trees[[i]],
exec = exec, file = msa_out[i])
}
}
}
close(pb)
mafft_created <- list.files(getwd(),
full.names = TRUE)[grep("tree.mafft", list.files(getwd()))]
if (length(mafft_created)){
file.remove(mafft_created)
}
##############################################
## PART II
##############################################
## Computation of GUIDANCE scores
##############################################
cat("Calculating GUIDANCE scores \n")
# ## GUIDANCE Score
#----------------------------------------------
## Create temporary dir for alternative MSAs
dir.create(paste(tempdir(), "alt", sep = "/"))
files_from <- list.files(tempdir(), full.names = TRUE)
files_from <- files_from[grep("\\.fas", files_from)]
files_to <- list.files(tempdir())
files_to <- files_to[grep("\\.fas", files_to)]
files_to <- paste(tempdir(), "alt", files_to, sep="/")
## Move files into new dir
file.rename(files_from, files_to)
## Run msa_set_score
#---------------------
scores <- compareMSAs(ref = base.msa,
dir_path = paste(tempdir(), "alt", sep = "/"),
com = NULL)
## if wanted, store alternative MSAs into a zip file
if (!missing(alt.msas.file)){
files <- list.files(paste(tempdir(), "alt", sep = "/"))
files <- files[grep("HoT", files)]
for (i in 1:(n.coopt*bootstrap)){
file.rename(paste(tempdir(), files[i], sep = "/"),
paste(tempdir(), paste0("altMSA", i, ".fas"), sep = "/"))}
files <- list.files(tempdir(), full.names = TRUE)
files <- files[grep("altMSA*", files)]
zip(zipfile = alt.msas.file, files = files)
# maybe better to use gzfile
## currently zip creates many wired subfolders
}
## Delete temporary files
# this deletion approach has proven best to delete everything
files <- list.files(tempdir(), full.names = TRUE)
files <- files[-grep("rs-graphics", files)]
unlink(files, force = TRUE, recursive = TRUE)
## Reduce MSA according to cutoffs, it wanted
## WILL BE TRASFERRED TO METHOD FOR polentaDNA class
#---------------------------------------------------
# msa <- guidance.msa <- base.msa
# msa <- as.character(msa)
# ## masking residues below cutoff
# if (mask.cutoff > 0){
# txt <- as.vector(as.character(base.msa))
# # mat <- data.frame(rpr.sc, txt)
# mat <- data.frame(scores$residue_pair_residue_score, txt)
# rown <- max(mat$residue)
# coln <- max(mat$col)
# res_mat <- matrix(mat$score, nrow = rown, ncol = coln)
#
# if (mask.cutoff == "auto"){ mask.cutoff <- 0.50 }
#
# if (inherits(sequences, "DNAbin")){
# msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "N"
# msa <- as.DNAbin(msa)
# }
# if (inherits(sequences, "AAbin")) {
# msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "X"
# rownames(msa) <- labels(sequences)
# class(msa) <- "AAbin"
# }
# guidance.msa <- msa
# }
# ## remove unreliable columns
# if (col.cutoff > 0){
# if (mask.cutoff) {
# msa <- guidance.msa
# } else {
# msa <- base.msa
# }
# if (col.cutoff == "auto"){ col.cutoff <- 0.97 }
# # remove_cols <- g.cs$res_pair_col_score < col.cutoff
# remove_cols <- scores$column_score$CS < col.cutoff
# guidance.msa <- msa[, !remove_cols]
# }
# ## remove unreliable sequences
# if (seq.cutoff > 0){
# if (mask.cutoff) { msa <- guidance.msa
# } else { msa <- base.msa }
# if (seq.cutoff =="auto"){seq.cutoff <- 0.5}
# # remove_sequences <- rps.sc$res_pair_seq_score < seq.cutoff
# remove_sequences <- scores$residual_pair_sequence_score$score < seq.cutoff
# guidance.msa <- msa[!remove_sequences,]
# }
#
# ## Generate output
# res <- list(scores = scores,
# GUIDANCE_msa = guidance.msa,
# base_msa = base.msa)
## Prepare and return output
## -------------------------
m <- matrix(scores$residue_pair_residue_score$score, nrow = nrow(base.msa))
polentaDNA(base.msa, m, "guidance")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.