#' @title Heads or Tails Alignment Reliability
#' @description MSA reliability assessment HoT (Landan and Graur 2008)
#' @param sequences object of class \code{\link{DNAbin}} or \code{\link{AAbin}}
#' containing unaligned sequences of DNA or amino acids.
#' @param method further argument passed to MAFFT, default is \code{"auto"}
#' @param bootstrap integer giving the number of alternative MSAs to be computed
#' @param n.coopt Character string, xxx.
#' @param plot_guide Logical, xxx.
#' @param store_msas Logical, xxx.
#' @param msa.exec character string giving the path to the executable of the
#' alignment program (e.g. "/usr/local/bin/mafft"); Must be on of: 'mafft',
#' 'muscle', 'clustalo', 'clustalw2'
#' @param ncore integer specifying the number of cores; default = 1 (serial),
#' "auto" can be used for automated usage of all detected cores.
#' @param zip.file A character string giving the name for the output zip file.
#' @return An object of class \code{\linkS4class{guidanceDNA}} or
#' \code{\linkS4class{guidanceAA}}.
#' @details Calculates column reliability (and other scors) by comparing
#' alternative MSAs generated by aligning guide tree partitions as described
#' in Landan and Graur (2008). For details see \code{compareMSAs}. 8*(N-3)
#' alternative MSAs are generated by default, where N is the number of
#' sequences.
#' @references G. Landan and D. Graur. 2008. Local reliability measures from
#' sets of co-optimal multiple sequencesuence alignments. 13:15--24
#' @seealso \code{\link{msa_set_scoreR}}, \code{\link{guidance}},
#' \code{\link{guidance2}}
#'
#' @author Franz-Sebastian Krah
#' @import adephylo
#' @importFrom ape compute.brlen ladderize multi2di Ntip
#' @importFrom ips mafft read.fas
#' @import doSNOW
#' @import foreach
#' @importFrom graphics legend
#' @import parallel
#' @import pbmcapply
#' @import plyr
#' @importFrom phangorn as.phyDat dist.ml
#' @importFrom utils setTxtProgressBar txtProgressBar zip
#' @export
HoT <- function(sequences, method = "auto",
bootstrap,
n.coopt ="auto",
plot_guide = TRUE,
store_msas = FALSE,
msa.exec = "/usr/local/bin/mafft",
ncore = 1,
zip.file) {
##############################################
## SOME CHECKS
##############################################
# if (!is.object(sequences)){
# read.fas(sequences)
# }
if (!inherits(sequences, c("DNAbin","AAbin")))
stop("sequences not of classes DNAbin or AAbin (ape)")
## look up MSA program specified
msa.program <- str_extract(msa.exec, "mafft|muscle|clustal\\w")
## Check for MSA program
if (missing(msa.exec)){
os <- Sys.info()[1]
if (msa.program == "mafft") {
exec <- switch(os, Linux = "mafft", Darwin = "mafft",
Windows = "mafft.bat")
}
if (msa.program == "muscle") {
exec <- switch(os, Linux = "muscle", Darwin = "muscle",
Windows = "muscle3.8.31_i86win32.exe")
}
if (msa.program == "clustalo") {
exec <- switch(os, Linux = "clustalo", Darwin = "clustalo",
Windows = "clustalo.exe")
}
if (msa.program == "clustalw2") {
exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
Windows = "clustalw2.exe")
}
out <- system(paste(exec, "--v"), ignore.stdout = TRUE, ignore.stderr = TRUE)
if (out == 127)
stop("please provide exec path or install MSA program in root \n
i.e. in Unix: '/usr/local/bin/mafft'")
}
if (missing(zip.file)) zip.file <- paste0("guidance_altMSA_", Sys.Date())
## generate some parameters if not specified
#---------------------------------------------
## number of cores
if (ncore == "auto") {
ncore <- detectCores(all.tests = FALSE, logical = TRUE)
}
## if more than 200 species intermediate results are processed via files
int_file <- length(sequences) > 200
##############################################
## PART I
##############################################
## BASE and ALTERNATIVE MSAs
##############################################
cat("Generating the base alignment \n")
## create loop input
mafft_method <- ifelse(msa.program == "mafft", ", method = method", "")
base.msa <- paste0(msa.program,
"(x = sequences, exec = msa.exec",
mafft_method, ")")
## Make alignment
base.msa <- eval(parse(text = base.msa))
## Calculate start guide tree
#----------------------------------------------
cat("Calculate start tree \n")
base.msa.ml <- as.phyDat(base.msa)
# find ML distance as input to nj tree search
ml.dist.msa <- dist.ml(base.msa.ml)
# NJ
start_tree <- nj(ml.dist.msa)
start_tree <- multi2di(start_tree)
start_tree <- compute.brlen(start_tree)
## produce MSA partitions
align_parts <- partitions(start_tree)
# here could be a sampling of co-opts like in
# guidance2. now we sample all
n.coopt.sub <- rep("all", ncol(align_parts))
n.coopt <- (Ntip(start_tree) - 3) * 8
##############################################
## PART II
##############################################
## Co-optimal MSAs
##############################################
cat(paste("Sampling", n.coopt, "co-optimal alignments \n", sep = " "))
## Create temporary files
#----------------------------------------------
msa_out <- vector(length = n.coopt)
for (i in seq_along(msa_out))
msa_out[i] <- tempfile(pattern = "HoT", tmpdir = tempdir(), fileext = ".fas")
unlink(msa_out[file.exists(msa_out)])
# predifined file storage allocation (because it runs in batches of 8)
start <- seq(1, n.coopt, 8)
end <- seq(8, n.coopt, 8)
stend <- data.frame(start, end)
## Function implementing batch alignment and storage in batches
align_part_set2 <- function(x, partition_set, method,
msa.exec, msa.program, coopt.sub, files,
int_file){
alt_msas <- align_part_set(x = x,
partition_set = partition_set,
method = method, msa.exec = msa.exec, msa.program = msa.program,
coopt.sub = coopt.sub)
if (int_file){
for (j in 1:8)
write.fas(alt_msas[[j]], files[j])
} else {
return(alt_msas)
}
}
## Run batch alignments
#----------------------------------------------
pb <- txtProgressBar(max = ncol(align_parts), style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
cl <- makeCluster(ncore)
registerDoSNOW(cl)
mafft_method <- ifelse(msa.program == "mafft", paste0(", method =", "'", method, "'"), "")
intfile <- ifelse(int_file, ", file = msa_out[i]", "")
FUN <- function(i) {
paste0(
"align_part_set2(x = sequences, partition_set = align_parts[,",
i,
"], coopt.sub = n.coopt.sub[",
i,
"]",
mafft_method,
", msa.exec = msa.exec, msa.program = msa.program, files = msa_out[stend[",
i,
", 1]:stend[",
i,
", 2]]",
", int_file = ",
int_file,
")"
)
}
alt.msa <- foreach(i = 1:ncol(align_parts),
.options.snow = opts,
.packages = c('ips', 'adephylo', 'foreach', 'phangorn'),
.export = c("align_part_set2", "sequences", "align_parts", "n.coopt.sub",
"method", "msa.exec", "msa.program", "msa_out", "stend")
) %dopar% {
eval(parse(text = FUN(i)))
}
stopCluster(cl)
close(pb)
#### unlist nested list
if (!int_file){
alt.msa <- foreach(i = 1:length(alt.msa), .combine = c) %do% {
alt.msa[[1]]
}
}
##############################################
## PART III
##############################################
## Computation of reliability scores
##############################################
cat("Calculating reliability scores \n")
## HoT Score
#----------------------------------------------
## Create temporary dir for alternative MSAs
if (int_file){
dir.create(paste(tempdir(), "alt", sep = "/"))
files_from <- list.files(tempdir(), full.names = TRUE)
files_from <- files_from[grep("\\.fas", files_from)]
files_to <- list.files(tempdir())
files_to <- files_to[grep("\\.fas", files_to)]
files_to <- paste(tempdir(), "alt", files_to, sep = "/")
## Move files into new dir
file.rename(files_from, files_to)
rm(alt.msa)
alt.msa <- paste(tempdir(), "alt", sep = "/")
}
## Run msa_set_score
# switch(score_method,
# "Rcpp" = {
# ## Rcpp functions are called
score <- msa_set_scoreR(ref = base.msa,
alt = alt.msa)
# },
# "SA" = {
# ## msa_set_score from GUIDANCE package is called
# score <- msa_set_scoreSA(ref = base.msa,
# alt = alt.msa,
# bootstrap = n.coopt, exec = msa_set_score.exec)
# }
# )
## if wanted, store alternative MSAs into a zip file
if (!missing(store_msas)){
files <- list.files(tempdir())
files <- files[grep("HoT", files)]
for (i in 1:(n.coopt * bootstrap)){
file.rename(paste(tempdir(), files[i], sep = "/"),
paste(tempdir(), paste0("altMSA", i, ".fas"), sep = "/"))}
files <- list.files(tempdir(), full.names = TRUE)
files <- files[grep("altMSA*", files)]
if (missing(zip.file)) zip.file <- paste0("guidance_altMSA_", Sys.Date())
zip(zipfile = zip.file, files = files)
# maybe better to use gzfile
## currently zip creates many wired subfolders
}
## Delete temporary files
# this deletion approach has proven best to delete everything
files <- list.files(tempdir(), full.names = TRUE)
files <- files[-grep("rs-graphics", files)]
unlink(files, force = TRUE, recursive = TRUE)
## Prepare and return output
## -------------------------
# if(score_method=="SA"){
# score <- score$residue_pair_score
# }
if (inherits(sequences, "AAbin")){
guidanceAA(base.msa, score, "HoT")
} else {
guidanceDNA(base.msa, score, "HoT")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.