#' Heads or Tails alignment reliability
#'
#' @description MSA reliability assessment HoT (Landan and Graur 2008)
#'
#' @param sequences An object of class \code{\link{DNAbin}} or \code{\link{AAbin}}
#' containing unaligned sequences of DNA or amino acids.
#' @param cutoff specifies a cutoff to remove unreliable columns below the cutoff
#' @param col.cutoff numberic between 0 and 1; specifies a cutoff to remove unreliable columns below the cutoff; either user supplied or "auto" (0.73)
#' @param seq.cutoff numberic between 0 and 1; specifies a cutoff to remove unreliable sequences below the cutoff; either user supplied of "auto" (0.5)
#'@param mask.cutoff specific residues below a certain cutoff are masked ('N' for DNA, 'X' for AA); either user supplied of "auto" (0.5)
#' @param parallel logical, if TRUE, specify the number of cores
#' @param ncore number of cores (default is 'auto')
#' @param msa.program A charcter string giving the name of the MSA program,
#' one of c("mafft", "muscle", "clustalo", "clustalw2"); MAFFT is default
#' @param exec A character string giving the path to the executable of the
#' alignment program.
#' @param method further arguments passed to MAFFT, default is "auto"
#'
#' @return list containing following scores and alignments:
#' @return mean_scores residue pair score and mean column score
#' @return column_score
#' @return residue_column_score column reliability
#' @return residue_pair_residue_score
#' @return residual_pair_sequence_pair_score
#' @return residual_pair_sequence_score
#' @return residue_pair_score
#' @return base_msa
#' @return HoT_msa is the base_MSA removed from unreliable residues/columns/sequences below cutoffs
#'
#' @details Calculates column reliability (and other scors) by comparing alternative
#' MSAs generated by aligning guide tree partitions as described in Landan and Graur (2008). For details see \code{compareMSAs}. 8*(N-3) alternative MSAs are generated by default, where N is the number of sequences.
#' @references G. Landan and D. Graur (2008). Local reliability measures from sets of co-optimal multiple sequencesuence alignments. 13:15--24
#'
#' @import ips
#' @import doSNOW
#' @import foreach
#' @import parallel
#' @import pbmcapply
#' @import plyr
#' @importFrom phangorn as.phyDat dist.ml
#' @import adephylo
#' @importFrom phytools plotTree
#'
#' @seealso \code{\link{compareMSAs}}, \code{\link{guidance}}, \code{\link{guidance2}}
#'
#' @author Franz-Sebastian Krah
#' @author Christoph Heibl
#' @export
HoT <- function(sequences,
msa.program = "mafft", exec,
type,
n.coopt ="auto",
col.cutoff = "auto",
seq.cutoff = "auto",
mask.cutoff = "auto",
parallel = FALSE, ncore = "auto",
method = "auto",
plot_guide = TRUE,
alt.msas.file) {
##############################################
## SOME CHECKS
##############################################
if(!is.object(sequences)){
read.fas(sequences, type = type)
}
if (!inherits(sequences, c("DNAbin","AAbin")))
stop("sequencesuences not of class DNAbin or AAbin (ape)")
if(length(sequences)>200)
message("N seq > 200: consider using 'pasta' with desired MSA confidence program")
## Check for MSA program
if(missing(exec)){
os <- Sys.info()[1]
if (msa.program =="mafft") {
exec <- switch(os, Linux = "mafft", Darwin = "mafft",
Windows = "mafft.bat")
}
if (msa.program =="muscle") {
exec <- switch(os, Linux = "muscle", Darwin = "muscle",
Windows = "muscle3.8.31_i86win32.exe")
}
if (msa.program =="clustalo") {
exec <- switch(os, Linux = "clustalo", Darwin = "clustalo",
Windows = "clustalo.exe")
}
if (msa.program =="clustalw2") {
exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
Windows = "clustalw2.exe")
}
out <- system(paste(exec, "--v", sep=" "), ignore.stdout = TRUE, ignore.stderr = TRUE)
if (out == 127)
stop("please provide exec path or install MSA program in root \n
i.e. in Unix: '/usr/local/bin/mafft'")
}
## generate some parameters if not specified
#---------------------------------------------
## number of cores
if(ncore=="auto"){
ncore <- detectCores(all.tests = FALSE, logical = TRUE)
}
## Sequence type, needed for ips::read.fas
type <- class(sequences)
type <- gsub("bin", "", type)
##############################################
## PART I
##############################################
## BASE and ALTERNATIVE MSAs
##############################################
cat("Generating the base alignment")
if (msa.program == "mafft"){
base.msa <- mafft(sequences, method = method, exec = exec)
}
if (msa.program == "muscle"){
base.msa <- muscle2(sequences, exec = exec)
}
if (msa.program == "clustalo"){
base.msa <- clustalo(x = sequences, exec = exec)
}
if (msa.program == "clustalw2"){
base.msa <- clustalw2(x = sequences, exec = exec)
}
cat("... done \n")
## Calculate start guide tree
#----------------------------------------------
cat("Calculate start tree")
base.msa.ml <- as.phyDat(base.msa)
# find ML distance as input to nj tree search
ml.dist.msa <- dist.ml(base.msa.ml)
# NJ
start_tree <- ape::nj(ml.dist.msa)
start_tree <- multi2di(start_tree)
start_tree <- compute.brlen(start_tree)
# plot guide tree
if(plot_guide){
phytools::plotTree(start_tree)
legend("bottomleft",
paste(Ntip(start_tree)," tips","; ",
Ntip(start_tree)-3, " partitions", sep =""),
bty = "n")
}
cat("... done \n")
## produce MSA partitions
align_parts <- partitions(start_tree)
# here could be a sampling of co-opts like in
# guidance2. now we sample all
n.coopt.sub <- rep("all", ncol(align_parts))
n.coopt <- (Ntip(start_tree)-3)*8
##############################################
## PART II
##############################################
## Co-optimal MSAs
##############################################
cat(paste("Sampling", n.coopt, "co-optimal alignments \n", sep=" "))
## Create temporary files
#----------------------------------------------
msa_out <- vector(length = n.coopt)
for (i in seq_along(msa_out))
msa_out[i] <- tempfile(pattern = "HoT", tmpdir = tempdir(), fileext = ".fas")
unlink(msa_out[file.exists(msa_out)])
# predifined file storage allocation (because it runs in batches of 8)
start <- seq(1,n.coopt,8)
end <- seq(8,n.coopt,8)
stend <- data.frame(start, end)
## Function implementing batch alignment and storage in batches
align_part_set2 <- function(x, partition_set, method,
exec, msa.program, coopt.sub, files){
alt_msas <- align_part_set(x = x,
partition_set = partition_set,
method = method, exec = exec, msa.program = msa.program,
coopt.sub = coopt.sub)
for(j in 1:8)
write.fas(alt_msas[[j]], files[j])
}
## Run batch alignments
#----------------------------------------------
pb <- txtProgressBar(max = ncol(align_parts), style = 3)
if (parallel){
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
cl <- makeCluster(ncore)
registerDoSNOW(cl)
foreach(i = 1:ncol(align_parts), .options.snow = opts,
.packages = c('rpg', 'ips', 'adephylo', 'foreach', 'phangorn')) %dopar% {
# setTxtProgressBar(pb, i)
align_part_set2(x = sequences, partition_set = align_parts[,i],
coopt.sub = n.coopt.sub[i],
method = method, exec = exec, msa.program = msa.program,
files = msa_out[stend[i,1]:stend[i,2]])
}
stopCluster(cl)
}
if (!parallel){
foreach(i = 1:ncol(align_parts)) %do% {
setTxtProgressBar(pb, i)
align_part_set2(x = sequences, partition_set = align_parts[,i],
coopt.sub = n.coopt.sub,
method = method, exec = exec, msa.program = msa.program,
files = msa_out[stend[i,1]:stend[i,2]])
}
}
close(pb)
##############################################
## PART III
##############################################
## Computation of reliability scores
##############################################
cat("Calculating reliability scores \n")
## HoT Score
#----------------------------------------------
## Create temporary dir for alternative MSAs
dir.create(paste(tempdir(), "alt", sep="/"))
files_from <- list.files(tempdir(), full.names = TRUE)
files_from <- files_from[grep("\\.fas", files_from)]
files_to <- list.files(tempdir())
files_to <- files_to[grep("\\.fas", files_to)]
files_to <- paste(tempdir(), "alt", files_to, sep="/")
## Move files into new dir
file.rename(files_from, files_to)
## Run msa_set_score
#---------------------
scores <- compareMSAs(ref = base.msa,
dir_path = paste(tempdir(), "alt", sep ="/"),
com = NULL)
## if wanted, store alternative MSAs into a zip file
if(!missing(alt.msas.file)){
files <- list.files(tempdir())
files <- files[grep("HoT", files)]
for(i in 1:(n.coopt*bootstrap)){
file.rename(paste(tempdir(), files[i], sep="/"),
paste(tempdir(), paste("altMSA", i, ".fas",sep=""), sep="/"))}
files <- list.files(tempdir(), full.names = TRUE)
files <- files[grep("altMSA*", files)]
zip(zipfile = alt.msas.file, files = files)
# maybe better to use gzfile
## currently zip creates many wired subfolders
}
## Delete temporary files
# this deletion approach has proven best to delete everything
files <- list.files(tempdir(), full.names = TRUE)
files <- files[-grep("rs-graphics", files)]
unlink(files, force = TRUE, recursive = TRUE)
## Reduce MSA according to cutoffs, it wanted
#---------------------------------------------
msa <- HoT.msa <- base.msa
msa <- as.character(msa)
## masking residues below cutoff
if (mask.cutoff>0){
txt <- as.vector(as.character(base.msa))
mat <- data.frame(scores$residue_pair_residue_score, txt)
rown <- max(mat$residue)
coln <- max(mat$col)
res_mat <- matrix(mat$score, nrow = rown, ncol = coln)
if (mask.cutoff=="auto"){ mask.cutoff <- 0.50 }
if (inherits(sequences, "DNAbin")){
msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "N"
msa <- as.DNAbin(msa)
}
if (inherits(sequences, "AAbin")) {
msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "X"
rownames(msa) <- labels(sequences)
class(msa) <- "AAbin"
}
HoT.msa <- msa
}
## remove unreliable columns
if (col.cutoff>0){
if (mask.cutoff>0) { msa <- HoT.msa
} else { msa <- base.msa }
if(col.cutoff =="auto"){col.cutoff <- 0.97}
remove_cols <- scores$column_score$CS < col.cutoff
HoT.msa <- msa[,!remove_cols]
}
## remove unreliable sequences
if (seq.cutoff>0){
if (mask.cutoff>0) { msa <- HoT.msa
} else { msa <- base.msa }
if (seq.cutoff =="auto"){seq.cutoff <- 0.5}
remove_sequences <- scores$residual_pair_sequence_score$score < seq.cutoff
HoT.msa <- msa[!remove_sequences,]
}
# ## prepare base.msa for output
# if(inherits(sequences, "DNAbin") & !inherits(base.msa, "DNAbin"))
# base.msa <- as.DNAbin(base.msa)
# if(inherits(sequences, "AAbin") & !inherits(base.msa, "AAbin"))
# base.msa <- as.AAbin(base.msa)
## Generate output
res <- list(scores = scores,
HoT_msa = HoT.msa,
base_msa = base.msa)
## Return output
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.