#' #' GUIDetree-based AligNment ConficencE
#' #'
#' #' @param sequencess An object of class \code{\link{DNAbin}} or \code{\link{AAbin}}
#' #' containing unaligned sequences of DNA or amino acids.
#' #' @param parallel logical, if TRUE, specify the number of cores
#' #' @param ncore number of cores (default is maxinum of local threads)
#' #' @param bootstrap An integer giving the number of perturbated MSAs.
#' #' @param msa.program A charcter string giving the name of the MSA program,
#' #' currelty one of c("mafft", "muscle", "clustalo", "clustalw2"); MAFFT is default
#' #' @param exec A character string giving the path to the executable of the
#' #' alignment program.
#' #' @param method further arguments passed to mafft, default is "auto"
#' #' @param col.cutoff numberic between 0 and 1; specifies a cutoff to remove unreliable columns below the cutoff; either user supplied or "auto" (0.73)
#' #' @param seq.cutoff numberic between 0 and 1; specifies a cutoff to remove unreliable sequences below the cutoff; either user supplied of "auto" (0.5)
#' #' @param mask.cutoff specific residues below a certain cutoff are masked ('N' for DNA, 'X' for AA); either user supplied of "auto" (0.5)
#' #' @description Assess uncertainty in multiple sequence aligment (MSA) with the
#' #' GUIDANCE algorithm (Penn et al. 2010).
#' #' @return alignment_score
#' #' @return GUIDANCE_residue_score
#' #' @return GUIDANCE_score: is the GUIDANCE column score
#' #' @return GUIDANCE_sequence_score
#' #' @return guidance_msa: is the base MSA removed from unreliable residues/columns/sequences below \code{cutoffs}
#' #' @return base_msa
#' #'
#' #' @details Calculates column confidence by comparing alternative MSAS generated by bootstrap MSAs (Felsenstein 1985) and derived guide trees. The basic comparison between the BP MSAs and a reference MSA is if column residue pairs are identical in both MSAs.
#' #' @author Franz-Sebastian Krah
#' #' @references Felsenstein J. 1985. Confidence limits on phylogenies: an
#' #' approach using the bootstrap. Evolution 39:783–791
#' #' @references Penn et al. (2010). An alignment confidence score capturing
#' #' robustness to guide tree uncertainty. Molecular Biology and Evolution
#' #' 27:1759--1767
#' #'
#' #' @import ips
#' #' @import doSNOW
#' #' @import foreach
#' #' @import parallel
#' #' @import pbmcapply
#' #' @import plyr
#' #' @importFrom phangorn as.phyDat dist.ml
#' #'
#' #' @author Franz-Sebastian Krah
#' #' @author Christoph Heibl
#' #' @export
#'
#' guidance <- function(sequences,
#' msa.program = "mafft", exec,
#' bootstrap = 100,
#' col.cutoff = "auto",
#' seq.cutoff = "auto",
#' mask.cutoff = "auto",
#' parallel = FALSE, ncore ="auto",
#' method = "auto",
#' alt.msas.file){
#'
#' ##############################################
#' ## SOME CHECKS
#' ##############################################
#' if (!inherits(sequences, c("DNAbin", "AAbin")))
#' stop("sequences not of class DNAbin or AAbin (ape)")
#'
#'
#' if (length(labels(sequences)) < 8)
#' warning("GUIDANCE is not suitable for alignments of very few sequences.\n
#' As a rule of thumb, use GUIDANCE2 or HoT for less than 8 sequences.")
#'
#' ## Check for MSA program
#' if(missing(exec)){
#' os <- Sys.info()[1]
#' if (msa.program =="mafft") {
#' exec <- switch(os, Linux = "mafft", Darwin = "mafft",
#' Windows = "mafft.bat")
#' }
#' if (msa.program =="muscle") {
#' exec <- switch(os, Linux = "muscle", Darwin = "muscle",
#' Windows = "muscle3.8.31_i86win32.exe")
#' }
#' if (msa.program =="clustalo") {
#' exec <- switch(os, Linux = "clustalo", Darwin = "clustalo",
#' Windows = "clustalo.exe")
#' }
#' if (msa.program =="clustalw2") {
#' exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
#' Windows = "clustalw2.exe")
#' }
#' }
#' out <- system(paste(exec, "--v", sep=" "), ignore.stdout = TRUE, ignore.stderr = TRUE)
#' if (out == 127)
#' stop("please provide exec path or install MSA program in root \n
#' i.e. in Unix: '/usr/local/bin/mafft'")
#'
#'
#' type <- class(sequences)
#' type <- gsub("bin", "", type)
#'
#' if(ncore=="auto"){
#' ncore <- detectCores(all.tests = FALSE, logical = TRUE)
#' }
#'
#' ##############################################
#' ## PART I
#' ##############################################
#' ## BASE and PERTUBATED MSAs
#' ##############################################
#'
#' ## Generate BASE alignment
#' ###########################
#' cat("Generating the base alignment")
#' if (msa.program == "mafft"){
#' base.msa <- mafft(x = sequences, method = method, exec = exec)
#' }
#' if (msa.program == "muscle"){
#' base.msa <- muscle2(x = sequences, exec = exec)
#' }
#' if (msa.program == "clustalo"){
#' base.msa <- clustalo(x = sequences, exec = exec)
#' }
#' if (msa.program == "clustalw2"){
#' base.msa <- clustalw2(x = sequences, exec = exec)
#' }
#'
#' cat("... done \n")
#'
#' ## form into matrix for perturbation
#' # base.msa <- as.character(base.msa)
#'
#' ## Constructing BP guide-trees for the pertubated MSAs
#' #######################################################
#' cat("Generating NJ guide trees \n")
#' ## Compute NJ guide trees
#'
#' if(semphy){
#' #### SEMPHY ###
#' if(nrow(base.msa)>150){
#' if(tpye == "DNA")
#' model <- "--nucjc"
#' if(type == "AA")
#' model <- "--aaJC"
#' }else{
#' if(tpye == "DNA")
#' model <- "--hky"
#' if(type == "AA")
#' model <- "--jtt"
#' }
#' nj.guide.trees <- semphy(msa = base.msa,
#' dist.table<- "-J",
#' model <- model,
#' arsv <- "-H",
#' bootstrap = bootstrap,
#' verbose = verbose)
#'
#' # lapply(nj.guide.trees, root, outgroup = )
#' nj.guide.trees <- lapply(nj.guide.trees, compute.brlen)
#' }
#'
#' Rnj = TRUE
#' if(Rnj){
#' if (parallel){
#' progress <- function(n) setTxtProgressBar(pb, n)
#' opts <- list(progress = progress)
#'
#' cl <- makeCluster(ncore)
#' registerDoSNOW(cl)
#'
#' nj.guide.trees <- foreach(i = 1:bootstrap,
#' .options.snow = opts,
#' .packages = "phangorn", .export = 'msaBP_nj_tree') %dopar% {
#' msaBP_nj_tree(base.msa, outgroup = "auto")
#' }
#' stopCluster(cl)
#' }
#' if (!parallel){
#' nj.guide.trees <- foreach(i = 1:bootstrap, .packages = "phangorn") %do%{
#' setTxtProgressBar(pb, i)
#' msaBP_nj_tree(base.msa, outgroup = "auto")
#' }
#' }
#' close(pb)
#' }
#'
#'
#'
#' ## Alignment of MSA BP times with new NJ guide trees
#' ## -------------------------------------------------
#' cat("Alignment of sequences using NJ guide trees \n")
#'
#' msa_out <- vector(length = bootstrap)
#' for (i in seq_along(msa_out))
#' msa_out[i] <- tempfile(pattern = "mafft", tmpdir = tempdir(), fileext = ".fas")
#' unlink(msa_out[file.exists(msa_out)])
#'
#' pb <- txtProgressBar(max = bootstrap, style = 3)
#'
#' if (parallel){
#' progress <- function(n) setTxtProgressBar(pb, n)
#' opts <- list(progress = progress)
#' cl <- makeCluster(ncore)
#' registerDoSNOW(cl)
#'
#' if (msa.program == "mafft"){
#' foreach(i = 1:bootstrap, .packages=c('ips', 'ape'),
#' .options.snow = opts) %dopar% {
#' mafft2(x = sequences, gt = nj.guide.trees[[i]],
#' exec = exec, file = msa_out[i], method = method)
#' }
#' }
#' if (msa.program == "muscle"){
#' foreach(i = 1:bootstrap, .packages=c('ips', 'ape'),
#' .options.snow = opts) %dopar% {
#' muscle2(x = sequences, gt = nj.guide.trees[[i]],
#' exec = exec, file = msa_out[i])
#' }
#' }
#' if (msa.program == "clustalo"){
#' foreach(i = 1:bootstrap, .packages =c('ips', 'ape'),
#' .options.snow = opts) %dopar% {
#' clustalo(x = sequences, gt = nj.guide.trees[[i]],
#' exec = exec, file = msa_out[i])
#'
#' }
#' }
#' if (msa.program == "clustalw2"){
#' foreach(i = 1:bootstrap, .packages=c('ips', 'ape'),
#' .options.snow = opts) %dopar% {
#' clustalw2(x = sequences, gt = nj.guide.trees[[i]],
#' exec = exec, file = msa_out[i])
#' }
#' }
#' stopCluster(cl)
#' }
#'
#' if (!parallel){
#' if (msa.program == "mafft"){
#' foreach(i = 1:bootstrap, .packages=c('ips', 'ape')) %do% {
#' setTxtProgressBar(pb, i)
#' mafft2(x = sequences, gt = nj.guide.trees[[i]],
#' method = method, exec = exec, file = msa_out[i])
#' }
#' }
#' if (msa.program == "muscle"){
#' foreach(i = 1:bootstrap, .packages=c('ips', 'ape')) %do% {
#' setTxtProgressBar(pb, i)
#' muscle2(x = sequences, gt = nj.guide.trees[[i]],
#' exec = exec, file = msa_out[i])
#' }
#' }
#' if (msa.program == "clustalo"){
#' foreach(i = 1:bootstrap, .packages =c('ips', 'ape')) %do% {
#' setTxtProgressBar(pb, i)
#' clustalo(x = sequences, gt = nj.guide.trees[[i]],
#' exec = exec, file = msa_out[i])
#' }
#' }
#' if (msa.program == "clustalw2"){
#' foreach(i = 1:bootstrap, .packages=c('ips', 'ape')) %do% {
#' setTxtProgressBar(pb, i)
#' clustalw2(x = sequences, gt = nj.guide.trees[[i]],
#' exec = exec, file = msa_out[i])
#' }
#' }
#' }
#' close(pb)
#'
#' mafft_created <- list.files(getwd(),
#' full.names = T)[grep("tree.mafft", list.files(getwd()))]
#' if(length(mafft_created)>0){
#' file.remove(mafft_created)
#' }
#'
#' # removing some intermediate objects not further needed
#' # rm(base.msa.bp)
#' # rm(nj.guide.trees)
#'
#' ##############################################
#' ## PART II
#' ##############################################
#' ## Computation of GUIDANCE scores
#' ##############################################
#' cat("Calculating GUIDANCE scores \n")
#'
#' # ## GUIDANCE Score
#' # ##############################################
#'
#' ## Transform MSAs for input to *msa_set_score*
#' # base.msa.t <- data.frame(t(as.character(base.msa)))
#'
#'
#' # compareMSAs2 <- function(ref, com = msa_out[i], type =type){
#' # guide.msa <- read.fas(com, type = type)
#' # guide.msa <- guide.msa[match(rownames(ref), rownames(guide.msa)),]
#' # compareMSAs(ref = ref, com = guide.msa)
#' # }
#' #
#' # ## Run scores program msa_set_score
#' # system.time(
#' # if (parallel){
#' # pb <- txtProgressBar(max = bootstrap, style = 3)
#' # progress <- function(n) setTxtProgressBar(pb, n)
#' # opts <- list(progress = progress)
#' #
#' # cl <- makeCluster(ncore)
#' # registerDoSNOW(cl)
#' # altres <- foreach(i = 1:bootstrap, .options.snow = opts,
#' # .export = 'compareMSAs', .packages = 'ips') %dopar% {
#' # compareMSAs2(ref = base.msa, com = list.files( paste(tempdir(), "alt", sep="/"), full.names = TRUE)[i], type =type)
#' # }
#' # stopCluster(cl)
#' # }
#' # )
#'
#' # if (!parallel){
#' # altres <- foreach(i = 1:bootstrap,
#' # .export = 'compareMSAs') %do% {
#' # setTxtProgressBar(pb, i)
#' # compareMSAs2(ref = base.msa, com = msa_out[i])
#' # }
#' # }
#' # close(pb)
#'
#' # running it as list does not require loading the intermediate results (read.table)
#' # especially RPRSC is very time consuming.
#' # so running msa_set_score with -d is appr. 6 times faster
#' dir.create(paste(tempdir(), "alt", sep="/"))
#' files_from <- list.files(tempdir(), full.names = TRUE)
#' files_from <- files_from[grep("\\.fas", files_from)]
#' files_to <- list.files(tempdir())
#' files_to <- files_to[grep("\\.fas", files_to)]
#' files_to <- paste(tempdir(), "alt", files_to, sep="/")
#'
#' file.rename(files_from, files_to)
#'
#'
#' scores <- compareMSAs(ref = base.msa, dir_path = paste(tempdir(), "alt", sep ="/"), com = NULL)
#'
#'
#'
#' # ### Calculate mean scores
#' # #------------------------------
#' # # Mean score
#' # msc <- do.call(rbind, lapply(altres, function(x) x[[1]]))
#' # msc[] <- lapply(msc, as.numeric)
#' # msc <- colMeans(msc)
#' # msc <- data.frame(msc)
#' #
#' # ## Column score
#' # CS <- do.call(cbind, lapply(altres, function(x) x[[2]]))
#' # del <- grep("col", names(CS))
#' # CS <- CS[, -del[2:length(del)]]
#' # CS <- data.frame(col = CS[, 1],
#' # column_score = rowMeans(CS[,2:ncol(CS)], na.rm = TRUE))
#' #
#' # ## Residue pair column score (GUIDANCE Score)
#' # g.cs <- do.call(cbind, lapply(altres, function(x) x[[3]]))
#' # del <- grep("col\\b", names(g.cs))
#' # g.cs <- g.cs[, -del[2:length(del)]]
#' # g.cs <- data.frame(col = g.cs[, 1],
#' # res_pair_col_score = rowMeans(g.cs[,2:ncol(g.cs)], na.rm = TRUE))
#' #
#' # ## GUIDANCE Alignment score
#' # alignment_score <- mean(g.cs[, 2])
#' # msc <- rbind(msc, MEAN_GUIDANCE_SCORE = alignment_score)
#' #
#' # # Residue pair residue score
#' # rpr.sc <- do.call(cbind, lapply(altres, function(x) x[[4]]))
#' # del <- grep("col\\b|residue", names(rpr.sc))
#' # rpr.sc <- rpr.sc[, -del[3:length(del)]]
#' # rpr.sc <- data.frame(col = rpr.sc[, 1], residue = rpr.sc[, 2],
#' # res_pair_res_score = rowMeans(rpr.sc[,3:ncol(rpr.sc)], na.rm = TRUE))
#' #
#' # # Residual pair sequence pair score
#' # rpsp.sc <- do.call(cbind, lapply(altres, function(x) x[[5]]))
#' # del <- grep("seq_row1|seq_row2", names(rpsp.sc))
#' # rpsp.sc <- rpsp.sc[, -del[3:length(del)]]
#' # rpsp.sc <- data.frame(seq1 = rpsp.sc[, 1], seq2 = rpsp.sc[, 2],
#' # res_pair_seq_pair_score = rowMeans(rpsp.sc[,3:ncol(rpsp.sc)], na.rm = TRUE))
#' #
#' # # Residual pair sequence score
#' # rps.sc <- do.call(cbind, lapply(altres, function(x) x[[6]]))
#' # del <- grep("seq", names(rps.sc))
#' # rps.sc <- rps.sc[, -del[2:length(del)]]
#' # rps.sc <- data.frame(seq = rps.sc[, 1],
#' # res_pair_seq_score = rowMeans(rps.sc[,2:ncol(rps.sc)], na.rm = TRUE))
#' #
#' # # Residue pair score
#' # rp.sc <- do.call(cbind, lapply(altres, function(x) x[[7]]))
#' # del <- grep("col|row1|row2", names(rp.sc))
#' # rp.sc <- rp.sc[, -del[4:length(del)]]
#' # rp.sc <- data.frame(col = rp.sc[, 1], row1 = rp.sc[,2], col2 = rp.sc[,3],
#' # res_pair_score = rowMeans(rp.sc[,4:ncol(rp.sc)], na.rm = TRUE))
#' ### Calculate mean scores DONE
#'
#' ## if wanted, store alternative MSAs into a zip file
#' if(!missing(alt.msas.file)){
#' files <- list.files(paste(tempdir(), "alt", sep="/"))
#' files <- files[grep("HoT", files)]
#' for(i in 1:(n.coopt*bootstrap)){
#' file.rename(paste(tempdir(), files[i], sep="/"),
#' paste(tempdir(), paste("altMSA", i, ".fas",sep=""), sep="/"))}
#' files <- list.files(tempdir(), full.names = TRUE)
#' files <- files[grep("altMSA*", files)]
#' zip(zipfile = alt.msas.file, files = files)
#'
#' # maybe better to use gzfile
#' ## currently zip creates many wired subfolders
#' }
#'
#' ## delete temporary files in temporary directory
#' unlink(msa_out[file.exists(msa_out)], force = TRUE, recursive = TRUE)
#' unlink(list.files(paste(tempdir(),"alt", sep="/"), full.names = TRUE), force = TRUE, recursive = TRUE)
#' files <- list.files(tempdir(), full.names = TRUE)
#' files <- files[-grep("rs-graphics", files)]
#' unlink(files, force = TRUE, recursive = TRUE)
#'
#' msa <- guidance.msa <- base.msa
#' msa <- as.character(msa)
#' ## masking residues below cutoff
#' if (mask.cutoff>0){
#' txt <- as.vector(as.character(base.msa))
#' # mat <- data.frame(rpr.sc, txt)
#' mat <- data.frame(scores$residue_pair_residue_score, txt)
#' rown <- max(mat$residue)
#' coln <- max(mat$col)
#' res_mat <- matrix(mat$score, nrow = rown, ncol = coln)
#'
#' if (mask.cutoff=="auto"){ mask.cutoff <- 0.50 }
#'
#' if (inherits(sequences, "DNAbin")){
#' msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "N"
#' msa <- as.DNAbin(msa)
#' }
#' if (inherits(sequences, "AAbin")) {
#' msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "X"
#' rownames(msa) <- labels(sequences)
#' class(msa) <- "AAbin"
#' }
#' guidance.msa <- msa
#' }
#' ## remove unreliable columns
#' if (col.cutoff>0){
#' if (mask.cutoff>0) { msa <- guidance.msa
#' } else { msa <- base.msa }
#' if(col.cutoff =="auto"){col.cutoff <- 0.97}
#' # remove_cols <- g.cs$res_pair_col_score < col.cutoff
#' remove_cols <- scores$column_score$CS < col.cutoff
#' guidance.msa <- msa[,!remove_cols]
#' }
#' ## remove unreliable sequences
#' if (seq.cutoff>0){
#' if (mask.cutoff>0) { msa <- guidance.msa
#' } else { msa <- base.msa }
#' if (seq.cutoff =="auto"){seq.cutoff <- 0.5}
#' # remove_sequences <- rps.sc$res_pair_seq_score < seq.cutoff
#' remove_sequences <- scores$residual_pair_sequence_score$score < seq.cutoff
#' guidance.msa <- msa[!remove_sequences,]
#' }
#'
#' ## prepare base.msa for output
#' if(inherits(sequences, "DNAbin") & !inherits(base.msa, "DNAbin"))
#' base.msa <- as.DNAbin(base.msa)
#' if(inherits(sequences, "AAbin") & !inherits(base.msa, "AAbin"))
#' base.msa <- as.AAbin(base.msa)
#'
#' ## Produce output
#' # res <- list(mean_score = msc,
#' # column_score = CS,
#' # GUIDANCE_column_score = g.cs,
#' # residue_pair_residue_score = rpr.sc,
#' # residual_pair_sequence_pair_score = rpsp.sc,
#' # residual_pair_sequence_score = rps.sc,
#' # residue_pair_score = rp.sc,
#' # GUIDANCE_msa = guidance.msa,
#' # base_msa = base.msa)
#'
#' res <- list(scores = scores,
#' GUIDANCE_msa = guidance.msa,
#' base_msa = base.msa)
#' return(res)
#' }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.