polenta: Multiple Sequence Alignment with PASTA and GUIDANCE

Documented in guidance2

## This code is part of the polenta package
## © F.-S. Krah and C. Heibl (last update 2018-01-24)

#' @title MSA Reliability Assessment with GUIDANCE2
#' @description Calculate MSA reliability scores with GUIDANCE2 (Sela et al.
#'   2015).
#' @param sequences An object of class \code{\link{DNAbin}} or
#'   \code{\link{AAbin}} containing unaligned sequences of DNA or amino acids.
#' @param bootstrap An integer giving the number of alternative MSAs to be
#'   computed.
#' @param method A character string containing further arguments passed to
#'   MAFFT; default is \code{"auto"}.
#' @param n.coopt An integer giving the number of sampled co-optimal MSAa.
#' @param n.part XXX.
#' @param msa.exec A character string giving the path to the executable of the
#'   alignment program (e.g. \code{/usr/local/bin/mafft}); possible programs are
#'   \code{MAFFT}, \code{MUSCLE}, \code{ClustalO}, and \code{ClustalW}.
#' @param ncore An integer specifying the number of cores; default = 1 (i.e.
#'   serial execution); \code{"auto"} can be used for automated usage of all
#'   detected cores.
#' @param zip.file A character string giving the name of zip-compressed file,
#'   which contains the alternative MSAs. If left empty (default), the
#'   alternative MSA will not be stored and cannot be assessed by the user.
#' @return An object of class \code{\linkS4class{polentaDNA}} or
#'   \code{\linkS4class{polentaAA}}.
#' @details Calculates column confidence (and other scors) by comparing
#'   alternative MSAs generated by the GUIDANCE with varying gap opening panelty
#'   and the HoT methodology. First 100 alternative MSAs (with BP guide trees)
#'   with varying gap opening panelty are produced, then for each n (default =
#'   4) co-optimal alignments are produced using HoT. The basic comparison
#'   between the BP MSAs and a reference MSA is if column residue pairs are
#'   identically aligned in all alternative MSAs compared with the base MSA (see
#'   \code{compareMSAs}).
#'
#' @references Felsenstein, J. 1985. Confidence limits on phylogenies: an
#'   approach using the bootstrap. \emph{Evolution} \strong{39}:783--791.
#' @references Landan, G. and D. Graur. 2008. Local reliability measures from
#'   sets of co-optimal multiple sequence alignments. \emph{Pacific Symposium on
#'   Biocomputing} \strong{13}:15--24.
#' @references Penn, O., E. Privman, G. Landan, D. Graur, and T. Pupko. 2010. An
#'   alignment confidence score capturing robustness to guide tree uncertainty.
#'   \emph{Molecular Biology and Evolution} \strong{27}:1759--1767.
#' @references Sela et al. 2015. GUIDANCE2: accurate detection of unreliable
#'   alignment regions accounting for the uncertainty of multiple parameters.
#'   \emph{Nucleic Acids Research} \strong{43}:W7--W14
#' @author Franz-Sebastian Krah
#' @seealso \code{\link{guidance}}, \code{\link{HoT}}
#' @importFrom ips mafft
#' @import doSNOW
#' @import foreach
#' @importFrom parallel makeCluster
#' @import pbmcapply
#' @import plyr
#' @importFrom phangorn as.phyDat dist.ml
#' @importFrom stringr str_extract
#' @export

guidance2 <- function(sequences,
                      bootstrap = 100,
                      method = "auto",
                      n.coopt = 4,
                      n.part = "auto",
                      msa.exec = "/usr/local/bin/mafft",
                      ncore = 1,
                      store_msas = FALSE,
                      zip.file){
  
  ##############################################
  ## SOME CHECKS
  ##############################################
  if (!inherits(sequences, c("DNAbin", "AAbin")))
    stop("sequences not of class DNAbin or AAbin (ape)")
  if (length(sequences) > 200)
    message("More than 200 sequences: consider using 'polenta'")
  
  ## Look up MSA program specified
  msa_program <- str_extract(msa.exec, "mafft|muscle|clustal\\w")
  
  ## Check for MSA program
  ## ---------------------
  if (missing(msa.exec)){
    os <- Sys.info()[1]
    if (msa_program == "mafft") {
      msa.exec <- switch(os, Linux = "mafft", Darwin = "mafft",
                         Windows = "mafft.bat")
    }
    if (msa_program == "muscle") {
      msa.exec <- switch(os, Linux = "muscle", Darwin = "muscle",
                         Windows = "muscle3.8.31_i86win32.exe")
    }
    if (msa_program == "clustalw2") {
      msa.exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
                         Windows = "clustalw2.exe")
    }
  }
  out <- system(paste(msa.exec, "--v"), ignore.stdout = TRUE, ignore.stderr = TRUE)
  if (out == 127)
    stop("please provide exec path or install MSA program in root \n
      i.e. in Unix: '/usr/local/bin/mafft'")
  
  ## Number of cores
  if (ncore == "auto") ncore <- detectCores(all.tests = FALSE, logical = TRUE)

  ## If more than 200 species intermediate results are processed via files
  int_file <- ifelse(length(sequences) > 200, TRUE, FALSE)
  
  ##############################################
  ## PART I
  ##############################################
  ## BASE and PERTUBATED MSAs
  ##############################################
  
  ## Generate base alignment
  ## -----------------------
  cat("Generating the base alignment\n")
  mafft_method <- ifelse(msa_program == "mafft", 
                         ", method = method, thread = ncore", "")
  base.msa <- paste0(msa_program, "(",
                     "x = sequences, exec = msa.exec",
                     mafft_method, ")")
  base.msa <- eval(parse(text = base.msa))
  
  ## Compute NJ guide trees
  ## ----------------------
  cat("Generating NJ guide trees\n")
  pb <- txtProgressBar(max = bootstrap, style = 3)
  progress <- function(n) setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  
  cl <- makeCluster(ncore)
  registerDoSNOW(cl)
  
  nj_guidetrees <- foreach(i = 1:bootstrap,
                           .options.snow = opts,
                           .packages = "phangorn", 
                           .export = 'msaBP_nj_tree') %dopar% {
                             msaBP_nj_tree(base.msa, outgroup = "auto")
                           }
  stopCluster(cl)
  close(pb)
  
  
  ## Alignment of MSA with NJ guide trees bases on pertubated base MSA
  ## -----------------------------------------------------------------
  cat("Alignment of alternative MSAs using NJ guide trees (GUIDANCE)\n")
  
  ## Prepare TEMP files for all outputs (GUIDANCE + HoT)
  if (int_file){
    msa_out <- vector(length = (bootstrap + (bootstrap*n.coopt)))
    for (i in 1:bootstrap)
      msa_out[i] <- tempfile(pattern = "mafft", tmpdir = tempdir(), fileext = ".fas")
    for (i in 1:(bootstrap * n.coopt))
      msa_out[i + bootstrap] <- tempfile(pattern = "HoT", tmpdir = tempdir(), fileext = ".fas")
    # unlink(msa_out[file.exists(msa_out)])
  }
  
  ## Align perturbated MSAs (GUIDANCE)
  pb <- txtProgressBar(max = bootstrap, style = 3)
  progress <- function(n) setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  cl <- makeCluster(ncore)
  registerDoSNOW(cl)
  
  intfile <- ifelse(int_file, ", file = msa_out[i]", "")
  
  switch(msa_program,
         
         mafft = {
           FUN <- function(i) {
             paste0("mafft(x = sequences, gt = nj_guidetrees[[", i, "]],
      exec = msa.exec, method = method,
      op = runif(1,0,5))", intfile)
           }},
         muscle = {
           FUN <- function(i) {
             paste0("muscle2(x = sequences, gt = nj_guidetrees[[,", i, ", ]],
        exec = msa.exec, MoreArgs = paste('-gapopen ', format(runif(1, -400, -10), digits = 0)",
                   intfile,")")
           }},
         clustalw2 = {
           FUN <- function(i) {
             paste0("clustalw2(x = sequences, gt = nj_guidetrees[[,", i, "]],
          exec = msa.exec, file = msa_out[i],
        MoreArgs = paste('-PAIRGAP=', format(runif(1, 1, 9), digits = 0), sep =')')",
                    intfile, ")")
           }})
  
  # FUN <- switch(msa_program,
  #        
  #        mafft = function(i) paste0("mafft(x = sequences, gt = nj_guidetrees[[", i, "]],
  #                   exec = msa.exec, method = method,
  #                   op = runif(1,0,5))", intfile),
  #        muscle = function(i) paste0("muscle2(x = sequences, gt = nj_guidetrees[[,", i, ", ]],
  #                   exec = msa.exec, MoreArgs = paste('-gapopen ', format(runif(1, -400, -10), digits = 0)",
  #                   intfile,")"),
  #        clustalw2 = function(i) paste0("clustalw2(x = sequences, gt = nj_guidetrees[[,", i, "]],
  #                   exec = msa.exec, file = msa_out[i],
  #                   MoreArgs = paste('-PAIRGAP=', format(runif(1, 1, 9), digits = 0), sep =')')",
  #                   intfile, ")"))
  
  msa_out <- foreach(i = 1:bootstrap, .packages = c('ips', 'ape', 'polenta'),
                     .export = c("sequences", "nj_guidetrees", "msa.exec", "method"),
                     .options.snow = opts)  %dopar% {
                       
                       eval(parse(text = FUN(i)))
                       
                     }
  stopCluster(cl)
  close(pb)
  
  mafft_created <- list.files(getwd(),
                              full.names = TRUE)[grep("tree.mafft", list.files(getwd()))]
  if (length(mafft_created)){
    file.remove(mafft_created)
  }
  
  
  
  ##############################################
  ## PART III
  ##############################################
  ## Generate co-optimal alignments with HoT
  ##############################################
  cat(paste("Sampling", n.coopt, "co-optimal MSAs (HoT) for each alternative MSA\n"))
  
  # predifined file allocation
  
  start <- seq(1, n.coopt * bootstrap, n.coopt)
  end <- seq(n.coopt, n.coopt * bootstrap, n.coopt)
  stend <- data.frame(start, end)
  stend <- stend + bootstrap
  
  ## run HoT
  ## -------
  progress <- function(n) setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  cl <- makeCluster(ncore)
  registerDoSNOW(cl)
  
  FUN <- function(i){
    paste0("Hot_GUIDANCE2(msa = msa_out",
          ifelse(int_file, "[", "[["), i, 
          ifelse(int_file, "]", "]]"),
          ", n.coopt = n.coopt, files = msa_out[stend[" , i, 
          ",1]:stend[" , i, ",2]], int_file = ", int_file, 
          ", raw_seq = sequences, msa.program = msa_program, ", 
          "method = method, msa.exec = msa.exec)")
  }
  
  alt.msa <- foreach(i = 1:bootstrap,
                     .options.snow = opts,
                     .packages = c('polenta', 'ips', 'adephylo', 'foreach', 'phangorn'),
                     .export = c("n.coopt", "sequences", "msa_program",
                                 "method", "msa.exec", "msa_out")) %dopar% {
                                   eval(parse(text = FUN(i)))
                                 }
  stopCluster(cl)
  close(pb)
  
  ## Unlist nested list
  if (!int_file){
    alt.msa <- foreach(i = 1:length(alt.msa), .combine = c) %do% {
      alt.msa[[1]]
    }
  }
  
  ##############################################
  ## PART IV
  ##############################################
  ## Computation of GUIDANCE 2 scores
  ##############################################
  cat("\nCalculating GUIDANCE2 scores\n")
  
  if (int_file){
    dir.create(paste(tempdir(), "alt", sep = "/"))
    files_from <- list.files(tempdir(), full.names = TRUE)
    files_from <- files_from[grep("\\.fas", files_from)]
    files_to <- list.files(tempdir())
    files_to <- files_to[grep("\\.fas", files_to)]
    files_to <- paste(tempdir(), "alt", files_to, sep = "/")
    file.rename(files_from, files_to)
    rm(alt.msa)
    alt.msa <- paste(tempdir(), "alt", sep = "/")
  }
  
  ## Run msa_set_score
  # switch(score_method,
  #   "Rcpp" = {
  #     ## Rcpp functions are called
  score <- msa_set_scoreR(ref = base.msa,
                          alt = alt.msa)
  #   },
  #   "SA" = {
  #     ## msa_set_score from GUIDANCE package is called
  #     score <- msa_set_scoreSA(ref = base.msa,
  #       alt = alt.msa, bootstrap = bootstrap,
  #       exec = msa_set_score.exec)
  #   }
  # )
  
  
  ## Store alternative MSAs in a zip file (optional)
  ## -----------------------------------------------
  if (!missing(zip.file)){
    files <- list.files(tempdir())
    files <- files[grep("HoT", files)]
    for (i in 1:(n.coopt * bootstrap)){
      file.rename(paste(tempdir(), files[i], sep = "/"),
                  paste(tempdir(), paste0("altMSA", i, ".fas"), sep = "/"))}
    files <- list.files(tempdir(), full.names = TRUE)
    files <- files[grep("altMSA*", files)]
    zip(zipfile = zip.file, files = files)
    ## NOTE: maybe better to use gzfile,
    ## currently zip creates many weird subfolders
  }
  
  ## delete temporary files in temporary directory
  # unlink(msa_out[file.exists(msa_out)], force = TRUE)
  # unlink(list.files(paste(tempdir(), "alt", sep = "/"), 
  #                   full.names = TRUE), force = TRUE, recursive = TRUE)
  # unlink(tempdir(), force = TRUE) # do not use this, it causes problems
  
  
  ## Prepare and return output
  ## -------------------------
  # if(score_method=="SA"){
  #   score <- score$residue_pair_score
  # }
  if (inherits(sequences, "AAbin")){
    polentaAA(base.msa, score, "guidance2")
  } else {
    polentaDNA(base.msa, score, "guidance2")
  }
}