polenta: Multiple Sequence Alignment with PASTA and GUIDANCE

Documented in guidance

## This code is part of the polenta package
## © F.-S. Krah (last update 2018-01-24)

#' @title MSA Reliability Assessment with GUIDANCE
#' @description Calculate MSA reliability scores with GUIDANCE (Penn et al.
#'   2010).
#' @param sequences An object of class \code{\link{DNAbin}} or
#'   \code{\link{AAbin}} containing unaligned sequences of DNA or amino acids.
#' @param bootstrap An integer giving the number of alternative MSAs to be
#'   computed.
#' @param method A character string containing further arguments passed to
#'   MAFFT; default is \code{"auto"}.
#' @param msa.exec A character string giving the path to the executable of the
#'   alignment program (e.g. \code{/usr/local/bin/mafft}); possible programs are
#'   \code{MAFFT}, \code{MUSCLE}, \code{ClustalO}, and \code{ClustalW}.
#' @param ncore An integer specifying the number of cores; default = 1 (i.e.
#'   serial execution); \code{"auto"} can be used for automated usage of all
#'   detected cores.
#' @param zip.file A character string giving the name of zip-compressed file, which contains the 
#'   alternative MSAs. If left empty (default), the alternative
#'   MSA will not be stored and cannot be assessed by the user.
#' @return An object of class \code{\linkS4class{polentaDNA}} or
#'   \code{\linkS4class{polentaAA}}.
#' @details Calculates column confidence (and other scors) by comparing
#'   alternative MSAs generated by alternative guide trees derived from
#'   bootstrap MSAs (Felsenstein 1985). The basic comparison between the BP MSAs
#'   and a reference MSA is if column residue pairs are identically aligned in
#'   all alternative MSAs compared with the base MSA (see \code{compareMSAs}).
#' @references Felsenstein, J. 1985. Confidence limits on phylogenies: an
#'   approach using the bootstrap. \emph{Evolution} \strong{39}:783–791.
#' @references Landan, G. and Graur, D. 2008. Local reliability measures from
#'   sets of co-optimal multiple sequence alignments. \emph{Pacific Symposium on
#'   Biocomputing} \strong{13}:15--24.
#' @references Penn, O., Privman, E., Landan, G., Graur, D. and Pupko, T. 2010.
#'   An alignment confidence score capturing robustness to guide tree
#'   uncertainty. \emph{Molecular Biology and Evolution} \strong{27}:1759--1767.
#' @seealso \code{\link{msa_set_scoreR}}, \code{\link{guidance2}},
#'   \code{\link{HoT}}
#' @import ips
#' @importFrom doSNOW registerDoSNOW
#' @import foreach
#' @importFrom parallel detectCores makeCluster
#' @import pbmcapply
#' @import plyr
#' @importFrom phangorn as.phyDat dist.ml

#' @examples
#' \dontrun{
#' # run GUIDANCE on example data using MAFFT
#' file <- system.file("extdata", "BB50009.fasta", package = "polenta")
#' aa_seq <- read.fas(file)
#' g_res <- guidance(sequences = aa_seq)
#' scores <- daughter_scores(g_res, score = c("gcsc", "rprsc"))
#' hist(scores$gcsc$score, xlab = "Column score", main = "GUIDANCE")
#' }
#'
#' @author Franz-Sebastian Krah
#' @export

guidance <- function(sequences,
                     bootstrap = 100,
                     method = "auto",
                     msa.exec = "/usr/local/bin/mafft",
                     ncore = 1,
                     zip.file){

  ##############################################
  ## SOME CHECKS
  ##############################################
  if (!inherits(sequences, c("DNAbin", "AAbin")))
    stop("sequences not of class DNAbin or AAbin (ape)")

  if (length(labels(sequences)) < 8)
    warning("GUIDANCE is not suitable for alignments of very few sequences.\n
      As a rule of thumb, use GUIDANCE2 or HoT for < 8 sequences.")

  if (length(sequences) > 200)
    message("More than 200 sequences: consider using 'polenta'")

  ## look up MSA program specified
  msa_program <- str_extract(msa.exec, "mafft|muscle|clustalo|clustalw2")

  ## Check for MSA program
  if (missing(msa.exec)){
    os <- Sys.info()
    os <- os[grep("sysname", names(os))]
    if (msa_program == "mafft") {
      msa.exec <- switch(os, Linux = "mafft", Darwin = "mafft",
                         Windows = "mafft.bat")
    }
    if (msa_program == "muscle") {
      msa.exec <- switch(os, Linux = "muscle", Darwin = "muscle",
                         Windows = "muscle3.8.31_i86win32.exe")
    }
    if (msa_program == "clustalo") {
      msa.exec <- switch(os, Linux = "clustalo", Darwin = "clustalo",
                         Windows = "clustalo.exe")
    }
    if (msa_program == "clustalw2") {
      msa.exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
                         Windows = "clustalw2.exe")
    }
  }
  out <- system(paste(msa.exec, "--v"), ignore.stdout = TRUE, ignore.stderr = TRUE)
  if (out == 127)
    stop("please provide msa.exec path or install MSA program in root \n
      i.e. in Unix: '/usr/local/bin/mafft'")
  
  ## generate some parameters if not specified
  ## -----------------------------------------
  ## number of cores
  if (ncore == "auto"){
    ncore <- detectCores(all.tests = FALSE, logical = TRUE)
  }

  ## if more than 200 species intermediate results are processed via files
  # int_file <- ifelse(length(sequences) > 200, TRUE, FALSE)
  int_file <- FALSE
  ##############################################
  ## PART I
  ##############################################
  ## BASE and ALTERNATIVE MSAs
  ##############################################

  ## Make base alignment
  ## -------------------
  cat("Generating the base alignment \n")
  mafft_method <- ifelse(msa_program == "mafft", 
                         ", method = method, thread = ncore", "")
  base_msa <- paste0(msa_program, "(",
                    "x = sequences, exec = msa.exec",
                    mafft_method, ")")
  base_msa <- eval(parse(text = base_msa))

  ## Compute NJ guide trees
  ## ----------------------
  cat("Generating NJ guide trees\n")
  pb <- txtProgressBar(max = bootstrap, style = 3)
  progress <- function(n) setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  
  cl <- makeCluster(ncore)
  registerDoSNOW(cl)
  
  nj_guidetrees <- foreach(i = 1:bootstrap,
                           .options.snow = opts,
                           .packages = "phangorn", 
                           .export = 'msaBP_nj_tree') %dopar% {
                             msaBP_nj_tree(base_msa, outgroup = "auto")
                           }
  stopCluster(cl)
  close(pb)

  ## Alignment of MSA BP times with new NJ guide trees
  ## -------------------------------------------------
  cat("Alignment of sequences using NJ guide trees\n")

  ## store intermediate results in the temporary files
  if (int_file){
    msa_out <- vector(length = bootstrap)
    for (i in seq_along(msa_out))
      msa_out[i] <- tempfile(pattern = "mafft",
                             tmpdir = tempdir(), fileext = ".fas")
    # unlink(msa_out[file.exists(msa_out)])
  }

  ## Construct alignment function
  ## ----------------------------
  mafft_method <- ifelse(msa_program == "mafft", 
                         paste0(", method =", "'", method, "'"), "")
  intfile <- ifelse(int_file, ", file = msa_out[i]", "")
  aligner <- paste0(msa_program, "(x = sequences, gt = nj_guidetrees[[i]], ", 
                    "exec = msa.exec", mafft_method, intfile, ")")

  ## loop
  progress <- function(n) setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  cl <- makeCluster(ncore)
  registerDoSNOW(cl)

  alt_msa <- foreach(i = 1:bootstrap, 
                     .packages = c('ips', 'ape'), 
                     .options.snow = opts,
                     .export = c("sequences", "nj_guidetrees", "msa.exec")) %dopar% {
                       eval(parse(text = aligner))
                     }

  stopCluster(cl)
  close(pb)

  ## Delete leftover files from MAFFT
  file.remove(list.files(pattern = "tree.mafft", full.names = TRUE))


  ##############################################
  ## PART II
  ##############################################
  ## Computation of GUIDANCE scores
  ##############################################
  cat("\nCalculating GUIDANCE scores \n")

  if (int_file){
    ## Create temporary dir for alternative MSAs
    dir.create(paste(tempdir(), "alt", sep = "/"))
    files_from <- list.files(tempdir(), full.names = TRUE)
    files_from <- files_from[grep("\\.fas", files_from)]
    files_to <- list.files(tempdir())
    files_to <- files_to[grep("\\.fas", files_to)]
    files_to <- paste(tempdir(), "alt", files_to, sep = "/")

    ## Move files into new dir
    file.rename(files_from, files_to)

    rm(alt_msa)
    alt_msa <- paste(tempdir(), "alt", sep = "/")

  }

  ## Run msa_set_score
  score.method <-"Rcpp" ## REMOVE THIS EVENTUALLY!
  switch(score.method,
         "Rcpp" = {
           ## Rcpp functions are called
           score <- msa_set_scoreR(ref = base_msa,
                                   alt = alt_msa)
         },
         "SA" = {
           ## msa_set_score from GUIDANCE package is called
           score <- msa_set_scoreSA(ref = base_msa,
                                    alt = alt_msa,
                                    bootstrap = bootstrap, 
                                    exec = msa_set_score.exec)
         }
  )

  ## Store alternative MSAs in a zip file (optional)
  ## -----------------------------------------------
  if (!missing(zip.file)){
    files <- list.files(tempdir())
    files <- files[grep("HoT", files)]
    for (i in 1:(n.coopt * bootstrap)){
      file.rename(paste(tempdir(), files[i], sep = "/"),
                  paste(tempdir(), paste0("altMSA", i, ".fas"), sep = "/"))}
    files <- list.files(tempdir(), full.names = TRUE)
    files <- files[grep("altMSA*", files)]
    zip(zipfile = zip.file, files = files)
    ## NOTE: maybe better to use gzfile,
    ## currently zip creates many weird subfolders
  }

  ## Delete temporary files
  # this deletion approach has proven best to delete everything
  files <- list.files(tempdir(), full.names = TRUE)
  files <- files[-grep("rs-graphics", files)]
  unlink(files, force = TRUE, recursive = TRUE)

  ## Prepare and return output
  ## -------------------------
  # if(score_method=="SA"){
  #   score <- score$residue_pair_score
  # }
  if (inherits(sequences, "AAbin")){
    polentaAA(base_msa, score, "guidance")
  } else {
    polentaDNA(base_msa, score, "guidance")
  }
}