polenta: Multiple Sequence Alignment with PASTA and GUIDANCE

Documented in HoT

## This code is part of the polenta package
## © F.-S. Krah (last update 2017-11-13)

#' @title Heads or Tails Alignment Reliability
#' @description MSA reliability assessment HoT (Landan and Graur 2008)
#' @param sequences object of class \code{\link{DNAbin}} or \code{\link{AAbin}}
#'   containing unaligned sequences of DNA or amino acids.
#' @param method further argument passed to MAFFT, default is \code{"auto"}
#' @param bootstrap integer giving the number of alternative MSAs to be computed
#' @param n.coopt Character string, xxx.
#' @param plot_guide Logical, xxx.
#' @param store_msas Logical, xxx.
#' @param msa.exec character string giving the path to the executable of the
#'   alignment program (e.g. "/usr/local/bin/mafft"); Must be on of: 'mafft',
#'   'muscle', 'clustalo', 'clustalw2'
#' @param ncore integer specifying the number of cores; default = 1 (serial),
#'   "auto" can be used for automated usage of all detected cores.
#' @param zip.file A character string giving the name for the output zip file.
#' @return object of class \code{polenta}: msa original MSA as computed by
#'   \code{msa.program} scores residue pair score
#' @details Calculates column reliability (and other scors) by comparing
#'   alternative MSAs generated by aligning guide tree partitions as described
#'   in Landan and Graur (2008). For details see \code{compareMSAs}. 8*(N-3)
#'   alternative MSAs are generated by default, where N is the number of
#'   sequences.
#' @references G. Landan and D. Graur. 2008. Local reliability measures from
#'   sets of co-optimal multiple sequencesuence alignments. 13:15--24
#' @seealso \code{\link{msa_set_scoreR}}, \code{\link{guidance}},
#'   \code{\link{guidance2}}
#'
#' @author Franz-Sebastian Krah
#' @import adephylo
#' @importFrom ape compute.brlen ladderize multi2di Ntip
#' @importFrom ips mafft read.fas
#' @import doSNOW
#' @import foreach
#' @importFrom graphics legend
#' @import parallel
#' @import pbmcapply
#' @import plyr
#' @importFrom phangorn as.phyDat dist.ml
#' @importFrom phytools plotTree
#' @importFrom utils setTxtProgressBar txtProgressBar zip
#' @export


HoT <- function(sequences, method = "auto",
                bootstrap,
                n.coopt ="auto",
                plot_guide = TRUE,
                store_msas = FALSE,
                msa.exec = "/usr/local/bin/mafft",
                ncore = 1,
                zip.file) {
  
  ##############################################
  ## SOME CHECKS
  ##############################################
  # if (!is.object(sequences)){
  #   read.fas(sequences)
  # }
  
  if (!inherits(sequences, c("DNAbin","AAbin")))
    stop("sequences not of classes DNAbin or AAbin (ape)")
  
  if (length(sequences) > 200)
    message("more than 200 sequences: consider using 'polenta'")
  
  ## look up MSA program specified
  msa.program <- str_extract(msa.exec, "mafft|muscle|clustal\\w")
  
  ## Check for MSA program
  if (missing(msa.exec)){
    os <- Sys.info()[1]
    if (msa.program == "mafft") {
      exec <- switch(os, Linux = "mafft", Darwin = "mafft",
                     Windows = "mafft.bat")
    }
    if (msa.program == "muscle") {
      exec <- switch(os, Linux = "muscle", Darwin = "muscle",
                     Windows = "muscle3.8.31_i86win32.exe")
    }
    if (msa.program == "clustalo") {
      exec <- switch(os, Linux = "clustalo", Darwin = "clustalo",
                     Windows = "clustalo.exe")
    }
    if (msa.program == "clustalw2") {
      exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
                     Windows = "clustalw2.exe")
    }
    out <- system(paste(exec, "--v"), ignore.stdout = TRUE, ignore.stderr = TRUE)
    if (out == 127)
      stop("please provide exec path or install MSA program in root \n
        i.e. in Unix: '/usr/local/bin/mafft'")
  }
  
  if (missing(zip.file)) zip.file <- paste0("guidance_altMSA_", Sys.Date())
  
  ## generate some parameters if not specified
  #---------------------------------------------
  ## number of cores
  if (ncore == "auto") {
    ncore <- detectCores(all.tests = FALSE, logical = TRUE)
  }
  
  ## if more than 200 species intermediate results are processed via files
  int_file <- length(sequences) > 200
  
  ##############################################
  ## PART I
  ##############################################
  ## BASE and ALTERNATIVE MSAs
  ##############################################
  cat("Generating the base alignment \n")
  
  ## create loop input
  mafft_method <- ifelse(msa.program == "mafft", ", method = method", "")
  base.msa <- paste0(msa.program,
                     "(x = sequences, exec = msa.exec",
                     mafft_method, ")")
  
  ## Make alignment
  base.msa <- eval(parse(text = base.msa))
  
  ## Calculate start guide tree
  #----------------------------------------------
  cat("Calculate start tree \n")
  base.msa.ml <- as.phyDat(base.msa)
  # find ML distance as input to nj tree search
  ml.dist.msa <- dist.ml(base.msa.ml)
  # NJ
  start_tree <- nj(ml.dist.msa)
  start_tree <- multi2di(start_tree)
  start_tree <- compute.brlen(start_tree)
  
  # plot guide tree
  if (plot_guide){
    plotTree(ladderize(start_tree))
    legend("bottomleft",
           paste0(Ntip(start_tree)," tips", "; ",
                  Ntip(start_tree) - 3, " partitions"),
           bty = "n")
  }
  
  ## produce MSA partitions
  align_parts <- partitions(start_tree)
  
  # here could be a sampling of co-opts like in
  # guidance2. now we sample all
  n.coopt.sub <- rep("all", ncol(align_parts))
  n.coopt <- (Ntip(start_tree) - 3) * 8
  
  
  ##############################################
  ## PART II
  ##############################################
  ## Co-optimal MSAs
  ##############################################
  cat(paste("Sampling", n.coopt, "co-optimal alignments \n", sep = " "))
  
  ## Create temporary files
  #----------------------------------------------
  msa_out <- vector(length = n.coopt)
  for (i in seq_along(msa_out))
    msa_out[i] <- tempfile(pattern = "HoT", tmpdir = tempdir(), fileext = ".fas")
  unlink(msa_out[file.exists(msa_out)])
  
  # predifined file storage allocation (because it runs in batches of 8)
  start <- seq(1, n.coopt, 8)
  end <- seq(8, n.coopt, 8)
  stend <- data.frame(start, end)
  
  ## Function implementing batch alignment and storage in batches
  align_part_set2 <- function(x, partition_set, method,
                              msa.exec, msa.program, coopt.sub, files,
                              int_file){
    
    alt_msas <- align_part_set(x = x,
                               partition_set = partition_set,
                               method = method, msa.exec = msa.exec, msa.program = msa.program,
                               coopt.sub = coopt.sub)
    
    if (int_file){
      for (j in 1:8)
        write.fas(alt_msas[[j]], files[j])
    } else { 
      return(alt_msas) 
    }
  }
  
  
  ## Run batch alignments
  #----------------------------------------------
  pb <- txtProgressBar(max = ncol(align_parts), style = 3)
  progress <- function(n) setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  cl <- makeCluster(ncore)
  registerDoSNOW(cl)
  
  mafft_method <- ifelse(msa.program == "mafft", paste0(", method =", "'", method, "'"), "") 
  intfile <- ifelse(int_file, ", file = msa_out[i]", "")
  
  FUN <- function(i) {
    paste0(
      "align_part_set2(x = sequences, partition_set = align_parts[,",
      i,
      "], coopt.sub = n.coopt.sub[",
      i,
      "]",
      mafft_method,
      ", msa.exec = msa.exec, msa.program = msa.program, files = msa_out[stend[",
      i,
      ", 1]:stend[",
      i,
      ", 2]]",
      ", int_file = ",
      int_file,
      ")"
    )
  }
  
  alt.msa <- foreach(i = 1:ncol(align_parts),
                     .options.snow = opts,
                     .packages = c('polenta', 'ips', 'adephylo', 'foreach', 'phangorn'),
                     .export = c("align_part_set2", "sequences", "align_parts", "n.coopt.sub",
                                 "method", "msa.exec", "msa.program", "msa_out", "stend")
  ) %dopar% {
    eval(parse(text = FUN(i)))
  }
  stopCluster(cl)
  close(pb)
  
  #### unlist nested list
  if (!int_file){
    alt.msa <- foreach(i = 1:length(alt.msa), .combine = c) %do% {
      alt.msa[[1]]
    }
  }
  ##############################################
  ## PART III
  ##############################################
  ## Computation of reliability scores
  ##############################################
  cat("Calculating reliability scores \n")
  
  ## HoT Score
  #----------------------------------------------
  ## Create temporary dir for alternative MSAs
  if (int_file){
    dir.create(paste(tempdir(), "alt", sep = "/"))
    files_from <- list.files(tempdir(), full.names = TRUE)
    files_from <- files_from[grep("\\.fas", files_from)]
    files_to <- list.files(tempdir())
    files_to <- files_to[grep("\\.fas", files_to)]
    files_to <- paste(tempdir(), "alt", files_to, sep = "/")
    
    ## Move files into new dir
    file.rename(files_from, files_to)
    
    rm(alt.msa)
    alt.msa <- paste(tempdir(), "alt", sep = "/")
  }
  
  ## Run msa_set_score
  # switch(score_method,
  #   "Rcpp" = {
  #     ## Rcpp functions are called
  score <- msa_set_scoreR(ref = base.msa,
                          alt = alt.msa)
  #   },
  #   "SA" = {
  #     ## msa_set_score from GUIDANCE package is called
  #     score <- msa_set_scoreSA(ref = base.msa,
  #       alt = alt.msa,
  #       bootstrap = n.coopt, exec = msa_set_score.exec)
  #   }
  # )
  
  ##  if wanted, store alternative MSAs into a zip file
  if (!missing(store_msas)){
    files <- list.files(tempdir())
    files <- files[grep("HoT", files)]
    for (i in 1:(n.coopt * bootstrap)){
      file.rename(paste(tempdir(), files[i], sep = "/"),
                  paste(tempdir(), paste0("altMSA", i, ".fas"), sep = "/"))}
    files <- list.files(tempdir(), full.names = TRUE)
    files <- files[grep("altMSA*", files)]
    if (missing(zip.file)) zip.file <- paste0("guidance_altMSA_", Sys.Date())
    zip(zipfile = zip.file, files = files)
    
    # maybe better to use gzfile
    ## currently zip creates many wired subfolders
  }
  
  ## Delete temporary files
  # this deletion approach has proven best to delete everything
  files <- list.files(tempdir(), full.names = TRUE)
  files <- files[-grep("rs-graphics", files)]
  unlink(files, force = TRUE, recursive = TRUE)
  
  
  ## Prepare and return output
  ## -------------------------
  # if(score_method=="SA"){
  #   score <- score$residue_pair_score
  # }
  if (inherits(sequences, "AAbin")){
    polentaAA(base.msa, score, "HoT")
  } else {
    polentaDNA(base.msa, score, "HoT")
  }
}