rpg: Multiple Sequence Alignment with PASTA and GUIDANCE

Documented in guidance

#' GUIDetree-based AligNment ConficencE
#'
#' @description MSA reliability assessment GUIDANCE (Penn et al. 2010)
#'
#' @param sequences An object of class \code{\link{DNAbin}} or
#'   \code{\link{AAbin}} containing unaligned sequences of DNA or amino acids.
#' @param parallel logical, if TRUE, specify the number of cores
#' @param ncore number of cores (default is maxinum of local threads)
#' @param bootstrap An integer giving the number of perturbated MSAs.
#' @param msa.program A charcter string giving the name of the MSA program,
#'   currelty one of c("mafft", "muscle", "clustalo", "clustalw2"); MAFFT is
#'   default
#' @param exec A character string giving the path to the executable of the
#'   alignment program.
#' @param method further arguments passed to mafft, default is \code{"auto"}
#' @param col.cutoff numberic between 0 and 1; specifies a cutoff to remove
#'   unreliable columns below the cutoff; either user supplied or \code{"auto"}
#'   (0.73)
#' @param seq.cutoff numberic between 0 and 1; specifies a cutoff to remove
#'   unreliable sequences below the cutoff; either user supplied of
#'   \code{"auto"} (0.5)
#' @param mask.cutoff specific residues below a certain cutoff are masked ('N'
#'   for DNA, 'X' for AA); either user supplied or \code{"auto"} (0.5)
#' @return list containing following scores and alignments:
#' @return mean_scores residue pair score and mean column score
#' @return column_score
#' @return residue_column_score GUIDANCE score
#' @return residue_pair_residue_score
#' @return residual_pair_sequence_pair_score
#' @return residual_pair_sequence_score
#' @return residue_pair_score
#' @return base_msa
#' @return guidance_msa is the base_MSA removed from unreliable
#'   residues/columns/sequences below cutoffs
#'
#' @details Calculates column confidence (and other scors) by comparing
#'   alternative MSAs generated by alternative guide trees derived from
#'   bootstrap MSAs (Felsenstein 1985). The basic comparison between the BP MSAs
#'   and a reference MSA is if column residue pairs are identically aligned in
#'   all alternative MSAs compared with the base MSA (see \code{compareMSAs}).
#' @references Felsenstein J. 1985. Confidence limits on phylogenies: an
#'   approach using the bootstrap. Evolution 39:783–791
#' @references Penn et al. (2010). An alignment confidence score capturing
#'   robustness to guide tree uncertainty. Molecular Biology and Evolution
#'   27:1759--1767
#' @references G. Landan and D. Graur (2008). Local reliability measures from
#'   sets of co-optimal multiple sequencesuence alignments. 13:15--24
#'
#' @import ips
#' @import doSNOW
#' @import foreach
#' @import parallel
#' @import pbmcapply
#' @import plyr
#' @importFrom phangorn as.phyDat dist.ml
#'
#' @seealso \code{\link{compareMSAs}}, \code{\link{guidance2}},
#'   \code{\link{HoT}}
#'
#' @examples
#' \dontrun{
#' # first run GUIDANCE on example data using MAFFT
#' file <- system.file("extdata", "BB50009.fasta", package = "rpg")
#' aa_seq<- read.fas(file)
#' g_msa <- guidance(sequences = aa_seq,
#' msa.program = "mafft",
#' exec = "/usr/local/bin/mafft",
#' bootstrap = 100,
#' parallel = FALSE,
#' method = "retree 1")
#' h.p <- confidence.heatmap(g_msa, title = "GUIDANCE BB50009 benchmark",
#' legend = TRUE,guidance_score = TRUE)
#' h.p
#' # again with Muscle
#' g_msa_m <- guidance(sequences = aa_seq,
#' msa.program = "muscle",
#' exec = "/Applications/muscle",
#' bootstrap = 100,
#' parallel = FALSE,
#' method = "retree 1")
#' h.p <- confidence.heatmap(g_msa_m, title = "GUIDANCE BB50009 benchmark",
#' legend = TRUE,guidance_score = TRUE)
#' h.p
#'
#' ## Plot both for comparison
#' h.p.mafft <- confidence.heatmap(g_msa, title = "MAFFT",
#' legend = FALSE, guidance_score = FALSE)
#' h.p.muscle <- confidence.heatmap(g_msa_m, title = "MUSCLE",
#' legend = FALSE, guidance_score = FALSE)
#' library(cowplot)
#' plot_grid(h.p.mafft, h.p.muscle, ncol = 1, nrow = 2)
#' }
#'
#' @author Franz-Sebastian Krah
#' @author Christoph Heibl
#' @export

guidance <- function(sequences,
  msa.program = "mafft", exec,
  bootstrap = 100,
  col.cutoff = "auto",
  seq.cutoff = "auto",
  mask.cutoff = "auto",
  parallel = FALSE, ncore ="auto",
  method = "auto",
  alt.msas.file){

  ##############################################
  ## SOME CHECKS
  ##############################################
  if (!inherits(sequences, c("DNAbin", "AAbin")))
    stop("sequences not of class DNAbin or AAbin (ape)")


  if (length(labels(sequences)) < 8)
    warning("GUIDANCE is not suitable for alignments of very few sequences.\n
      As a rule of thumb, use GUIDANCE2 or HoT for less than 8 sequences.")

  ## Check for MSA program
  if (missing(exec)){
    os <- Sys.info()[1]
    if (msa.program =="mafft") {
      exec <- switch(os, Linux = "mafft", Darwin = "mafft",
        Windows = "mafft.bat")
    }
    if (msa.program == "muscle") {
      exec <- switch(os, Linux = "muscle", Darwin = "muscle",
        Windows = "muscle3.8.31_i86win32.exe")
    }
    if (msa.program == "clustalo") {
      exec <- switch(os, Linux = "clustalo", Darwin = "clustalo",
        Windows = "clustalo.exe")
    }
    if (msa.program == "clustalw2") {
      exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
        Windows = "clustalw2.exe")
    }
  }
  out <- system(paste(exec, "--v"), ignore.stdout = TRUE, ignore.stderr = TRUE)
  if (out == 127)
    stop("please provide exec path or install MSA program in root \n
      i.e. in Unix: '/usr/local/bin/mafft'")

  ## generate some parameters if not specified
  #---------------------------------------------
  ## number of cores
  if (ncore == "auto"){
    ncore <- detectCores(all.tests = FALSE, logical = TRUE)
  }
  ## Sequence type
  type <- class(sequences)
  type <- gsub("bin", "", type)

  ##############################################
  ## PART I
  ##############################################
  ## BASE and ALTERNATIVE MSAs
  ##############################################

  ## Generate BASE MSA
  #---------------------------------------------
  cat("Generating the base alignment")
  if (msa.program == "mafft"){
    base.msa <- mafft(x = sequences, method = method, exec = exec)
  }
  if (msa.program == "muscle"){
    base.msa <- muscle2(x = sequences, exec = exec)
  }
  if (msa.program == "clustalo"){
    base.msa <- clustalo(x = sequences, exec = exec)
  }
  if (msa.program == "clustalw2"){
    base.msa <- clustalw2(x = sequences, exec = exec)
  }
  cat("... done \n")

  ## Constructing BP guide-trees for the pertubated MSAs
  #--------------------------------------------------------
  cat("Generating NJ guide trees \n")
  ## Compute NJ guide trees
  semphy <- FALSE
  if (semphy){
    #### SEMPHY  ###
    if(nrow(base.msa) > 150){
      if (type == "DNA")
        model <- "--nucjc"
      if (type == "AA")
        model <- "--aaJC"
    } else {
      if (tpye == "DNA")
        model <- "--hky"
      if (type == "AA")
        model <- "--jtt"
    }
    nj.guide.trees <- semphy(msa = base.msa,
      dist.table<- "-J",
      model <- model,
      arsv <- "-H",
      bootstrap = bootstrap,
      verbose = verbose)

    # lapply(nj.guide.trees, root, outgroup = )
    nj.guide.trees <- lapply(nj.guide.trees, compute.brlen)
  }

  Rnj <- TRUE
  pb <- txtProgressBar(max = bootstrap, style = 3)

  if (Rnj){
    if (parallel){
      progress <- function(n) setTxtProgressBar(pb, n)
      opts <- list(progress = progress)

      cl <- makeCluster(ncore)
      registerDoSNOW(cl)

      nj.guide.trees <- foreach(i = 1:bootstrap,
        .options.snow = opts,
        .packages = "phangorn", .export = 'msaBP_nj_tree') %dopar% {
          msaBP_nj_tree(base.msa, outgroup = "auto")
        }
      stopCluster(cl)
    }
    if (!parallel){
      nj.guide.trees <- foreach(i = 1:bootstrap, .packages = "phangorn") %do%{
        setTxtProgressBar(pb, i)
        msaBP_nj_tree(base.msa, outgroup = "auto")
      }
    }
    close(pb)
  }

  ## Alignment of MSA BP times with new NJ guide trees
  ## -------------------------------------------------
  cat("Alignment of sequences using NJ guide trees \n")

  msa_out <- vector(length = bootstrap)
  for (i in seq_along(msa_out))
    msa_out[i] <- tempfile(pattern = "mafft", tmpdir = tempdir(), fileext = ".fas")
  unlink(msa_out[file.exists(msa_out)])

  pb <- txtProgressBar(max = bootstrap, style = 3)

  if (parallel){
    progress <- function(n) setTxtProgressBar(pb, n)
    opts <- list(progress = progress)
    cl <- makeCluster(ncore)
    registerDoSNOW(cl)

    if (msa.program == "mafft"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape'),
        .options.snow = opts)  %dopar% {
          mafft(x = sequences, gt = nj.guide.trees[[i]],
            exec = exec, file = msa_out[i], method = method)
        }
    }
    if (msa.program == "muscle"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape'),
        .options.snow = opts)  %dopar% {
          muscle2(x = sequences, gt = nj.guide.trees[[i]],
            exec = exec, file = msa_out[i])
        }
    }
    if (msa.program == "clustalo"){
      foreach(i = 1:bootstrap, .packages =c('ips', 'ape'),
        .options.snow = opts)  %dopar% {
          clustalo(x = sequences, gt = nj.guide.trees[[i]],
            exec = exec, file = msa_out[i])

        }
    }
    if (msa.program == "clustalw2"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape'),
        .options.snow = opts)  %dopar% {
          clustalw2(x = sequences, gt = nj.guide.trees[[i]],
            exec = exec, file = msa_out[i])
        }
    }
    stopCluster(cl)
  }

  if (!parallel){
    if (msa.program == "mafft"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape'))  %do% {
        setTxtProgressBar(pb, i)
        mafft2(x = sequences, gt = nj.guide.trees[[i]],
          method = method, exec = exec, file = msa_out[i])
      }
    }
    if (msa.program == "muscle"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape'))  %do% {
        setTxtProgressBar(pb, i)
        muscle2(x = sequences, gt = nj.guide.trees[[i]],
          exec = exec, file = msa_out[i])
      }
    }
    if (msa.program == "clustalo"){
      foreach(i = 1:bootstrap, .packages =c('ips', 'ape'))  %do% {
        setTxtProgressBar(pb, i)
        clustalo(x = sequences, gt = nj.guide.trees[[i]],
          exec = exec, file = msa_out[i])
      }
    }
    if (msa.program == "clustalw2"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape'))  %do% {
        setTxtProgressBar(pb, i)
        clustalw2(x = sequences, gt = nj.guide.trees[[i]],
          exec = exec, file = msa_out[i])
      }
    }
  }
  close(pb)

  mafft_created <- list.files(getwd(),
    full.names = TRUE)[grep("tree.mafft", list.files(getwd()))]
  if (length(mafft_created)){
    file.remove(mafft_created)
  }

  ##############################################
  ## PART II
  ##############################################
  ## Computation of GUIDANCE scores
  ##############################################
  cat("Calculating GUIDANCE scores \n")

  # ## GUIDANCE Score
  #----------------------------------------------
  ## Create temporary dir for alternative MSAs
  dir.create(paste(tempdir(), "alt", sep="/"))
  files_from <- list.files(tempdir(), full.names = TRUE)
  files_from <- files_from[grep("\\.fas", files_from)]
  files_to <- list.files(tempdir())
  files_to <- files_to[grep("\\.fas", files_to)]
  files_to <- paste(tempdir(), "alt", files_to, sep="/")

  ## Move files into new dir
  file.rename(files_from, files_to)

  ## Run msa_set_score
  #---------------------
  scores <- compareMSAs(ref = base.msa,
    dir_path = paste(tempdir(), "alt", sep ="/"),
    com = NULL)

  ##  if wanted, store alternative MSAs into a zip file
  if (!missing(alt.msas.file)){
    files <- list.files(paste(tempdir(), "alt", sep="/"))
    files <- files[grep("HoT", files)]
    for(i in 1:(n.coopt*bootstrap)){
      file.rename(paste(tempdir(), files[i], sep="/"),
        paste(tempdir(), paste("altMSA", i, ".fas",sep=""), sep="/"))}
    files <- list.files(tempdir(), full.names = TRUE)
    files <- files[grep("altMSA*", files)]
    zip(zipfile = alt.msas.file, files = files)

    # maybe better to use gzfile
    ## currently zip creates many wired subfolders
  }

  ## Delete temporary files
  # this deletion approach has proven best to delete everything
  files <- list.files(tempdir(), full.names = TRUE)
  files <- files[-grep("rs-graphics", files)]
  unlink(files, force = TRUE, recursive = TRUE)

  ## Reduce MSA according to cutoffs, it wanted
  #---------------------------------------------
  msa <- guidance.msa <- base.msa
  msa <- as.character(msa)
  ## masking residues below cutoff
  if (mask.cutoff > 0){
    txt <- as.vector(as.character(base.msa))
    # mat <- data.frame(rpr.sc, txt)
    mat <- data.frame(scores$residue_pair_residue_score, txt)
    rown <- max(mat$residue)
    coln <- max(mat$col)
    res_mat <- matrix(mat$score, nrow = rown, ncol = coln)

    if (mask.cutoff == "auto"){ mask.cutoff <- 0.50 }

    if (inherits(sequences, "DNAbin")){
      msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "N"
      msa <- as.DNAbin(msa)
    }
    if (inherits(sequences, "AAbin")) {
      msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "X"
      rownames(msa) <- labels(sequences)
      class(msa) <- "AAbin"
    }
    guidance.msa <- msa
  }
  ## remove unreliable columns
  if (col.cutoff > 0){
    if (mask.cutoff) {
      msa <- guidance.msa
    } else {
      msa <- base.msa
      }
    if (col.cutoff == "auto"){ col.cutoff <- 0.97 }
    # remove_cols <- g.cs$res_pair_col_score < col.cutoff
    remove_cols <- scores$column_score$CS < col.cutoff
    guidance.msa <- msa[, !remove_cols]
  }
  ## remove unreliable sequences
  if (seq.cutoff > 0){
    if (mask.cutoff) { msa <- guidance.msa
    } else { msa <- base.msa }
    if (seq.cutoff =="auto"){seq.cutoff <- 0.5}
    # remove_sequences <- rps.sc$res_pair_seq_score < seq.cutoff
    remove_sequences <- scores$residual_pair_sequence_score$score < seq.cutoff
    guidance.msa <- msa[!remove_sequences,]
  }

  ## Generate output
  res <- list(scores = scores,
    GUIDANCE_msa = guidance.msa,
    base_msa = base.msa)

  ## Return output
  return(res)
}