rpg: Multiple Sequence Alignment with PASTA and GUIDANCE

Documented in guidance2

#' GUIDetree-based AligNment ConficencE 2
#'

#' @description MSA reliability assessment GUIDANCE2 (Sela et al. 2015)
#'
#' @param sequences An object of class \code{\link{DNAbin}} or \code{\link{AAbin}}
#'   containing unaligned sequences of DNA or amino acids.
#' @param parallel logical, if TRUE, specify the number of cores
#' @param ncore number of cores
#' @param bootstrap An integer giving the number of perturbated MSAs.
#' @param msa.program A charcter string giving the name of the MSA program,
#'   currenlty one of c("mafft", "muscle", "clustalw2"); MAFFT is default
#' @param col.cutoff numberic between 0 and 1; specifies a cutoff to remove unreliable columns below the cutoff; either user supplied or "auto" (0.73)
#' @param seq.cutoff numberic between 0 and 1; specifies a cutoff to remove unreliable sequences below the cutoff; either user supplied of "auto" (0.5)
#' @param mask.cutoff specific residues below a certain cutoff are masked ('N' for DNA, 'X' for AA); either user supplied of "auto" (0.5)
#' @param exec A character string giving the path to the executable of the
#'   alignment program.
#' @param method further arguments passed to mafft, default is "auto"
#' @param mask specific residues below a certain cutoff are masked ('N' for DNA, 'X' for AA)
#'
#' @return alignment_score: is the GUIDANCE alignment score
#' @return GUIDANCE_residue_score
#' @return GUIDANCE_score: is the GUIDANCE column score
#' @return GUIDANCE_sequence_score
#' @return guidance_msa: is the base MSA removed from unreliable columns below
#'   cutoff
#' @return base_msa
#'
#' @details Calculates column confidence (and other scors) by comparing alternative MSAs generated by the GUIDANCE with varying gap opening panelty and the HoT methodology. First 100 alternative MSAs (with BP guide trees) with varying gap opening panelty are produced, then for each n (default = 4) co-optimal alignments are produced using HoT. The basic comparison between the BP MSAs and a reference MSA is if column residue pairs are identically aligned in all alternative MSAs compared with the base MSA (see \code{compareMSAs}).
#'
#' @references Felsenstein J. 1985. Confidence limits on phylogenies: an
#'   approach using the bootstrap. Evolution 39:783–791
#' @references Penn et al. (2010). An alignment confidence score capturing
#'   robustness to guide tree uncertainty. Molecular Biology and Evolution
#'   27:1759--1767
#' @references Sela et al. (2015). GUIDANCE2: accurate detection of unreliable alignment regions accounting for the uncertainty of multiple parameters. Nucleic acids research 43:W7--W14
#' @references G. Landan and D. Graur (2008). Local reliability measures from sets of co-optimal multiple sequencesuence alignments. Pacific Symposium on Biocomputin. 13:15--24
#'
#' @importFrom ips mafft
#' @import doSNOW
#' @import foreach
#' @import parallel
#' @import pbmcapply
#' @import plyr
#' @importFrom phangorn as.phyDat dist.ml
#'
#' @seealso \code{\link{compareMSAs}}, \code{\link{guidance}}, \code{\link{HoT}}
#'
#' @author Franz-Sebastian Krah
#' @author Christoph Heibl
#' @export



guidance2 <- function(sequences,
  msa.program = "mafft", exec,
  bootstrap = 100,
  n.part="auto",
  col.cutoff = "auto",
  seq.cutoff = "auto",
  mask.cutoff = "auto",
  parallel = TRUE, ncore ="auto",
  method = "auto",
  alt.msas.file,
  n.coopt = "auto"){

  ##############################################
  ## SOME CHECKS
  ##############################################
  if (!inherits(sequences, c("DNAbin", "AAbin")))
    stop("sequences not of class DNAbin or AAbin (ape)")


  ## Check for MSA program
  if (missing(exec)){
    os <- Sys.info()[1]
    if (msa.program == "mafft") {
      exec <- switch(os, Linux = "mafft", Darwin = "mafft",
        Windows = "mafft.bat")
    }
    if (msa.program == "muscle") {
      exec <- switch(os, Linux = "muscle", Darwin = "muscle",
        Windows = "muscle3.8.31_i86win32.exe")
    }
    if (msa.program == "clustalw2") {
      exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
        Windows = "clustalw2.exe")
    }
  }
  out <- system(paste(exec, "--v"), ignore.stdout = TRUE, ignore.stderr = TRUE)
  if (out == 127)
    stop("please provide exec path or install MSA program in root \n
      i.e. in Unix: '/usr/local/bin/mafft'")


  ## generate some parameters if not specified
  #---------------------------------------------
  ## number of cores
  if (ncore == "auto"){
    ncore <- detectCores(all.tests = FALSE, logical = TRUE)
  }

  ## specify number of sampled co-optimal MSAs from HoT
  if (n.coopt == "auto"){ n.coopt <- 4 }

  ## set type of the sequences data
  type <- class(sequences)
  type <- gsub("bin", "", type)

  ##############################################
  ## PART I
  ##############################################
  ## BASE and PERTUBATED MSAs
  ##############################################

  ## Generate BASE alignment
  ###########################
  cat("Generating the base alignment")
  if (msa.program == "mafft"){
    base.msa <- mafft(sequences, method = method, exec = exec)
  }
  if (msa.program == "muscle"){
    base.msa <- muscle2(sequences, exec = exec)
  }
  if (msa.program == "clustalw2"){
    base.msa <- clustalw2(x = sequences, exec = exec)
  }

  cat("... done \n")

  ## form into matrix for perturbation
  # base.msa <- as.character(base.msa)

  ## Constructing BP guide-trees for the pertubated MSAs
  #######################################################
  # cat("Pertubating base alignment\n")
  #
  # pb <- txtProgressBar(max = bootstrap, style = 3)
  # if(inherits(sequences, "DNAbin")){
  #   base.msa.bp <- foreach(i = 1:bootstrap) %do% {
  #     setTxtProgressBar(pb, i)
  #     as.DNAbin(base.msa[,sample(ncol(base.msa), replace = TRUE)])
  #   }
  # }
  # if(inherits(sequences, "AAbin")){
  #   base.msa.bp <- foreach(i = 1:bootstrap) %do% {
  #     setTxtProgressBar(pb, i)
  #     as.AAbin(base.msa[,sample(ncol(base.msa), replace = TRUE)])
  #
  #   }
  # }
  # close(pb)

  ## Generating alternative (pertubated) MSAs
  #--------------------------------------------
  # cat("Generating alternative alignments (GUIDANCE) \n")
  #
  # ## Compute NJ guide trees
  # cat("  Generating NJ guide trees \n")
  # if (parallel){
  #   pb <- txtProgressBar(max = bootstrap, style = 3)
  #   progress <- function(n) setTxtProgressBar(pb, n)
  #   opts <- list(progress = progress)
  #
  #   cl <- makeCluster(ncore)
  #   registerDoSNOW(cl)
  #   nj.guide.trees <- foreach(i = 1:bootstrap,
  #     .options.snow = opts, .packages = "phangorn") %dopar% {
  #       # convert to class phyDAT
  #       base.msa.ml <- as.phyDat(as.character(base.msa.bp[[i]]))
  #       # find ML distance as input to nj tree search
  #       ml.dist.msa <- dist.ml(base.msa.ml)
  #       # NJ
  #       ape::nj(ml.dist.msa)
  #     }
  #   stopCluster(cl)
  # }
  # if (!parallel){
  #   nj.guide.trees <- foreach(i = 1:bootstrap, .packages = "phangorn") %do% {
  #     setTxtProgressBar(pb, i)
  #     # convert to class phyDAT
  #     base.msa.ml <- as.phyDat(base.msa.bp[[i]])
  #     # find ML distance as input to nj tree search
  #     ml.dist.msa <- dist.ml(base.msa.ml)
  #     # NJ
  #     ape::nj(ml.dist.msa)
  #   }
  # }
  # close(pb)

  cat("Generating NJ guide trees \n")
  ## Compute NJ guide trees

  pb <- txtProgressBar(max = bootstrap, style = 3)
  if (parallel){
    progress <- function(n) setTxtProgressBar(pb, n)
    opts <- list(progress = progress)

    cl <- makeCluster(ncore)
    registerDoSNOW(cl)

    nj.guide.trees <- foreach(i = 1:bootstrap,
      .options.snow = opts,
      .packages = "phangorn", .export = 'msaBP_nj_tree') %dopar% {
        msaBP_nj_tree(base.msa, outgroup = "auto")
      }
    stopCluster(cl)
  }
  if (!parallel){
    nj.guide.trees <- foreach(i = 1:bootstrap, .packages = "phangorn") %do%{
      setTxtProgressBar(pb, i)
      msaBP_nj_tree(base.msa, outgroup = "auto")
    }
  }
  close(pb)

  # ## Find NJ tree for base MSA
  # ## 1st tip label of base.nj tree as outgroup for all guide.njs
  # base.nj <- ape::nj(dist.ml(as.phyDat(base.msa)))
  # base.nj <- root(base.nj, outgroup = base.nj$tip.label[1])
  #
  # ## Root each tree on the first tip label of the base.nj tree
  # nj.guide.trees <- lapply(nj.guide.trees, root,
  #   outgroup = base.nj$tip.label[1],
  #   resolve.root = TRUE)
  #
  # ## Rescale branch lengths
  # nj.guide.trees <- lapply(nj.guide.trees, multi2di)
  # nj.guide.trees <- lapply(nj.guide.trees, compute.brlen)


  ## Alignment of MSA BP times with new NJ guide trees
  ## -------------------------------------------------
  cat("Alignment of alternative MSAs using NJ guide trees (GUIDANCE)\n")

  ## Prepare TEMP files for all outputs (GUIDANCE + HoT)
  msa_out <- vector(length = (bootstrap+ (bootstrap*n.coopt)))
  for (i in 1:bootstrap)
    msa_out[i] <- tempfile(pattern = "mafft", tmpdir = tempdir(), fileext = ".fas")
  for (i in 1:(bootstrap*n.coopt))
    msa_out[i+bootstrap] <- tempfile(pattern = "HoT", tmpdir = tempdir(), fileext = ".fas")
  unlink(msa_out[file.exists(msa_out)])


  ## Align perturbated MSAs (GUIDANCE)
  pb <- txtProgressBar(max = bootstrap, style = 3)
  if (parallel){
    progress <- function(n) setTxtProgressBar(pb, n)
    opts <- list(progress = progress)
    cl <- makeCluster(ncore)
    registerDoSNOW(cl)

    if (msa.program == "mafft"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape', 'rpg'),
        .options.snow = opts)  %dopar% {
          mafft2(x = sequences, gt = nj.guide.trees[[i]],
            exec = exec, file = msa_out[i], method = method,
            op = runif(1,0,5))
        }
    }
    if (msa.program == "muscle"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape', 'rpg'),
        .options.snow = opts)  %dopar% {
          muscle2(x = sequences, gt = nj.guide.trees[[i]],
            exec = exec,
            file = msa_out[i],
            MoreArgs = paste("-gapopen ",
              format(runif(1, -400, -10), digits = 0), sep =""))
        }
    }
    if (msa.program == "clustalw2"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape', 'rpg'),
        .options.snow = opts)  %dopar% {
          clustalw2(x = sequences, gt = nj.guide.trees[[i]],
            exec = exec, file = msa_out[i],
            MoreArgs = paste("-PAIRGAP=", format(runif(1, 1, 9), digits = 0), sep =""))
        }
    }
    stopCluster(cl)
  }

  if (!parallel){
    if (msa.program == "mafft"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape'))  %do% {
        setTxtProgressBar(pb, i)
        mafft2(x = sequences, gt = nj.guide.trees[[i]],
          method = method, exec = exec, file = msa_out[i],
          op = runif(1,0,5))
      }
    }
    if (msa.program == "muscle"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape'))  %do% {
        setTxtProgressBar(pb, i)
        muscle2(x = sequences, gt = nj.guide.trees[[i]],
          exec = exec, file = msa_out[i],
          MoreArgs = paste("-gapopen ", format(runif(1, -400, -10), digits = 0), sep =""))
      }
    }
    if (msa.program == "clustalw2"){
      foreach(i = 1:bootstrap, .packages=c('ips', 'ape'))  %do% {
        setTxtProgressBar(pb, i)
        clustalw2(x = sequences, gt = nj.guide.trees[[i]],
          exec = exec, file = msa_out[i],
          MoreArgs = paste("--PAIRGAP= ", format(runif(1, 1, 9), digits = 0), sep =""))
      }
    }
  }
  close(pb)

  mafft_created <- list.files(getwd(),
    full.names = T)[grep("tree.mafft", list.files(getwd()))]
  if(length(mafft_created)>0){
    file.remove(mafft_created)
  }



  ##############################################
  ## PART III
  ##############################################
  ## Generate co-optimal alignments with HoT
  ##############################################
  cat(paste("Sampling", n.coopt, "co-optimal alignments (HoT) \n", sep=" "))

  # predifined file allocation
  start <- seq(1,n.coopt*bootstrap,n.coopt)
  end <- seq(n.coopt,n.coopt*bootstrap,n.coopt)
  stend <- data.frame(start, end)
  stend <- stend + bootstrap

  ## run HoT
  pb <- txtProgressBar(max = bootstrap, style = 3)

  if(parallel){
    progress <- function(n) setTxtProgressBar(pb, n)
    opts <- list(progress = progress)
    cl <- makeCluster(ncore)
    registerDoSNOW(cl)

    foreach(i = 1:bootstrap,.options.snow = opts,
      .packages = c('rpg', 'ips', 'adephylo', 'foreach', 'phangorn')) %dopar% {
      Hot_GUIDANCE2(msa = msa_out[i], n.coopt = n.coopt,
      type = type, files = msa_out[stend[i,1]:stend[i,2]],
        raw_seq = sequences, msa.program = msa.program,
        method = method, exec = exec)
      }
    stopCluster(cl)
  }
  if(!parallel){
    foreach(i = 1:bootstrap) %do% {
      setTxtProgressBar(pb, i)
      Hot_GUIDANCE2(msa = msa_out[i], n.coopt = n.coopt,
        type = type, files = msa_out[stend[i,1]:stend[i,2]],
        raw_seq = sequences, msa.program = msa.program,
        method= method, exec = exec)
    }
  }
  close(pb)


  ##############################################
  ## PART IV
  ##############################################
  ## Computation of GUIDANCE 2 scores
  ##############################################
  cat("Calculating GUIDANCE2 scores \n")

  # ## GUIDANCE Score
  # ##############################################

  ## produce input format for msa_set_score program
  # transfrom character matrix (sequences are columns)
  # base.msa.t <- data.frame(t(base.msa))
  #
  # pb <- txtProgressBar(max = bootstrap, style = 3)
  # if (parallel){
  #   progress <- function(n) setTxtProgressBar(pb, n)
  #   opts <- list(progress = progress)
  #   cl <- makeCluster(ncore)
  #   registerDoSNOW(cl)
  #
  #   bpres <- foreach(i = (bootstrap+1):length(msa_out), .options.snow = opts,
  #     .export = 'compareMSAs', .packages = "ips") %dopar% {
  #       guide.msa <- read.fas(msa_out[i], type = type)
  #       guide.msa <- data.frame(t(as.character(guide.msa)))
  #       guide.msa <- guide.msa[,match(colnames(base.msa.t), colnames(guide.msa))]
  #       compareMSAs(ref = base.msa.t, com = guide.msa)
  #     }
  #   stopCluster(cl)
  # }
  # if (!parallel){
  #   bpres <- foreach(i = (bootstrap+1):length(msa_out), .export = 'compareMSAs',
  #     .packages = 'ips') %do% {
  #       setTxtProgressBar(pb, i)
  #       guide.msa <- read.fas(msa_out[i], type =type)
  #       guide.msa <- data.frame(t(as.character(guide.msa)))
  #       guide.msa <- guide.msa[,match(colnames(base.msa.t), colnames(guide.msa))]
  #       compareMSAs(ref = base.msa.t, com = guide.msa)
  #     }
  # }
  # close(pb)

  dir.create(paste(tempdir(), "alt", sep="/"))
  files_from <- list.files(tempdir(), full.names = TRUE)
  files_from <- files_from[grep("\\.fas", files_from)]
  files_to <- list.files(tempdir())
  files_to <- files_to[grep("\\.fas", files_to)]
  files_to <- paste(tempdir(), "alt", files_to, sep="/")

  file.rename(files_from, files_to)


  scores <- compareMSAs(ref = base.msa, dir_path = paste(tempdir(), "alt", sep ="/"),
    com = NULL)

  ##  if wanted, store alternative MSAs into a zip file
  if(!missing(alt.msas.file)){
    files <- list.files(tempdir())
    files <- files[grep("HoT", files)]
    for(i in 1:(n.coopt*bootstrap)){
      file.rename(paste(tempdir(), files[i], sep="/"),
        paste(tempdir(), paste("altMSA", i, ".fas",sep=""), sep="/"))}
    files <- list.files(tempdir(), full.names = TRUE)
    files <- files[grep("altMSA*", files)]
    zip(zipfile = alt.msas.file, files = files)

    # maybe better to use gzfile
    ## currently zip creates many wired subfolders
  }

  ## delete temporary files in temporary directory
  unlink(msa_out[file.exists(msa_out)], force = TRUE)
  unlink(list.files(paste(tempdir(),"alt", sep="/"), full.names = TRUE), force = TRUE, recursive = TRUE)
  # unlink(tempdir(), force = TRUE) # do not use this, it causes problems


  ### Calculate mean scores
  #------------------------------
  # Mean score
  # msc <- do.call(rbind, lapply(bpres, function(x) x[[1]]))
  # msc[] <- lapply(msc, as.numeric)
  # msc <- colMeans(msc)
  # msc <- data.frame(msc)
  #
  # ## Column score
  # CS <- do.call(cbind, lapply(bpres, function(x) x[[2]]))
  # del <- grep("col", names(CS))
  # CS <- CS[, -del[2:length(del)]]
  # CS <- data.frame(col = CS[, 1],
  #   column_score = rowMeans(CS[,2:ncol(CS)], na.rm = TRUE))
  #
  # ## Residue pair column score (GUIDANCE Score)
  # g.cs <- do.call(cbind, lapply(bpres, function(x) x[[3]]))
  # del <- grep("col\\b", names(g.cs))
  # g.cs <- g.cs[, -del[2:length(del)]]
  # g.cs <- data.frame(col = g.cs[, 1],
  #   res_pair_col_score = rowMeans(g.cs[,2:ncol(g.cs)], na.rm = TRUE))
  #
  # ## GUIDANCE Alignment score
  # alignment_score <- mean(g.cs[, 2])
  # msc <- rbind(msc, MEAN_GUIDANCE_SCORE = alignment_score)
  #
  # # Residue pair residue score
  # rpr.sc <- do.call(cbind, lapply(bpres, function(x) x[[4]]))
  # del <- grep("col\\b|residue", names(rpr.sc))
  # rpr.sc <- rpr.sc[, -del[3:length(del)]]
  # rpr.sc <- data.frame(col = rpr.sc[, 1], residue = rpr.sc[, 2],
  #   res_pair_res_score = rowMeans(rpr.sc[,3:ncol(rpr.sc)], na.rm = TRUE))
  #
  # # Residual pair sequence pair score
  # rpsp.sc <- do.call(cbind, lapply(bpres, function(x) x[[5]]))
  # del <- grep("seq_row1|seq_row2", names(rpsp.sc))
  # rpsp.sc <- rpsp.sc[, -del[3:length(del)]]
  # rpsp.sc <- data.frame(seq1 = rpsp.sc[, 1], seq2 = rpsp.sc[, 2],
  #   res_pair_seq_pair_score = rowMeans(rpsp.sc[,3:ncol(rpsp.sc)], na.rm = TRUE))
  #
  # # Residual pair sequence score
  # rps.sc <- do.call(cbind, lapply(bpres, function(x) x[[6]]))
  # del <- grep("seq", names(rps.sc))
  # rps.sc <- rps.sc[, -del[2:length(del)]]
  # rps.sc <- data.frame(seq = rps.sc[, 1],
  #   res_pair_seq_score = rowMeans(rps.sc[,2:ncol(rps.sc)], na.rm = TRUE))
  #
  # # Residue pair score
  # rp.sc <- do.call(cbind, lapply(bpres, function(x) x[[7]]))
  # del <- grep("col|row1|row2", names(rp.sc))
  # rp.sc <- rp.sc[, -del[4:length(del)]]
  # rp.sc <- data.frame(col = rp.sc[, 1], row1 = rp.sc[,2], col2 = rp.sc[,3],
  #   res_pair_score = rowMeans(rp.sc[,4:ncol(rp.sc)], na.rm = TRUE))
  # ### Calculate mean scores DONE
  # # maybe put this into a own function
  #


  msa <- guidance2.msa <- base.msa
  msa <- as.character(msa)
  ## masking residues below cutoff
  if (mask.cutoff > 0){
    txt <- as.vector(as.character(base.msa))
    # mat <- data.frame(rpr.sc, txt)
    mat <- data.frame(scores$residue_pair_residue_score, txt)
    rown <- max(mat$residue)
    coln <- max(mat$col)
    res_mat <- matrix(mat$score, nrow = rown, ncol = coln)

    if (mask.cutoff == "auto"){ mask.cutoff <- 0.50 }

    if (inherits(sequences, "DNAbin")){
      msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "N"
      msa <- as.DNAbin(msa)
    }
    if (inherits(sequences, "AAbin")) {
      msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "X"
      rownames(msa) <- labels(sequences)
      class(msa) <- "AAbin"
    }
    guidance2.msa <- msa
  }
  ## remove unreliable columns
  if (col.cutoff > 0){
    if (mask.cutoff > 0) { msa <- guidance2.msa
    } else { msa <- base.msa }
    if(col.cutoff =="auto"){col.cutoff <- 0.97}
    # remove_cols <- g.cs$res_pair_col_score < col.cutoff
    remove_cols <- scores$column_score$CS < col.cutoff
    guidance2.msa <- msa[,!remove_cols]
  }
  ## remove unreliable sequences
  if (seq.cutoff > 0){
    if (mask.cutoff > 0) { msa <- guidance2.msa
    } else { msa <- base.msa }
    if (seq.cutoff =="auto"){seq.cutoff <- 0.5}
    # remove_sequences <- rps.sc$res_pair_seq_score < seq.cutoff
    remove_sequences <- scores$residual_pair_sequence_score$score < seq.cutoff
    guidance2.msa <- msa[!remove_sequences,]
  }

  ## prepare base.msa for output
  if(inherits(sequences, "DNAbin") & !inherits(base.msa, "DNAbin"))
    base.msa <- as.DNAbin(base.msa)
  if(inherits(sequences, "AAbin") & !inherits(base.msa, "AAbin"))
    base.msa <- as.AAbin(base.msa)

  ## Produce output
  # res <- list(mean_score = msc,
  #   column_score = CS,
  #   GUIDANCE_column_score = g.cs,
  #   residue_pair_residue_score = rpr.sc,
  #   residual_pair_sequence_pair_score  = rpsp.sc,
  #   residual_pair_sequence_score = rps.sc,
  #   residue_pair_score = rp.sc,
  #   guidance2_msa = guidance2.msa,
  #   base_msa = base.msa)

  res <- list(scores = scores,
    GUIDANCE2_msa = guidance2.msa,
    base_msa = base.msa)

  return(res)
}