dep/guidance.R

#' GUIDetree-based AligNment ConficencE
#'
#' @description MSA reliability assessment GUIDANCE (Penn et al. 2010)
#'
#' @param sequences An object of class \code{\link{DNAbin}} or
#'   \code{\link{AAbin}} containing unaligned sequences of DNA or amino acids.
#' @param parallel logical, if TRUE, specify the number of cores
#' @param ncore number of cores (default is maxinum of local threads)
#' @param bootstrap An integer giving the number of perturbated MSAs.
#' @param msa.program A charcter string giving the name of the MSA program,
#'   currelty one of c("mafft", "muscle", "clustalo", "clustalw2"); MAFFT is
#'   default
#' @param exec A character string giving the path to the executable of the
#'   alignment program.
#' @param method further arguments passed to mafft, default is \code{"auto"}
#' @param col.cutoff numberic between 0 and 1; specifies a cutoff to remove
#'   unreliable columns below the cutoff; either user supplied or \code{"auto"}
#'   (0.73)
#' @param seq.cutoff numberic between 0 and 1; specifies a cutoff to remove
#'   unreliable sequences below the cutoff; either user supplied of
#'   \code{"auto"} (0.5)
#' @param mask.cutoff specific residues below a certain cutoff are masked ('N'
#'   for DNA, 'X' for AA); either user supplied or \code{"auto"} (0.5)
#' @param nj.program specify if R functions or SEMPHY should be used for guide-tree estimation
#' @return list containing following scores and alignments:
#' @return mean_scores residue pair score and mean column score
#' @return column_score
#' @return residue_column_score GUIDANCE score
#' @return residue_pair_residue_score
#' @return residual_pair_sequence_pair_score
#' @return residual_pair_sequence_score
#' @return residue_pair_score
#' @return base_msa
#' @return guidance_msa is the base_MSA removed from unreliable
#'   residues/columns/sequences below cutoffs
#'
#' @details Calculates column confidence (and other scors) by comparing
#'   alternative MSAs generated by alternative guide trees derived from
#'   bootstrap MSAs (Felsenstein 1985). The basic comparison between the BP MSAs
#'   and a reference MSA is if column residue pairs are identically aligned in
#'   all alternative MSAs compared with the base MSA (see \code{compareMSAs}).
#' @references Felsenstein J. 1985. Confidence limits on phylogenies: an
#'   approach using the bootstrap. Evolution 39:783–791
#' @references Penn et al. (2010). An alignment confidence score capturing
#'   robustness to guide tree uncertainty. Molecular Biology and Evolution
#'   27:1759--1767
#' @references G. Landan and D. Graur (2008). Local reliability measures from
#'   sets of co-optimal multiple sequencesuence alignments. 13:15--24
#'
#' @import ips
#' @import doSNOW
#' @import foreach
#' @import parallel
#' @import pbmcapply
#' @import plyr
#' @importFrom phangorn as.phyDat dist.ml
#'
#' @seealso \code{\link{compareMSAs}}, \code{\link{guidance2}},
#'   \code{\link{HoT}}
#'
#' @examples
#' \dontrun{
#' # first run GUIDANCE on example data using MAFFT
#' file <- system.file("extdata", "BB50009.fasta", package = "rpg")
#' aa_seq<- read.fas(file)
#' g_msa <- guidance(sequences = aa_seq,
#' msa.program = "mafft",
#' exec = "/usr/local/bin/mafft",
#' bootstrap = 100,
#' parallel = FALSE,
#' method = "retree 1")
#' h.p <- confidence.heatmap(g_msa, title = "GUIDANCE BB50009 benchmark",
#' legend = TRUE,guidance_score = TRUE)
#' h.p
#' # again with Muscle
#' g_msa_m <- guidance(sequences = aa_seq,
#' msa.program = "muscle",
#' exec = "/Applications/muscle",
#' bootstrap = 100,
#' parallel = FALSE,
#' method = "retree 1")
#' h.p <- confidence.heatmap(g_msa_m, title = "GUIDANCE BB50009 benchmark",
#' legend = TRUE,guidance_score = TRUE)
#' h.p
#'
#' ## Plot both for comparison
#' h.p.mafft <- confidence.heatmap(g_msa, title = "MAFFT",
#' legend = FALSE, guidance_score = FALSE)
#' h.p.muscle <- confidence.heatmap(g_msa_m, title = "MUSCLE",
#' legend = FALSE, guidance_score = FALSE)
#' library(cowplot)
#' plot_grid(h.p.mafft, h.p.muscle, ncol = 1, nrow = 2)
#' }
#'
#' @author Franz-Sebastian Krah
#' @author Christoph Heibl
#' @export

guidance <- function(sequences,
  msa.program = "mafft", exec,
  bootstrap = 100,
  col.cutoff = "auto",
  seq.cutoff = "auto",
  mask.cutoff = "auto",
  parallel = FALSE, ncore ="auto",
  method = "auto",
  alt.msas.file,
  nj.program){

  ##############################################
  ## SOME CHECKS
  ##############################################
  if (!inherits(sequences, c("DNAbin", "AAbin")))
    stop("sequences not of class DNAbin or AAbin (ape)")


  if (length(labels(sequences)) < 8)
    warning("GUIDANCE is not suitable for alignments of very few sequences.\n
      As a rule of thumb, use GUIDANCE2 or HoT for less than 8 sequences.")

  ## Check for MSA program
  if (missing(exec)){
    os <- Sys.info()
    os <- os[grep("sysname", names(os))]
    if (msa.program == "mafft") {
      exec <- switch(os, Linux = "mafft", Darwin = "mafft",
        Windows = "mafft.bat")
    }
    if (msa.program == "muscle") {
      exec <- switch(os, Linux = "muscle", Darwin = "muscle",
        Windows = "muscle3.8.31_i86win32.exe")
    }
    if (msa.program == "clustalo") {
      exec <- switch(os, Linux = "clustalo", Darwin = "clustalo",
        Windows = "clustalo.exe")
    }
    if (msa.program == "clustalw2") {
      exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
        Windows = "clustalw2.exe")
    }
  }
  out <- system(paste(exec, "--v"), ignore.stdout = TRUE, ignore.stderr = TRUE)
  if (out == 127)
    stop("please provide exec path or install MSA program in root \n
      i.e. in Unix: '/usr/local/bin/mafft'")

  ## generate some parameters if not specified
  ## -----------------------------------------
  ## number of cores
  if (ncore == "auto"){
    ncore <- detectCores(all.tests = FALSE, logical = TRUE)
  }
  ## Sequence type
  # type <- class(sequences)
  # type <- gsub("bin", "", type)

  ##############################################
  ## PART I
  ##############################################
  ## BASE and ALTERNATIVE MSAs
  ##############################################

  ## Generate BASE MSA
  #---------------------------------------------
  cat("Generating the base alignment")
  if (msa.program == "mafft"){
    base.msa <- mafft(x = sequences, method = method, exec = exec)
  }
  if (msa.program == "muscle"){
    base.msa <- muscle2(x = sequences, exec = exec)
  }
  if (msa.program == "clustalo"){
    base.msa <- clustalo(x = sequences, exec = exec)
  }
  if (msa.program == "clustalw2"){
    base.msa <- clustalw2(x = sequences, exec = exec)
  }
  cat("... done \n")

  ## Constructing BP guide-trees for the pertubated MSAs
  #--------------------------------------------------------
  cat("Generating NJ guide trees \n")
  ## Compute NJ guide trees

  if (nj.program =="semphy"){
    #### SEMPHY  ###
    if(nrow(base.msa) > 150){
      if (type == "DNA")
        model <- "--nucjc"
      if (type == "AA")
        model <- "--aaJC"
    } else {
      if (tpye == "DNA")
        model <- "--hky"
      if (type == "AA")
        model <- "--jtt"
    }
    nj.guide.trees <- semphy(msa = base.msa,
      dist.table<- "-J",
      model <- model,
      arsv <- "-H",
      bootstrap = bootstrap,
      verbose = verbose)

    # lapply(nj.guide.trees, root, outgroup = )
    nj.guide.trees <- lapply(nj.guide.trees, compute.brlen)
  }

  pb <- txtProgressBar(max = bootstrap, style = 3)

  if (nj.program == "R"){
    if (parallel){
      progress <- function(n) setTxtProgressBar(pb, n)
      opts <- list(progress = progress)

      cl <- makeCluster(ncore)
      registerDoSNOW(cl)

      nj.guide.trees <- foreach(i = 1:bootstrap,
        .options.snow = opts,
        .packages = "phangorn", .export = 'msaBP_nj_tree') %dopar% {
          msaBP_nj_tree(base.msa, outgroup = "auto")
        }
      stopCluster(cl)
    }
    if (!parallel){
      nj.guide.trees <- foreach(i = 1:bootstrap, .packages = "phangorn") %do% {
        setTxtProgressBar(pb, i)
        msaBP_nj_tree(base.msa, outgroup = "auto")
      }
    }
    close(pb)
  }

  ## Alignment of MSA BP times with new NJ guide trees
  ## -------------------------------------------------
  cat("Alignment of sequences using NJ guide trees \n")

  msa_out <- vector(length = bootstrap)
  for (i in seq_along(msa_out))
    msa_out[i] <- tempfile(pattern = "mafft", tmpdir = tempdir(), fileext = ".fas")
  unlink(msa_out[file.exists(msa_out)])

  pb <- txtProgressBar(max = bootstrap, style = 3)

  if (parallel){
    progress <- function(n) setTxtProgressBar(pb, n)
    opts <- list(progress = progress)
    cl <- makeCluster(ncore)
    registerDoSNOW(cl)

    if (msa.program == "mafft"){
      foreach(i = 1:bootstrap, .packages = c('ips', 'ape'),
        .options.snow = opts)  %dopar% {
          mafft(x = sequences, gt = nj.guide.trees[[i]],
            exec = exec, file = msa_out[i], method = method)
        }
    }
    if (msa.program == "muscle"){
      foreach(i = 1:bootstrap, .packages = c('ips', 'ape'),
        .options.snow = opts)  %dopar% {
          muscle2(x = sequences, gt = nj.guide.trees[[i]],
            exec = exec, file = msa_out[i])
        }
    }
    if (msa.program == "clustalo"){
      foreach(i = 1:bootstrap, .packages = c('ips', 'ape'),
        .options.snow = opts)  %dopar% {
          clustalo(x = sequences, gt = nj.guide.trees[[i]],
            exec = exec, file = msa_out[i])

        }
    }
    if (msa.program == "clustalw2"){
      foreach(i = 1:bootstrap, .packages = c('ips', 'ape'),
        .options.snow = opts)  %dopar% {
          clustalw2(x = sequences, gt = nj.guide.trees[[i]],
            exec = exec, file = msa_out[i])
        }
    }
    stopCluster(cl)
  }

  if (!parallel){
    if (msa.program == "mafft"){
      foreach(i = 1:bootstrap, .packages = c('ips', 'ape'))  %do% {
        setTxtProgressBar(pb, i)
        mafft2(x = sequences, gt = nj.guide.trees[[i]],
          method = method, exec = exec, file = msa_out[i])
      }
    }
    if (msa.program == "muscle"){
      foreach(i = 1:bootstrap, .packages = c('ips', 'ape'))  %do% {
        setTxtProgressBar(pb, i)
        muscle2(x = sequences, gt = nj.guide.trees[[i]],
          exec = exec, file = msa_out[i])
      }
    }
    if (msa.program == "clustalo"){
      foreach(i = 1:bootstrap, .packages = c('ips', 'ape'))  %do% {
        setTxtProgressBar(pb, i)
        clustalo(x = sequences, gt = nj.guide.trees[[i]],
          exec = exec, file = msa_out[i])
      }
    }
    if (msa.program == "clustalw2"){
      foreach(i = 1:bootstrap, .packages = c('ips', 'ape'))  %do% {
        setTxtProgressBar(pb, i)
        clustalw2(x = sequences, gt = nj.guide.trees[[i]],
          exec = exec, file = msa_out[i])
      }
    }
  }
  close(pb)

  mafft_created <- list.files(getwd(),
    full.names = TRUE)[grep("tree.mafft", list.files(getwd()))]
  if (length(mafft_created)){
    file.remove(mafft_created)
  }

  ##############################################
  ## PART II
  ##############################################
  ## Computation of GUIDANCE scores
  ##############################################
  cat("Calculating GUIDANCE scores \n")

  # ## GUIDANCE Score
  #----------------------------------------------
  ## Create temporary dir for alternative MSAs
  dir.create(paste(tempdir(), "alt", sep = "/"))
  files_from <- list.files(tempdir(), full.names = TRUE)
  files_from <- files_from[grep("\\.fas", files_from)]
  files_to <- list.files(tempdir())
  files_to <- files_to[grep("\\.fas", files_to)]
  files_to <- paste(tempdir(), "alt", files_to, sep="/")

  ## Move files into new dir
  file.rename(files_from, files_to)


  ## Run msa_set_score
  #---------------------
  scores <- compareMSAs(ref = base.msa,
                        dir_path = paste(tempdir(), "alt", sep = "/"),
                        com = NULL)


  ##  if wanted, store alternative MSAs into a zip file
  if (!missing(alt.msas.file)){
    files <- list.files(paste(tempdir(), "alt", sep = "/"))
    files <- files[grep("HoT", files)]
    for (i in 1:(n.coopt*bootstrap)){
      file.rename(paste(tempdir(), files[i], sep = "/"),
        paste(tempdir(), paste0("altMSA", i, ".fas"), sep = "/"))}
    files <- list.files(tempdir(), full.names = TRUE)
    files <- files[grep("altMSA*", files)]
    zip(zipfile = alt.msas.file, files = files)

    # maybe better to use gzfile
    ## currently zip creates many wired subfolders
  }

  ## Delete temporary files
  # this deletion approach has proven best to delete everything
  files <- list.files(tempdir(), full.names = TRUE)
  files <- files[-grep("rs-graphics", files)]
  unlink(files, force = TRUE, recursive = TRUE)

  ## Reduce MSA according to cutoffs, it wanted
  ## WILL BE TRASFERRED TO METHOD FOR polentaDNA class
  #---------------------------------------------------
  # msa <- guidance.msa <- base.msa
  # msa <- as.character(msa)
  # ## masking residues below cutoff
  # if (mask.cutoff > 0){
  #   txt <- as.vector(as.character(base.msa))
  #   # mat <- data.frame(rpr.sc, txt)
  #   mat <- data.frame(scores$residue_pair_residue_score, txt)
  #   rown <- max(mat$residue)
  #   coln <- max(mat$col)
  #   res_mat <- matrix(mat$score, nrow = rown, ncol = coln)
  #
  #   if (mask.cutoff == "auto"){ mask.cutoff <- 0.50 }
  #
  #   if (inherits(sequences, "DNAbin")){
  #     msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "N"
  #     msa <- as.DNAbin(msa)
  #   }
  #   if (inherits(sequences, "AAbin")) {
  #     msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "X"
  #     rownames(msa) <- labels(sequences)
  #     class(msa) <- "AAbin"
  #   }
  #   guidance.msa <- msa
  # }
  # ## remove unreliable columns
  # if (col.cutoff > 0){
  #   if (mask.cutoff) {
  #     msa <- guidance.msa
  #   } else {
  #     msa <- base.msa
  #     }
  #   if (col.cutoff == "auto"){ col.cutoff <- 0.97 }
  #   # remove_cols <- g.cs$res_pair_col_score < col.cutoff
  #   remove_cols <- scores$column_score$CS < col.cutoff
  #   guidance.msa <- msa[, !remove_cols]
  # }
  # ## remove unreliable sequences
  # if (seq.cutoff > 0){
  #   if (mask.cutoff) { msa <- guidance.msa
  #   } else { msa <- base.msa }
  #   if (seq.cutoff =="auto"){seq.cutoff <- 0.5}
  #   # remove_sequences <- rps.sc$res_pair_seq_score < seq.cutoff
  #   remove_sequences <- scores$residual_pair_sequence_score$score < seq.cutoff
  #   guidance.msa <- msa[!remove_sequences,]
  # }
  #
  # ## Generate output
  # res <- list(scores = scores,
  #   GUIDANCE_msa = guidance.msa,
  #   base_msa = base.msa)

  ## Prepare and return output
  ## -------------------------
  m <- matrix(scores$residue_pair_residue_score$score, nrow = nrow(base.msa))
  polentaDNA(base.msa, m, "guidance")
}
heibl/polenta documentation built on May 17, 2019, 3:22 p.m.