rpg: Multiple Sequence Alignment with PASTA and GUIDANCE

Documented in HoT

#' Heads or Tails alignment reliability
#'
#' @description MSA reliability assessment HoT (Landan and Graur 2008)
#'
#' @param sequences An object of class \code{\link{DNAbin}} or \code{\link{AAbin}}
#'   containing unaligned sequences of DNA or amino acids.
#'   @param cutoff specifies a cutoff to remove unreliable columns below the cutoff
#' @param col.cutoff numberic between 0 and 1; specifies a cutoff to remove unreliable columns below the cutoff; either user supplied or "auto" (0.73)
#' @param seq.cutoff numberic between 0 and 1; specifies a cutoff to remove unreliable sequences below the cutoff; either user supplied of "auto" (0.5)
#'@param mask.cutoff specific residues below a certain cutoff are masked ('N' for DNA, 'X' for AA); either user supplied of "auto" (0.5)
#' @param parallel logical, if TRUE, specify the number of cores
#' @param ncore number of cores (default is 'auto')
#' @param msa.program A charcter string giving the name of the MSA program,
#'   one of c("mafft", "muscle", "clustalo", "clustalw2"); MAFFT is default
#' @param exec A character string giving the path to the executable of the
#'   alignment program.
#' @param method further arguments passed to MAFFT, default is "auto"
#'
#' @return list containing following scores and alignments:
#' @return mean_scores residue pair score and mean column score
#' @return column_score
#' @return residue_column_score column reliability
#' @return residue_pair_residue_score
#' @return residual_pair_sequence_pair_score
#' @return residual_pair_sequence_score
#' @return residue_pair_score
#' @return base_msa
#' @return HoT_msa is the base_MSA removed from unreliable residues/columns/sequences below cutoffs
#'
#' @details Calculates column reliability (and other scors) by comparing alternative
#'   MSAs generated by aligning guide tree partitions as described in Landan and Graur (2008). For details see \code{compareMSAs}. 8*(N-3) alternative MSAs are generated by default, where N is the number of sequences.
#' @references G. Landan and D. Graur (2008). Local reliability measures from sets of co-optimal multiple sequencesuence alignments. 13:15--24
#'
#' @import ips
#' @import doSNOW
#' @import foreach
#' @import parallel
#' @import pbmcapply
#' @import plyr
#' @importFrom phangorn as.phyDat dist.ml
#' @import adephylo
#' @importFrom phytools plotTree
#'
#' @seealso \code{\link{compareMSAs}}, \code{\link{guidance}}, \code{\link{guidance2}}
#'
#' @author Franz-Sebastian Krah
#' @author Christoph Heibl
#' @export


HoT <- function(sequences,
  msa.program = "mafft", exec,
  type,
  n.coopt ="auto",
  col.cutoff = "auto",
  seq.cutoff = "auto",
  mask.cutoff = "auto",
  parallel = FALSE, ncore = "auto",
  method = "auto",
  plot_guide = TRUE,
  alt.msas.file) {

  ##############################################
  ## SOME CHECKS
  ##############################################
  if(!is.object(sequences)){
    read.fas(sequences, type = type)
  }

  if (!inherits(sequences, c("DNAbin","AAbin")))
    stop("sequencesuences not of class DNAbin or AAbin (ape)")

  if(length(sequences)>200)
    message("N seq > 200: consider using 'pasta' with desired MSA confidence program")

  ## Check for MSA program
  if(missing(exec)){
    os <- Sys.info()[1]
    if (msa.program =="mafft") {
      exec <- switch(os, Linux = "mafft", Darwin = "mafft",
        Windows = "mafft.bat")
    }
    if (msa.program =="muscle") {
      exec <- switch(os, Linux = "muscle", Darwin = "muscle",
        Windows = "muscle3.8.31_i86win32.exe")
    }
    if (msa.program =="clustalo") {
      exec <- switch(os, Linux = "clustalo", Darwin = "clustalo",
        Windows = "clustalo.exe")
    }
    if (msa.program =="clustalw2") {
      exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
      Windows = "clustalw2.exe")
    }
    out <- system(paste(exec, "--v", sep=" "), ignore.stdout = TRUE, ignore.stderr = TRUE)
    if (out == 127)
      stop("please provide exec path or install MSA program in root \n
        i.e. in Unix: '/usr/local/bin/mafft'")
  }

  ## generate some parameters if not specified
  #---------------------------------------------
  ## number of cores
  if(ncore=="auto"){
    ncore <- detectCores(all.tests = FALSE, logical = TRUE)
  }

  ## Sequence type, needed for ips::read.fas
  type <- class(sequences)
  type <- gsub("bin", "", type)

  ##############################################
  ## PART I
  ##############################################
  ## BASE and ALTERNATIVE MSAs
  ##############################################
  cat("Generating the base alignment")
  if (msa.program == "mafft"){
    base.msa <- mafft(sequences, method = method, exec = exec)
  }
  if (msa.program == "muscle"){
    base.msa <- muscle2(sequences, exec = exec)
  }
  if (msa.program == "clustalo"){
    base.msa <- clustalo(x = sequences, exec = exec)
  }
  if (msa.program == "clustalw2"){
    base.msa <- clustalw2(x = sequences, exec = exec)
  }
  cat("... done \n")

  ## Calculate start guide tree
  #----------------------------------------------
  cat("Calculate start tree")
  base.msa.ml <- as.phyDat(base.msa)
  # find ML distance as input to nj tree search
  ml.dist.msa <- dist.ml(base.msa.ml)
  # NJ
  start_tree <- ape::nj(ml.dist.msa)
  start_tree <- multi2di(start_tree)
  start_tree <- compute.brlen(start_tree)

  # plot guide tree
  if(plot_guide){
    phytools::plotTree(start_tree)
    legend("bottomleft",
      paste(Ntip(start_tree)," tips","; ",
        Ntip(start_tree)-3, " partitions", sep =""),
      bty = "n")
  }
  cat("... done \n")

  ## produce MSA partitions
  align_parts <- partitions(start_tree)

  # here could be a sampling of co-opts like in
  # guidance2. now we sample all
  n.coopt.sub <- rep("all", ncol(align_parts))
  n.coopt <- (Ntip(start_tree)-3)*8


  ##############################################
  ## PART II
  ##############################################
  ## Co-optimal MSAs
  ##############################################
  cat(paste("Sampling", n.coopt, "co-optimal alignments \n", sep=" "))

  ## Create temporary files
  #----------------------------------------------
  msa_out <- vector(length = n.coopt)
  for (i in seq_along(msa_out))
    msa_out[i] <- tempfile(pattern = "HoT", tmpdir = tempdir(), fileext = ".fas")
  unlink(msa_out[file.exists(msa_out)])

  # predifined file storage allocation (because it runs in batches of 8)
  start <- seq(1,n.coopt,8)
  end <- seq(8,n.coopt,8)
  stend <- data.frame(start, end)

  ## Function implementing batch alignment and storage in batches
  align_part_set2 <- function(x, partition_set, method,
    exec, msa.program, coopt.sub, files){
    alt_msas <- align_part_set(x = x,
      partition_set = partition_set,
      method = method, exec = exec, msa.program = msa.program,
      coopt.sub = coopt.sub)

    for(j in 1:8)
      write.fas(alt_msas[[j]], files[j])
  }


  ## Run batch alignments
  #----------------------------------------------
  pb <- txtProgressBar(max = ncol(align_parts), style = 3)
  if (parallel){
    progress <- function(n) setTxtProgressBar(pb, n)
    opts <- list(progress = progress)
    cl <- makeCluster(ncore)
    registerDoSNOW(cl)
    foreach(i = 1:ncol(align_parts), .options.snow = opts,
      .packages = c('rpg', 'ips', 'adephylo', 'foreach', 'phangorn')) %dopar% {
        # setTxtProgressBar(pb, i)
        align_part_set2(x = sequences, partition_set = align_parts[,i],
          coopt.sub = n.coopt.sub[i],
          method = method, exec = exec, msa.program = msa.program,
          files = msa_out[stend[i,1]:stend[i,2]])
      }
    stopCluster(cl)
  }
  if (!parallel){
    foreach(i = 1:ncol(align_parts)) %do% {
      setTxtProgressBar(pb, i)
      align_part_set2(x = sequences, partition_set = align_parts[,i],
        coopt.sub = n.coopt.sub,
        method = method, exec = exec, msa.program = msa.program,
        files = msa_out[stend[i,1]:stend[i,2]])
    }
  }
  close(pb)


  ##############################################
  ## PART III
  ##############################################
  ## Computation of reliability scores
  ##############################################
  cat("Calculating reliability scores \n")

  ## HoT Score
  #----------------------------------------------
  ## Create temporary dir for alternative MSAs
  dir.create(paste(tempdir(), "alt", sep="/"))
  files_from <- list.files(tempdir(), full.names = TRUE)
  files_from <- files_from[grep("\\.fas", files_from)]
  files_to <- list.files(tempdir())
  files_to <- files_to[grep("\\.fas", files_to)]
  files_to <- paste(tempdir(), "alt", files_to, sep="/")

  ## Move files into new dir
  file.rename(files_from, files_to)

  ## Run msa_set_score
  #---------------------
  scores <- compareMSAs(ref = base.msa,
    dir_path = paste(tempdir(), "alt", sep ="/"),
    com = NULL)


  ##  if wanted, store alternative MSAs into a zip file
  if(!missing(alt.msas.file)){
    files <- list.files(tempdir())
    files <- files[grep("HoT", files)]
    for(i in 1:(n.coopt*bootstrap)){
      file.rename(paste(tempdir(), files[i], sep="/"),
        paste(tempdir(), paste("altMSA", i, ".fas",sep=""), sep="/"))}
    files <- list.files(tempdir(), full.names = TRUE)
    files <- files[grep("altMSA*", files)]
    zip(zipfile = alt.msas.file, files = files)

    # maybe better to use gzfile
    ## currently zip creates many wired subfolders
  }

  ## Delete temporary files
  # this deletion approach has proven best to delete everything
  files <- list.files(tempdir(), full.names = TRUE)
  files <- files[-grep("rs-graphics", files)]
  unlink(files, force = TRUE, recursive = TRUE)


  ## Reduce MSA according to cutoffs, it wanted
  #---------------------------------------------
  msa <- HoT.msa <- base.msa
  msa <- as.character(msa)
  ## masking residues below cutoff
  if (mask.cutoff>0){
    txt <- as.vector(as.character(base.msa))
    mat <- data.frame(scores$residue_pair_residue_score, txt)
    rown <- max(mat$residue)
    coln <- max(mat$col)
    res_mat <- matrix(mat$score, nrow = rown, ncol = coln)

    if (mask.cutoff=="auto"){ mask.cutoff <- 0.50 }

    if (inherits(sequences, "DNAbin")){
      msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "N"
      msa <- as.DNAbin(msa)
    }
    if (inherits(sequences, "AAbin")) {
      msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "X"
      rownames(msa) <- labels(sequences)
      class(msa) <- "AAbin"
    }
    HoT.msa <- msa
  }
  ## remove unreliable columns
  if (col.cutoff>0){
    if (mask.cutoff>0) { msa <- HoT.msa
    } else { msa <- base.msa }
    if(col.cutoff =="auto"){col.cutoff <- 0.97}
    remove_cols <- scores$column_score$CS < col.cutoff
    HoT.msa <- msa[,!remove_cols]
  }
  ## remove unreliable sequences
  if (seq.cutoff>0){
    if (mask.cutoff>0) { msa <- HoT.msa
    } else { msa <- base.msa }
    if (seq.cutoff =="auto"){seq.cutoff <- 0.5}
    remove_sequences <- scores$residual_pair_sequence_score$score < seq.cutoff
    HoT.msa <- msa[!remove_sequences,]
  }

  # ## prepare base.msa for output
  # if(inherits(sequences, "DNAbin") & !inherits(base.msa, "DNAbin"))
  #   base.msa <- as.DNAbin(base.msa)
  # if(inherits(sequences, "AAbin") & !inherits(base.msa, "AAbin"))
  #   base.msa <- as.AAbin(base.msa)

  ## Generate output
  res <- list(scores = scores,
    HoT_msa = HoT.msa,
    base_msa = base.msa)

  ## Return output
  return(res)
}