dep/guidance-dep.R

#' #' GUIDetree-based AligNment ConficencE
#' #'
#' #' @param sequencess An object of class \code{\link{DNAbin}} or \code{\link{AAbin}}
#' #'   containing unaligned sequences of DNA or amino acids.
#' #' @param parallel logical, if TRUE, specify the number of cores
#' #' @param ncore number of cores (default is maxinum of local threads)
#' #' @param bootstrap An integer giving the number of perturbated MSAs.
#' #' @param msa.program A charcter string giving the name of the MSA program,
#' #'   currelty one of c("mafft", "muscle", "clustalo", "clustalw2"); MAFFT is default
#' #' @param exec A character string giving the path to the executable of the
#' #'   alignment program.
#' #' @param method further arguments passed to mafft, default is "auto"
#' #' @param col.cutoff numberic between 0 and 1; specifies a cutoff to remove unreliable columns below the cutoff; either user supplied or "auto" (0.73)
#' #' @param seq.cutoff numberic between 0 and 1; specifies a cutoff to remove unreliable sequences below the cutoff; either user supplied of "auto" (0.5)
#' #' @param mask.cutoff specific residues below a certain cutoff are masked ('N' for DNA, 'X' for AA); either user supplied of "auto" (0.5)
#' #' @description Assess uncertainty in multiple sequence aligment (MSA) with the
#' #'   GUIDANCE algorithm (Penn et al. 2010).
#' #' @return alignment_score
#' #' @return GUIDANCE_residue_score
#' #' @return GUIDANCE_score: is the GUIDANCE column score
#' #' @return GUIDANCE_sequence_score
#' #' @return guidance_msa: is the base MSA removed from unreliable residues/columns/sequences below \code{cutoffs}
#' #' @return base_msa
#' #'
#' #' @details Calculates column confidence by comparing alternative MSAS generated by bootstrap MSAs (Felsenstein 1985) and derived guide trees. The basic comparison between the BP MSAs and a reference MSA is if column residue pairs are identical in both MSAs.
#' #' @author Franz-Sebastian Krah
#' #' @references Felsenstein J. 1985. Confidence limits on phylogenies: an
#' #'   approach using the bootstrap. Evolution 39:783–791
#' #' @references Penn et al. (2010). An alignment confidence score capturing
#' #'   robustness to guide tree uncertainty. Molecular Biology and Evolution
#' #'   27:1759--1767
#' #'
#' #' @import ips
#' #' @import doSNOW
#' #' @import foreach
#' #' @import parallel
#' #' @import pbmcapply
#' #' @import plyr
#' #' @importFrom phangorn as.phyDat dist.ml
#' #'
#' #' @author Franz-Sebastian Krah
#' #' @author Christoph Heibl
#' #' @export
#'
#' guidance <- function(sequences,
#'   msa.program = "mafft", exec,
#'   bootstrap = 100,
#'   col.cutoff = "auto",
#'   seq.cutoff = "auto",
#'   mask.cutoff = "auto",
#'   parallel = FALSE, ncore ="auto",
#'   method = "auto",
#'   alt.msas.file){
#'
#'   ##############################################
#'   ## SOME CHECKS
#'   ##############################################
#'   if (!inherits(sequences, c("DNAbin", "AAbin")))
#'     stop("sequences not of class DNAbin or AAbin (ape)")
#'
#'
#'   if (length(labels(sequences)) < 8)
#'     warning("GUIDANCE is not suitable for alignments of very few sequences.\n
#'               As a rule of thumb, use GUIDANCE2 or HoT for less than 8 sequences.")
#'
#'   ## Check for MSA program
#'   if(missing(exec)){
#'     os <- Sys.info()[1]
#'     if (msa.program =="mafft") {
#'       exec <- switch(os, Linux = "mafft", Darwin = "mafft",
#'         Windows = "mafft.bat")
#'     }
#'     if (msa.program =="muscle") {
#'       exec <- switch(os, Linux = "muscle", Darwin = "muscle",
#'         Windows = "muscle3.8.31_i86win32.exe")
#'     }
#'     if (msa.program =="clustalo") {
#'       exec <- switch(os, Linux = "clustalo", Darwin = "clustalo",
#'         Windows = "clustalo.exe")
#'     }
#'     if (msa.program =="clustalw2") {
#'       exec <- switch(os, Linux = "clustalw", Darwin = "clustalw2",
#'         Windows = "clustalw2.exe")
#'     }
#'   }
#'   out <- system(paste(exec, "--v", sep=" "), ignore.stdout = TRUE, ignore.stderr = TRUE)
#'   if (out == 127)
#'     stop("please provide exec path or install MSA program in root \n
#'       i.e. in Unix: '/usr/local/bin/mafft'")
#'
#'
#'   type <- class(sequences)
#'   type <- gsub("bin", "", type)
#'
#'   if(ncore=="auto"){
#'     ncore <- detectCores(all.tests = FALSE, logical = TRUE)
#'   }
#'
#'   ##############################################
#'   ## PART I
#'   ##############################################
#'   ## BASE and PERTUBATED MSAs
#'   ##############################################
#'
#'   ## Generate BASE alignment
#'   ###########################
#'   cat("Generating the base alignment")
#'   if (msa.program == "mafft"){
#'     base.msa <- mafft(x = sequences, method = method, exec = exec)
#'   }
#'   if (msa.program == "muscle"){
#'     base.msa <- muscle2(x = sequences, exec = exec)
#'   }
#'   if (msa.program == "clustalo"){
#'     base.msa <- clustalo(x = sequences, exec = exec)
#'   }
#'   if (msa.program == "clustalw2"){
#'     base.msa <- clustalw2(x = sequences, exec = exec)
#'   }
#'
#'   cat("... done \n")
#'
#'   ## form into matrix for perturbation
#'   # base.msa <- as.character(base.msa)
#'
#'   ## Constructing BP guide-trees for the pertubated MSAs
#'   #######################################################
#'   cat("Generating NJ guide trees \n")
#'   ## Compute NJ guide trees
#'
#'   if(semphy){
#'     #### SEMPHY  ###
#'     if(nrow(base.msa)>150){
#'       if(tpye == "DNA")
#'         model <- "--nucjc"
#'       if(type == "AA")
#'         model <- "--aaJC"
#'     }else{
#'       if(tpye == "DNA")
#'         model <- "--hky"
#'       if(type == "AA")
#'         model <- "--jtt"
#'     }
#'     nj.guide.trees <- semphy(msa = base.msa,
#'       dist.table<- "-J",
#'       model <- model,
#'       arsv <- "-H",
#'       bootstrap = bootstrap,
#'       verbose = verbose)
#'
#'     # lapply(nj.guide.trees, root, outgroup = )
#'     nj.guide.trees <- lapply(nj.guide.trees, compute.brlen)
#'   }
#'
#'   Rnj = TRUE
#'   if(Rnj){
#'     if (parallel){
#'       progress <- function(n) setTxtProgressBar(pb, n)
#'       opts <- list(progress = progress)
#'
#'       cl <- makeCluster(ncore)
#'       registerDoSNOW(cl)
#'
#'       nj.guide.trees <- foreach(i = 1:bootstrap,
#'         .options.snow = opts,
#'         .packages = "phangorn", .export = 'msaBP_nj_tree') %dopar% {
#'           msaBP_nj_tree(base.msa, outgroup = "auto")
#'         }
#'       stopCluster(cl)
#'     }
#'     if (!parallel){
#'       nj.guide.trees <- foreach(i = 1:bootstrap, .packages = "phangorn") %do%{
#'         setTxtProgressBar(pb, i)
#'         msaBP_nj_tree(base.msa, outgroup = "auto")
#'       }
#'     }
#'     close(pb)
#'   }
#'
#'
#'
#'   ## Alignment of MSA BP times with new NJ guide trees
#'   ## -------------------------------------------------
#'   cat("Alignment of sequences using NJ guide trees \n")
#'
#'   msa_out <- vector(length = bootstrap)
#'   for (i in seq_along(msa_out))
#'     msa_out[i] <- tempfile(pattern = "mafft", tmpdir = tempdir(), fileext = ".fas")
#'   unlink(msa_out[file.exists(msa_out)])
#'
#'   pb <- txtProgressBar(max = bootstrap, style = 3)
#'
#'   if (parallel){
#'     progress <- function(n) setTxtProgressBar(pb, n)
#'     opts <- list(progress = progress)
#'     cl <- makeCluster(ncore)
#'     registerDoSNOW(cl)
#'
#'     if (msa.program == "mafft"){
#'       foreach(i = 1:bootstrap, .packages=c('ips', 'ape'),
#'         .options.snow = opts)  %dopar% {
#'           mafft2(x = sequences, gt = nj.guide.trees[[i]],
#'             exec = exec, file = msa_out[i], method = method)
#'         }
#'     }
#'     if (msa.program == "muscle"){
#'       foreach(i = 1:bootstrap, .packages=c('ips', 'ape'),
#'         .options.snow = opts)  %dopar% {
#'           muscle2(x = sequences, gt = nj.guide.trees[[i]],
#'             exec = exec, file = msa_out[i])
#'         }
#'     }
#'     if (msa.program == "clustalo"){
#'       foreach(i = 1:bootstrap, .packages =c('ips', 'ape'),
#'         .options.snow = opts)  %dopar% {
#'           clustalo(x = sequences, gt = nj.guide.trees[[i]],
#'             exec = exec, file = msa_out[i])
#'
#'         }
#'     }
#'     if (msa.program == "clustalw2"){
#'       foreach(i = 1:bootstrap, .packages=c('ips', 'ape'),
#'         .options.snow = opts)  %dopar% {
#'           clustalw2(x = sequences, gt = nj.guide.trees[[i]],
#'             exec = exec, file = msa_out[i])
#'         }
#'     }
#'     stopCluster(cl)
#'   }
#'
#'   if (!parallel){
#'     if (msa.program == "mafft"){
#'       foreach(i = 1:bootstrap, .packages=c('ips', 'ape'))  %do% {
#'         setTxtProgressBar(pb, i)
#'         mafft2(x = sequences, gt = nj.guide.trees[[i]],
#'           method = method, exec = exec, file = msa_out[i])
#'         }
#'     }
#'     if (msa.program == "muscle"){
#'       foreach(i = 1:bootstrap, .packages=c('ips', 'ape'))  %do% {
#'         setTxtProgressBar(pb, i)
#'         muscle2(x = sequences, gt = nj.guide.trees[[i]],
#'           exec = exec, file = msa_out[i])
#'         }
#'     }
#'     if (msa.program == "clustalo"){
#'       foreach(i = 1:bootstrap, .packages =c('ips', 'ape'))  %do% {
#'         setTxtProgressBar(pb, i)
#'         clustalo(x = sequences, gt = nj.guide.trees[[i]],
#'           exec = exec, file = msa_out[i])
#'         }
#'     }
#'     if (msa.program == "clustalw2"){
#'       foreach(i = 1:bootstrap, .packages=c('ips', 'ape'))  %do% {
#'         setTxtProgressBar(pb, i)
#'         clustalw2(x = sequences, gt = nj.guide.trees[[i]],
#'           exec = exec, file = msa_out[i])
#'         }
#'     }
#'   }
#'   close(pb)
#'
#'   mafft_created <- list.files(getwd(),
#'     full.names = T)[grep("tree.mafft", list.files(getwd()))]
#'   if(length(mafft_created)>0){
#'     file.remove(mafft_created)
#'   }
#'
#'   # removing some intermediate objects not further needed
#'   # rm(base.msa.bp)
#'   # rm(nj.guide.trees)
#'
#'   ##############################################
#'   ## PART II
#'   ##############################################
#'   ## Computation of GUIDANCE scores
#'   ##############################################
#'   cat("Calculating GUIDANCE scores \n")
#'
#'   # ## GUIDANCE Score
#'   # ##############################################
#'
#'   ## Transform MSAs for input to *msa_set_score*
#'   # base.msa.t <- data.frame(t(as.character(base.msa)))
#'
#'
#' #   compareMSAs2 <- function(ref, com = msa_out[i], type =type){
#' #     guide.msa <- read.fas(com, type = type)
#' #     guide.msa <- guide.msa[match(rownames(ref), rownames(guide.msa)),]
#' #     compareMSAs(ref = ref, com = guide.msa)
#' #   }
#' #
#' #   ## Run scores program msa_set_score
#' #   system.time(
#' #   if (parallel){
#' #     pb <- txtProgressBar(max = bootstrap, style = 3)
#' #     progress <- function(n) setTxtProgressBar(pb, n)
#' #     opts <- list(progress = progress)
#' #
#' #     cl <- makeCluster(ncore)
#' #     registerDoSNOW(cl)
#' #     altres <- foreach(i = 1:bootstrap, .options.snow = opts,
#' #       .export = 'compareMSAs', .packages = 'ips') %dopar% {
#' #         compareMSAs2(ref = base.msa, com = list.files( paste(tempdir(), "alt", sep="/"), full.names = TRUE)[i], type =type)
#' #       }
#' #     stopCluster(cl)
#' #   }
#' # )
#'
#'   # if (!parallel){
#'   #   altres <- foreach(i = 1:bootstrap,
#'   #     .export = 'compareMSAs') %do% {
#'   #       setTxtProgressBar(pb, i)
#'   #       compareMSAs2(ref = base.msa, com = msa_out[i])
#'   #     }
#'   # }
#'   # close(pb)
#'
#'   # running it as list does not require loading the intermediate results (read.table)
#'   # especially RPRSC is very time consuming.
#'   # so running msa_set_score with -d is appr. 6 times faster
#'   dir.create(paste(tempdir(), "alt", sep="/"))
#'   files_from <- list.files(tempdir(), full.names = TRUE)
#'   files_from <- files_from[grep("\\.fas", files_from)]
#'   files_to <- list.files(tempdir())
#'   files_to <- files_to[grep("\\.fas", files_to)]
#'   files_to <- paste(tempdir(), "alt", files_to, sep="/")
#'
#'   file.rename(files_from, files_to)
#'
#'
#'   scores <- compareMSAs(ref = base.msa, dir_path = paste(tempdir(), "alt", sep ="/"), com = NULL)
#'
#'
#'
#'   # ### Calculate mean scores
#'   # #------------------------------
#'   # # Mean score
#'   # msc <- do.call(rbind, lapply(altres, function(x) x[[1]]))
#'   # msc[] <- lapply(msc, as.numeric)
#'   # msc <- colMeans(msc)
#'   # msc <- data.frame(msc)
#'   #
#'   # ## Column score
#'   # CS <- do.call(cbind, lapply(altres, function(x) x[[2]]))
#'   # del <- grep("col", names(CS))
#'   # CS <- CS[, -del[2:length(del)]]
#'   # CS <- data.frame(col = CS[, 1],
#'   #   column_score = rowMeans(CS[,2:ncol(CS)], na.rm = TRUE))
#'   #
#'   # ## Residue pair column score (GUIDANCE Score)
#'   # g.cs <- do.call(cbind, lapply(altres, function(x) x[[3]]))
#'   # del <- grep("col\\b", names(g.cs))
#'   # g.cs <- g.cs[, -del[2:length(del)]]
#'   # g.cs <- data.frame(col = g.cs[, 1],
#'   #   res_pair_col_score = rowMeans(g.cs[,2:ncol(g.cs)], na.rm = TRUE))
#'   #
#'   # ## GUIDANCE Alignment score
#'   # alignment_score <- mean(g.cs[, 2])
#'   # msc <- rbind(msc, MEAN_GUIDANCE_SCORE = alignment_score)
#'   #
#'   # # Residue pair residue score
#'   # rpr.sc <- do.call(cbind, lapply(altres, function(x) x[[4]]))
#'   # del <- grep("col\\b|residue", names(rpr.sc))
#'   # rpr.sc <- rpr.sc[, -del[3:length(del)]]
#'   # rpr.sc <- data.frame(col = rpr.sc[, 1], residue = rpr.sc[, 2],
#'   #   res_pair_res_score = rowMeans(rpr.sc[,3:ncol(rpr.sc)], na.rm = TRUE))
#'   #
#'   # # Residual pair sequence pair score
#'   # rpsp.sc <- do.call(cbind, lapply(altres, function(x) x[[5]]))
#'   # del <- grep("seq_row1|seq_row2", names(rpsp.sc))
#'   # rpsp.sc <- rpsp.sc[, -del[3:length(del)]]
#'   # rpsp.sc <- data.frame(seq1 = rpsp.sc[, 1], seq2 = rpsp.sc[, 2],
#'   #   res_pair_seq_pair_score = rowMeans(rpsp.sc[,3:ncol(rpsp.sc)], na.rm = TRUE))
#'   #
#'   # # Residual pair sequence score
#'   # rps.sc <- do.call(cbind, lapply(altres, function(x) x[[6]]))
#'   # del <- grep("seq", names(rps.sc))
#'   # rps.sc <- rps.sc[, -del[2:length(del)]]
#'   # rps.sc <- data.frame(seq = rps.sc[, 1],
#'   #   res_pair_seq_score = rowMeans(rps.sc[,2:ncol(rps.sc)], na.rm = TRUE))
#'   #
#'   # # Residue pair score
#'   # rp.sc <- do.call(cbind, lapply(altres, function(x) x[[7]]))
#'   # del <- grep("col|row1|row2", names(rp.sc))
#'   # rp.sc <- rp.sc[, -del[4:length(del)]]
#'   # rp.sc <- data.frame(col = rp.sc[, 1], row1 = rp.sc[,2], col2 = rp.sc[,3],
#'   #   res_pair_score = rowMeans(rp.sc[,4:ncol(rp.sc)], na.rm = TRUE))
#'   ### Calculate mean scores DONE
#'
#'   ##  if wanted, store alternative MSAs into a zip file
#'   if(!missing(alt.msas.file)){
#'     files <- list.files(paste(tempdir(), "alt", sep="/"))
#'     files <- files[grep("HoT", files)]
#'     for(i in 1:(n.coopt*bootstrap)){
#'       file.rename(paste(tempdir(), files[i], sep="/"),
#'         paste(tempdir(), paste("altMSA", i, ".fas",sep=""), sep="/"))}
#'     files <- list.files(tempdir(), full.names = TRUE)
#'     files <- files[grep("altMSA*", files)]
#'     zip(zipfile = alt.msas.file, files = files)
#'
#'     # maybe better to use gzfile
#'     ## currently zip creates many wired subfolders
#'   }
#'
#'   ## delete temporary files in temporary directory
#'   unlink(msa_out[file.exists(msa_out)], force = TRUE, recursive = TRUE)
#'   unlink(list.files(paste(tempdir(),"alt", sep="/"), full.names = TRUE), force = TRUE, recursive = TRUE)
#'   files <- list.files(tempdir(), full.names = TRUE)
#'   files <- files[-grep("rs-graphics", files)]
#'   unlink(files, force = TRUE, recursive = TRUE)
#'
#'   msa <- guidance.msa <- base.msa
#'   msa <- as.character(msa)
#'   ## masking residues below cutoff
#'   if (mask.cutoff>0){
#'     txt <- as.vector(as.character(base.msa))
#'     # mat <- data.frame(rpr.sc, txt)
#'     mat <- data.frame(scores$residue_pair_residue_score, txt)
#'     rown <- max(mat$residue)
#'     coln <- max(mat$col)
#'     res_mat <- matrix(mat$score, nrow = rown, ncol = coln)
#'
#'     if (mask.cutoff=="auto"){ mask.cutoff <- 0.50 }
#'
#'     if (inherits(sequences, "DNAbin")){
#'       msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "N"
#'       msa <- as.DNAbin(msa)
#'     }
#'     if (inherits(sequences, "AAbin")) {
#'       msa[res_mat<mask.cutoff & !is.na(res_mat)] <- "X"
#'       rownames(msa) <- labels(sequences)
#'       class(msa) <- "AAbin"
#'     }
#'     guidance.msa <- msa
#'   }
#'   ## remove unreliable columns
#'   if (col.cutoff>0){
#'     if (mask.cutoff>0) { msa <- guidance.msa
#'     } else { msa <- base.msa }
#'     if(col.cutoff =="auto"){col.cutoff <- 0.97}
#'     # remove_cols <- g.cs$res_pair_col_score < col.cutoff
#'     remove_cols <- scores$column_score$CS < col.cutoff
#'     guidance.msa <- msa[,!remove_cols]
#'   }
#'   ## remove unreliable sequences
#'   if (seq.cutoff>0){
#'     if (mask.cutoff>0) { msa <- guidance.msa
#'     } else { msa <- base.msa }
#'     if (seq.cutoff =="auto"){seq.cutoff <- 0.5}
#'     # remove_sequences <- rps.sc$res_pair_seq_score < seq.cutoff
#'     remove_sequences <- scores$residual_pair_sequence_score$score < seq.cutoff
#'     guidance.msa <- msa[!remove_sequences,]
#'   }
#'
#'   ## prepare base.msa for output
#'   if(inherits(sequences, "DNAbin") & !inherits(base.msa, "DNAbin"))
#'     base.msa <- as.DNAbin(base.msa)
#'   if(inherits(sequences, "AAbin") & !inherits(base.msa, "AAbin"))
#'     base.msa <- as.AAbin(base.msa)
#'
#'   ## Produce output
#'   # res <- list(mean_score = msc,
#'   #   column_score = CS,
#'   #   GUIDANCE_column_score = g.cs,
#'   #   residue_pair_residue_score = rpr.sc,
#'   #   residual_pair_sequence_pair_score  = rpsp.sc,
#'   #   residual_pair_sequence_score = rps.sc,
#'   #   residue_pair_score = rp.sc,
#'   #   GUIDANCE_msa = guidance.msa,
#'   #   base_msa = base.msa)
#'
#'   res <- list(scores = scores,
#'     GUIDANCE_msa = guidance.msa,
#'     base_msa = base.msa)
#'   return(res)
#' }
heibl/polenta documentation built on May 17, 2019, 3:22 p.m.