R/hmmer3.R
In micropan: Microbial Pan-Genome Analysis

Documented in hmmerScan readHmmer

#' @name hmmerScan
#' @title Scanning a profile Hidden Markov Model database
#' 
#' @description Scanning FASTA formatted protein files against a database of pHMMs using the HMMER3
#' software.
#' 
#' @param in.files A character vector of file names.
#' @param dbase The full path-name of the database to scan (text).
#' @param out.folder The name of the folder to put the result files.
#' @param threads Number of CPU's to use.
#' @param verbose Logical indicating if textual output should be given to monitor the progress.
#' 
#' @details The HMMER3 software is purpose-made for handling profile Hidden Markov Models (pHMM)
#' describing patterns in biological sequences (Eddy, 2008). This function will make calls to the
#' HMMER3 software to scan FASTA files of proteins against a pHMM database. 
#' 
#' The files named in \samp{in.files} must contain FASTA formatted protein sequences. These files
#' should be prepared by \code{\link{panPrep}} to make certain each sequence, as well as the file name,
#' has a GID-tag identifying their genome. The database named in \samp{db} must be a HMMER3 formatted
#' database. It is typically the Pfam-A database, but you can also make your own HMMER3 databases, see
#' the HMMER3 documentation for help.
#' 
#' \code{\link{hmmerScan}} will query every input file against the named database. The database contains
#' profile Hidden Markov Models describing position specific sequence patterns. Each sequence in every
#' input file is scanned to see if some of the patterns can be matched to some degree. Each input file
#' results in an output file with the same GID-tag in the name. The result files give tabular output, and
#' are plain text files. See \code{\link{readHmmer}} for how to read the results into R.
#' 
#' Scanning large databases like Pfam-A takes time, usually several minutes per genome. The scan is set
#' up to use only 1 cpu per scan by default. By increasing \code{threads} you can utilize multiple CPUs, typically
#' on a computing cluster.
#' Our experience is that from a multi-core laptop it is better to start this function in default mode
#' from mutliple R-sessions. This function will not overwrite an existing result file, and multiple parallel
#' sessions can write results to the same folder.
#' 
#' @return This function produces files in the folder specified by \samp{out.folder}. Existing files are
#' never overwritten by \code{\link{hmmerScan}}, if you want to re-compute something, delete the
#' corresponding result files first.
#' 
#' @references Eddy, S.R. (2008). A Probabilistic Model of Local Sequence Alignment That Simplifies
#' Statistical Significance Estimation. PLoS Computational Biology, 4(5).
#' 
#' @note The HMMER3 software must be installed on the system for this function to work, i.e. the command
#' \samp{system("hmmscan -h")} must be recognized as a valid command if you run it in the Console window.
#' 
#' @author Lars Snipen and Kristian Hovde Liland.
#' 
#' @seealso \code{\link{panPrep}}, \code{\link{readHmmer}}.
#' 
#' @examples
#' \dontrun{
#' # This example require the external software HMMER
#' # Using example files in this package
#' pf <- file.path(path.package("micropan"), "extdata", "xmpl_GID1.faa.xz")
#' dbf <- file.path(path.package("micropan"), "extdata",
#'                  str_c("microfam.hmm", c(".h3f.xz",".h3i.xz",".h3m.xz",".h3p.xz")))
#' 
#' # We need to uncompress them first...
#' prot.file <- tempfile(pattern = "GID1.faa", fileext=".xz")
#' ok <- file.copy(from = pf, to = prot.file)
#' prot.file <- xzuncompress(prot.file)
#' db.files <- str_c(tempfile(), c(".h3f.xz",".h3i.xz",".h3m.xz",".h3p.xz"))
#' ok <- file.copy(from = dbf, to = db.files)
#' db.files <- unlist(lapply(db.files, xzuncompress))
#' db.name <- str_remove(db.files[1], "\\.[a-z0-9]+$")
#' 
#' # Scanning the FASTA file against microfam.hmm...
#' hmmerScan(in.files = prot.file, dbase = db.name, out.folder = ".")
#'
#' # Reading results
#' hmm.file <- file.path(".", str_c("GID1_vs_", basename(db.name), ".txt"))
#' hmm.tbl <- readHmmer(hmm.file)
#' 
#' # ...and cleaning...
#' ok <- file.remove(prot.file)
#' ok <- file.remove(str_remove(db.files, ".xz"))
#' }
#' 
#' @importFrom stringr str_c str_extract
#' 
#' @export hmmerScan
#' 
hmmerScan <- function(in.files, dbase, out.folder, threads = 0, verbose = TRUE){
  if(length(dbase) > 1){
    stop("Argument dbase must be a single text")
  }
  if(available.external("hmmer")){
    log.fil <- file.path(out.folder, "log.txt")
    basic <- paste("hmmscan -o", log.fil,"--cut_ga --noali --cpu", threads)
    rbase <- str_c("_vs_", basename(dbase), ".txt")
    gids <- str_extract(in.files, "GID[0-9]+")
    in.files <- normalizePath(in.files)
    for(i in 1:length(in.files)){
      rname <- str_c(gids[i], rbase)
      res.files <- list.files(out.folder)
      if(!(rname %in% res.files)){
        if(verbose) cat("hmmerScan: Scanning file", i, "out of", length(in.files), "...\r")
        cmd <- paste(basic,
                     "--domtblout", file.path(out.folder, rname),
                     dbase,
                     in.files[i])
        system(cmd)
        ok <- file.remove(log.fil)
      }
    }
  }
}



#' @name readHmmer
#' @title Reading results from a HMMER3 scan
#' 
#' @description Reading a text file produced by \code{\link{hmmerScan}}.
#' 
#' @param hmmer.file The name of a \code{\link{hmmerScan}} result file.
#' @param e.value Numeric threshold, hits with E-value above this are ignored (default is 1.0).
#' @param use.acc Logical indicating if accession numbers should be used to identify the hits.
#' 
#' @details The function reads a text file produced by \code{\link{hmmerScan}}. By specifying a smaller
#' \samp{e.value} you filter out poorer hits, and fewer results are returned. The option \samp{use.acc}
#' should be turned off (FALSE) if you scan against your own database where accession numbers are lacking.
#' 
#' @return The results are returned in a \samp{tibble} with columns \samp{Query}, \samp{Hit},
#' \samp{Evalue}, \samp{Score}, \samp{Start}, \samp{Stop} and \samp{Description}. \samp{Query} is the tag
#' identifying each query sequence. \samp{Hit} is the name or accession number for a pHMM in the database
#' describing patterns. The \samp{Evalue} is the \samp{ievalue} in the HMMER3 terminology. The \samp{Score}
#' is the HMMER3 score for the match between \samp{Query} and \samp{Hit}. The \samp{Start} and \samp{Stop}
#' are the positions within the \samp{Query} where the \samp{Hit} (pattern) starts and stops.
#' \samp{Description} is the description of the \samp{Hit}. There is one line for each hit. 
#' 
#' @author Lars Snipen and Kristian Hovde Liland.
#' 
#' @seealso \code{\link{hmmerScan}}, \code{\link{hmmerCleanOverlap}}, \code{\link{dClust}}.
#' 
#' @examples
#' # See the examples in the Help-files for dClust and hmmerScan.
#' 
#' @importFrom stringr str_detect str_split str_c str_replace_all
#' @importFrom tibble tibble
#' @importFrom dplyr %>% 
#' @importFrom rlang .data
#' 
#' @export readHmmer
#' 
readHmmer <- function(hmmer.file, e.value = 1, use.acc = TRUE){
  hmmer.file <- normalizePath(hmmer.file)
  lines <- readLines(hmmer.file)
  subset(lines, !str_detect(lines, "^\\#")) %>%
    str_replace_all("[ ]+", " ") %>% 
    str_split(pattern = " ") -> lst
  if(use.acc){
    hit <- sapply(lst, function(x){x[2]})
  } else {
    hit <- sapply(lst, function(x){x[1]})
  }
  tibble(Query  = sapply(lst, function(x){x[4]}),
         Hit    = hit,
         Evalue = as.numeric(sapply(lst, function(x){x[13]})),
         Score  = as.numeric(sapply(lst, function(x){x[14]})),
         Start  = as.numeric(sapply(lst, function(x){x[18]})),
         Stop   = as.numeric(sapply(lst, function(x){x[19]})),
         Description = sapply(lst, function(x){str_c(x[23:length(x)], collapse = " ")})) %>% 
    filter(.data$Evalue <= e.value) -> hmmer.tbl
  return(hmmer.tbl)
}