R/hmmer.R

"hmmer" <- function(seq, type='phmmer', db=NULL, verbose=TRUE, timeout=90) {
  cl <- match.call()
    
  oopsa <- requireNamespace("XML", quietly = TRUE)
  oopsb <- requireNamespace("RCurl", quietly = TRUE)
  if(!all(c(oopsa, oopsb)))
     stop("Please install the XML and RCurl package from CRAN")
 
  seqToStr <- function(seq) {
    if(inherits(seq, "fasta"))
      seq <- seq$ali
    if(is.matrix(seq)) {
      if(nrow(seq)>1)
        warning(paste("Alignment with multiple sequences detected. Using only the first sequence"))
      seq <- as.vector(seq[1,!is.gap(seq[1,])])
    }
    else
      seq <- as.vector(seq[!is.gap(seq)])
    return(paste(seq, collapse=""))
  }
  
  alnToStr <- function(seq) {
    if(!inherits(seq, "fasta"))
      stop("seq must be of type 'fasta'")
    tmpfile <- tempfile()
    write.fasta(seq, file=tmpfile)
    rawlines <- paste(readLines(tmpfile), collapse="\n")
    unlink(tmpfile)
    return(rawlines)
  }
  
  types.allowed <- c("phmmer", "hmmscan", "hmmsearch", "jackhmmer")
  if(! type%in%types.allowed )
    stop(paste("Input type should be either of:", paste(types.allowed, collapse=", ")))

  ## PHMMER (protein sequence vs protein sequence database)
  ## seq is a sequence
  if(type=="phmmer") {
    seq <- seqToStr(seq)
    if(is.null(db))
      db="pdb"
    db.allowed <- c("env_nr", "nr", "refseq", "pdb", "rp15", "rp35", "rp55",
                    "rp75", "swissprot", "unimes", "uniprotkb",
                    "uniprotrefprot", "pfamseq")
    db <- tolower(db)
    if(!db%in%db.allowed)
      stop(paste("db must be either:", paste(db.allowed, collapse=", ")))
    
    seqdb <- db
    hmmdb <- NULL
    iter <- NULL
    rcurl <- TRUE
  }

  ## HMMSCAN (protein sequence vs profile-HMM database)
  ## seq is a sequence
  if(type=="hmmscan") {
    seq <- seqToStr(seq)
    if(is.null(db))
      db="pfam"
    db.allowed <- tolower(c("pfam", "gene3d", "superfamily", "tigrfam"))
    db <- tolower(db)
    if(!db%in%db.allowed)
      stop(paste("db must be either:", paste(db.allowed, collapse=", ")))
    
    seqdb <- NULL
    hmmdb <- db
    iter <- NULL
    rcurl <- TRUE
  }

  ## HMMSEARCH (protein alignment/profile-HMM vs protein sequence database)
  ## seq is an alignment
  if(type=="hmmsearch") {
    if(!inherits(seq, "fasta"))
      stop("please provide 'seq' as a 'fasta' object")
    
    ##alnfile <- tempfile()
    ##seq <- write.fasta(seq, file=alnfile)
    seq <- alnToStr(seq)

    if(is.null(db))
      db="pdb"
    db.allowed <- tolower(c("pdb", "swissprot"))
    db <- tolower(db)
    if(!db%in%db.allowed)
      stop(paste("db must be either:", paste(db.allowed, collapse=", ")))
    
    seqdb <- db
    hmmdb <- NULL
    iter <- NULL
    rcurl <- TRUE
  }

  ## JACKHMMER (iterative search vs protein sequence database)
  ## seq can be sequence, HMM, or multiple sequence alignment
  ## HMM not implemented here yet
  if(type=="jackhmmer") {
    if(!inherits(seq, "fasta"))
      stop("please provide 'seq' as a 'fasta' object")
    
    ##alnfile <- tempfile()
    ##seq <- write.fasta(seq, file=alnfile)
    seq <- alnToStr(seq)
    
    if(is.null(db))
      db="pdb"
    db.allowed <- tolower(c("pdb", "swissprot"))
    db.allowed <- c("env_nr", "nr", "refseq", "pdb", "rp15", "rp35", "rp55",
                    "rp75", "swissprot", "unimes", "uniprotkb",
                    "uniprotrefprot", "pfamseq")
    db <- tolower(db)
    if(!db%in%db.allowed)
      stop(paste("db must be either:", paste(db.allowed, collapse=", ")))

    seqdb <- db
    hmmdb <- NULL
    iter <- NULL
    rcurl <- TRUE
  }
  
  
  ## Make the request to the HMMER website
  ##url <- paste('http://hmmer.janelia.org/search/', type, sep="")
  url <- paste("https://www.ebi.ac.uk/Tools/hmmer/search/", type, sep="")
  curl.opts <- list(httpheader = "Expect:",
                    httpheader = "Accept:text/xml",
                    verbose = verbose,
                    followlocation = TRUE
                    )
    
  hmm <- RCurl::postForm(url, hmmdb=hmmdb, seqdb=seqdb, seq=seq, 
                  style = "POST",
                  .opts = curl.opts,
                  .contentEncodeFun=RCurl::curlPercentEncode, .checkParams=TRUE )

  ## check results from the server
  if(!grepl("results", hmm)) {
      if(verbose) {
          message("Content from HMMER server:")
          message("  ", hmm)
      }
      
      stop("Request to HMMER server failed")
  }
  
  add.pdbs <- function(x, ...) {
    hit <- XML::xpathSApply(x, '@*')
    pdbs <- unique(XML::xpathSApply(x, 'pdbs', XML::xmlToList))
    new <- as.matrix(hit, ncol=1)
    
    if(length(pdbs) > 1) {
      for(i in 1:length(pdbs)) {
        hit["acc"]=pdbs[i]
        new=cbind(new, hit)
      }
      colnames(new)=NULL
    }
    return(new)
  }

  ##fetch.pdbs <- function(x) {
  ##  unique(XML::xpathSApply(x, 'pdbs', XML::xmlToList))
  ##}

  ## workaround for 'act_site' tags that can not be parsed
  ## XML Parsing Error: not well-formed, e.g. for 1h1h_A
  ## type = hmmscan and pdb = pfam
  if(grepl("act_site", hmm)) {
      lines <- unlist(strsplit(hmm, "\n"))
      actsite.inds <- grep("act_site", lines)
      actlen <- length(actsite.inds)
      if(actlen>2) {
        if(actlen%%2 != 0) {
          stop("Bad XML format")
        }
        rm.inds <- NULL
        for(i in seq(1, actlen, 2)) {
          rm.inds <- c(rm.inds, seq(actsite.inds[i], actsite.inds[i+1]))
        }
        hmm <- paste(lines[-rm.inds], collapse="\n")
      }
      else {
         hmm <- paste(lines[-seq(actsite.inds[1], actsite.inds[2])],
                      collapse="\n")
      }
  }

  xml <- XML::xmlParse(hmm)
  resurl <- XML::xpathSApply(xml, '//data', XML::xpathSApply, '@*')
  resurl <- paste0("http://www.ebi.ac.uk/Tools/hmmer/results/", resurl["uuid", 1])
  data <- XML::xpathSApply(xml, '///hits', XML::xpathSApply, '@*')

  pdb.ids <- NULL
  if(db=="pdb") {
    tmp <- XML::xpathSApply(xml, '///hits', add.pdbs)
    data <- as.data.frame(tmp, stringsAsFactors=FALSE)
    colnames(data) <- NULL
  }

  data <- as.data.frame(t(data), stringsAsFactors=FALSE)
  data <- data[!duplicated(data$acc), ]
  ##rownames(data) <- data[, "acc"]
  
  ## convert to numeric
  fieldsToNumeric <- c("evalue", "pvalue", "score", "archScore", "ndom", "nincluded",
                       "niseqs", "nregions", "nreported", "bias", "dcl", "hindex")
  inds <- which(names(data) %in% fieldsToNumeric)
  
  for(i in 1:length(inds)) {
    tryCatch({
      data[[inds[i]]] = as.numeric(data[[inds[i]]])
    },
    warning = function(w) {
      #print(w)
      return(data[[inds[i]]])
    },
    error = function(e) {
      #print(e)
      return(data[[inds[i]]])
    }
    )
  }

  data$pdb.id <- data$acc
  data$bitscore <- data$score
  data$mlog.evalue <- -log(data$evalue)
  data$mlog.evalue[is.infinite(data$mlog.evalue)] <- -log(.Machine$double.xmin)

  out <- list(hit.tbl = data,
              url = resurl)
  
  ##class(data) <- c("hmmer", type, "data.frame")
  class(out) <- c("hmmer", type)
  return(out)
}

Try the bio3d package in your browser

Any scripts or data that you put into this service are public.

bio3d documentation built on Oct. 27, 2022, 1:06 a.m.