R/utils_hmmer.R

Defines functions request_hmmer_using_grid_alns request_hmmer_using_grid_seqs add.pdbs workaround_for_act_site parse_hits_xml_if_pdb parse_uuid_xml df_format_hmmer parse_hash_xml is_protein_seq

is_protein_seq <- function(x) {
  AA_STANDARD <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
                   "L", "K","M", "F", "P", "S", "T", "W", "Y", "V")
  x.vc <- x %>%
    as.character() %>%
    strsplit(split = "") %>%
    unlist()
  all(x.vc %in% AA_STANDARD)
}
parse_hash_xml <- function(xml, hash){
  df <- xml %>%
    XML::xpathSApply(hash, XML::xpathSApply, '@*') %>%
    t() %>%
    as.data.frame()%>%
#    tibble::as_tibble(.name_repair = "unique") %>%
    dplyr::distinct()
  df_format_hmmer(df)

}

df_format_hmmer <- function(df){
numeric_cols <- c("Z", "Z_setby", "domZ", "domZ_setby",  "elapsed" ,
          "n_past_bias","n_past_fwd",  "n_past_msv",  "n_past_vit",
          "nhits" ,"nincluded", "nmodels", "nreported", "nseqs",
          "page", "sys", "total", "unpacked", "user",
          "bias", "evalue", "flags", "hindex", "ndom", "nincluded",
          "nregions","nreported", "pvalue", "score","aliId", "aliIdCount",
          "aliL","aliM", "aliN", "aliSim", "aliSimCount", "alihindex",
          "alihmmfrom","alihmmto","alisqfrom","alisqto" , "bias",
          "bitscore", "cevalue", "iali", "ienv", "ievalue","jali", "jenv",
          "oasc", "outcompeted", "significant","uniq")
cols <- intersect(numeric_cols,colnames(df))
df %>%
  dplyr::mutate(dplyr::across(cols, as.numeric))
}

parse_uuid_xml <- function(xml){
  xml %>%
    XML::xpathSApply('//data', XML::xpathSApply, '@*') %>%
    magrittr::extract("uuid", 1)
}

parse_hits_xml_if_pdb <- function(xml){
  hits <- xml %>%
    XML::xpathSApply('///hits', add.pdbs) %>%
    as.data.frame(stringsAsFactors=FALSE) %>%
    magrittr::set_colnames(NULL) %>%
    t() %>%
    as.data.frame()%>%
#    tibble::as_tibble(.name_repair = "unique") %>%
    dplyr::distinct()
}

workaround_for_act_site <- function(hmm){
  ## workaround for 'act_site' tags that can not be parsed
  ## XML Parsing Error: not well-formed, e.g. for 1h1h_A
  ## type = hmmscan and pdb = pfam
  ## https://github.com/cran/bio3d/blob/master/R/hmmer.R
  lines <- unlist(strsplit(hmm, "\n"))
  actsite.inds <- grep("act_site", lines)
  actlen <- length(actsite.inds)
  if(actlen>2) {
    if(actlen%%2 != 0) {
      stop("Bad XML format")
    }
    rm.inds <- NULL
    for(i in seq(1, actlen, 2)) {
      rm.inds <- c(rm.inds, seq(actsite.inds[i], actsite.inds[i+1]))
    }
    hmm <- paste(lines[-rm.inds], collapse="\n")
  }
  else {
    hmm <- paste(lines[-seq(actsite.inds[1], actsite.inds[2])],
                 collapse="\n")
  }
  return(hmm)
}

## Add pdbs to hits if db = pdb
## https://github.com/cran/bio3d/blob/master/R/hmmer.R
add.pdbs <- function(x, ...) {
  hit <- XML::xpathSApply(x, '@*')
  pdbs <- unique(XML::xpathSApply(x, 'pdbs', XML::xmlToList))
  new <- as.matrix(hit, ncol=1)

  if(length(pdbs) > 1) {
    for(i in 1:length(pdbs)) {
      hit["acc"]=pdbs[i]
      new=cbind(new, hit)
    }
    colnames(new)=NULL
  }
  return(new)
}

request_hmmer_using_grid_seqs <- function(x, y, url,
  otherwise_list, curl.opts, verbose, fullseqfasta,
  seqdb_or_hmmdb, alignment = FALSE){
  purrr::map2(x, y,
  purrr::possibly(quiet = FALSE, #show errors
      otherwise = otherwise_list,
      ~ {
      # Request HMMER
      seq <- as.character(.x)
      db <- tolower(.y)
      seqdb = NULL
      hmmdb = NULL
      if (seqdb_or_hmmdb == "seqdb") {
        seqdb <- db
      }
      if (seqdb_or_hmmdb == "hmmdb") {
        hmmdb <- db
      }
      hmm <- RCurl::postForm(url, seqdb=seqdb, hmmdb = hmmdb, seq=seq,
                       style = "POST", .opts = curl.opts,
                       .contentEncodeFun=RCurl::curlPercentEncode,
                         .checkParams=TRUE )

      ## check results from the server
      if(!grepl("results", hmm))
        stop("Request to HMMER server failed")
      if(grepl("act_site", hmm))
      hmm <- workaround_for_act_site(hmm)
      if(verbose)
      message(paste0("Content from HMMER server:\n",
                   hmm))
      ## parse xml
      xml <- XML::xmlParse(hmm)
      ## Web url
      uuid <- parse_uuid_xml(xml)
      result.url <- paste0("http://www.ebi.ac.uk/Tools/hmmer/results/", uuid)

      ## Obtain data
      stats <- xml %>%
        parse_hash_xml("///stats")
      ## Hits
      if( db == "pdb"){
        hits <- parse_hits_xml_if_pdb(xml)
      } else{
      hits <- xml %>%
        parse_hash_xml("///hits")
      }
      ## Domains
      domains <- xml %>%
        parse_hash_xml("///domains")
      ## Create list
      rowlist <- list("url" = result.url, "stats" = stats,
                  "hits" = hits, "domains" = domains)
      ## Fullfasta
      if (fullseqfasta) {
      fasta.url <- paste0("https://www.ebi.ac.uk/Tools/hmmer/download/",
                        uuid, "/score?format=fullfasta")
      rowlist <- append(rowlist,
                      list("fullseqfasta" = fasta.url %>%
                             Biostrings::readAAStringSet()))
      }
      if (alignment) {
        alignment.url <- paste0("https://www.ebi.ac.uk/Tools/hmmer/download/",
                            uuid, "/score?format=afa")
        rowlist <- append(rowlist,
                          list("alignment" = alignment.url %>%
                                 Biostrings::readAAMultipleAlignment()))
      }

      return(rowlist)
      }))
}
request_hmmer_using_grid_alns <- function(x, y, url,
                                          otherwise_list,
                                          curl.opts, verbose,
                                          fullseqfasta, alignment){
  purrr::map2(x, y,
  purrr::possibly(quiet = FALSE, #show errors
    otherwise = otherwise_list,
    ~{
      # Request HMMER
      aln <- .x
      db <- tolower(.y)
      hmm <- RCurl::postForm(url, seqdb=db, seq=aln,
                             style = "POST", .opts = curl.opts,
                             .contentEncodeFun=RCurl::curlPercentEncode,
                             .checkParams=TRUE )

      ## check results from the server
      if(!grepl("results", hmm))
        stop("Request to HMMER server failed")
      if(grepl("act_site", hmm))
        hmm <- workaround_for_act_site(hmm)
      if(verbose)
        message(paste0("Content from HMMER server:\n",
                       hmm))
      ## parse xml
      xml <- XML::xmlParse(hmm)
      ## Web url
      uuid <- parse_uuid_xml(xml)
      result.url <- paste0("http://www.ebi.ac.uk/Tools/hmmer/results/", uuid)

      ## Obtain data
      stats <-  xml %>%
        parse_hash_xml("///stats")
      ## Hits
      if( db == "pdb"){
        hits <- parse_hits_xml_if_pdb(xml)
      } else{
        hits <- hits <- xml %>%
          parse_hash_xml("///hits")
      }
      ## Domains
      domains <- xml %>%
        parse_hash_xml("///domains")

      ## Create list
      rowlist <- list("url" = result.url, "stats" = stats,
                      "hits" = hits, "domains" = domains)

      ## Fullfasta
      if (fullseqfasta) {
        fasta.url <- paste0("https://www.ebi.ac.uk/Tools/hmmer/download/",
                            uuid, "/score?format=fullfasta")
        rowlist <- append(rowlist,
                          list("fullseqfasta" = fasta.url %>%
                                 Biostrings::readAAStringSet()))
      }
      if (alignment) {
        alignment.url <- paste0("https://www.ebi.ac.uk/Tools/hmmer/download/",
                                uuid, "/score?format=afa")
        rowlist <- append(rowlist,
                          list("alignment" = alignment.url %>%
                                 Biostrings::readAAMultipleAlignment()))
      }
      return(rowlist)
    })
  )
}
currocam/taxa2hmmer documentation built on April 10, 2022, 11:02 a.m.