taxa2hmmer: Get more out of homologous sequence search with HMMER

Documented in search_phmmer

#' Perform a phmmer search of a protein sequence against a protein sequence database.
#'
#' @param seqs A vector of characters containing the sequences of the query.
#' @param dbs A character vector containing the target databases. Frequently
#'  used databases are `swissprot`, `uniprotrefprot`, `uniprotkb`, `ensembl`,
#'  `pdb` and `alphafold`, but a complete and updated list is available at
#'   \url{https://www.ebi.ac.uk/Tools/hmmer/}.
#' @param fullseqfasta  A logical, if TRUE full sequence fasta will be downloaded
#'  as a AAStringSet. Package `Biostrings` must be installed in that case.
#' @param verbose A logical, if TRUE details of the download process is printed.
#' @param timeout An integer specifying the number of seconds to wait for the
#'  reply before a time out occurs.
#' @param alignment A logical, if TRUE sequence alignment will be downloaded
#'  as a `AAMultipleAlignment`. Package `Biostrings` must be installed in that case.
#'
#' @return A nested DataFrame with columns `seqs`, `dbs`, `url`
#'  (HMMER temporary url), `hits`, `stats`, `domains` and, if selected,
#'   `fullseq.fasta.`
#'
#' @section Stats data structure:
#' * `nhits`: 	The number of hits found above reporting thresholds.
#' * `Z`: 	The number of sequences or models in the target database.
#' * `domZ`: 	The number of hits in the target database.
#' * `nmodels` 	The number of models in this search.
#' * `nincluded` 	The number of sequences or models scoring above the significance threshold.
#' * `nreported` 	The number of sequences or models scoring above the reporting threshold
#'
#' @section Sequence data structure:
#' The hits array contains one or more sequences. Tthe redundant sequence information will also be included.
#' * `name`: 	Name of the target (sequence for phmmer/hmmsearch, HMM for hmmscan).
#' * `acc`: 	Accession of the target.
#' * `acc2`: 	Secondary accession of the target.
#' * `id`: 	Identifier of the target.
#' * `desc`: 	Description of the target.
#' * `score`: 	Bit score of the sequence (all domains, without correction)
#' * `pvalue`: 	P-value of the score.
#' * `evalue`: 	E-value of the score.
#' * `nregions`: 	Number of regions evaluated.
#' * `nenvelopes`: 	Number of envelopes handed over for domain definition, null2, alignment, and scoring.
#' * `ndom`: 	Total number of domains identified in this sequence.
#' * `nreported`: 	Number of domains satisfying reporting thresholding.
#' * `nincluded`: 	Number of domains satisfying inclusion thresholding.
#' * `taxid`: 	The NCBI taxonomy identifier of the target (if applicable).
#' * `species`: 	The species name of the target (if applicable).
#' * `kg`: 	The kingdom of life that the target belongs to - based on placing in the NCBI taxonomy tree (if applicable).
#'
#' @section Domain data structure:
#' The domain contains the details of the match, in particular the alignment between the query and the target.
#' * `ienv`: 	Envelope start position.
#' * `jenv`: 	Envelope end position.
#' * `iali`: 	Alignment start position.
#' * `jali`: 	Alignment end position.
#' * `bias`: 	null2 score contribution.
#' * `oasc`: 	Optimal alignment accuracy score.
#' * `bitscore`: 	Overall score in bits, null corrected, if this were the only domain in seq.
#' * `cevalue`: 	Conditional E-value based on the domain correction.
#' * `ievalue`: 	Independent E-value based on the domain correction.
#' * `is_reported`: 	1 if domain meets reporting thresholds.
#' * `is_included`: 	1 if domain meets inclusion thresholds.
#' * `alimodel`: 	Aligned query consensus sequence phmmer and hmmsearch, target hmm for hmmscan.
#' * `alimline`: 	Match line indicating identities, conservation +’s, gaps.
#' * `aliaseq`: 	Aligned target sequence for phmmer and hmmsearch, query for hmmscan.
#' * `alippline`: 	Posterior probability annotation.
#' * `alihmmname`: 	Name of HMM (query sequence for phmmer, alignment for hmmsearch and target hmm for hmmscan).
#' * `alihmmacc`: 	Accession of HMM.
#' * `alihmmdesc`: 	Description of HMM.
#' * `alihmmfrom`: 	Start position on HMM.
#' * `alihmmto`: 	End position on HMM.
#' * `aliM`: 	Length of model.
#' * `alisqname`: 	Name of target sequence (phmmer, hmmscan) or query sequence(hmmscan).
#' * `alisqacc`: 	Accession of sequence.
#' * `alisqdesc`: 	Description of sequence.
#' * `alisqfrom`: 	Start position on sequence.
#' * `alisqto`: 	End position on sequence.
#' * `aliL`: 	Length of sequence.
#'
#' @export
#'
search_phmmer <- function(seqs, dbs="swissprot",
                          fullseqfasta = TRUE,verbose=TRUE, timeout=90,
                          alignment = FALSE){

  # Check
  if (!requireNamespace("RCurl", quietly = TRUE)) {
    stop(
      "Package \"RCurl\" must be installed to use this function.",
      call. = FALSE
    )
  }
  if ((fullseqfasta) && !requireNamespace("Biostrings", quietly = TRUE)) {
    stop(
      "Package \"Biostrings\" must be installed to use this function if `fullseqfasta = TRUE`.",
      call. = FALSE
    )
  }
  if (!requireNamespace("XML", quietly = TRUE)) {
    stop(
      "Package \"XML\" must be installed to use this function.",
      call. = FALSE
    )
  }
  if(!is_protein_seq(seqs))
    stop(paste0("`seq` must be a protein sequence.\n",
                "Characters not belonging to the AA standar ",
                "alphabet were found."))
  seqs <- as.character(seqs)
  url <- "https://www.ebi.ac.uk/Tools/hmmer/search/phmmer"
  curl.opts <- list(httpheader = "Expect:",
                    httpheader = "Accept:text/xml",
                    verbose = verbose,
                    followlocation = TRUE
  )

  #all combinations of inputs
  grid <- tidyr::expand_grid(seqs, dbs)
  empty_list <- list("url" = NA, "stats" = NA, "hits" = NA, "domains" = NA)
  if (fullseqfasta) {
    empty_list <- append(empty_list, list("fullseqfasta" = NA))
  }
  data <- request_hmmer_using_grid_seqs(grid$seqs, grid$dbs,
                                        url, empty_list, curl.opts,
                                        verbose, fullseqfasta, "seqdb", alignment)
  data <- data %>%
    purrr::transpose()
  df <- grid %>%
    dplyr::mutate(
      "url" = data$url,
      "stats" = data$stats,
      "hits" = data$hits,
      "domains" = data$domains)

  if (fullseqfasta) {
    df <- df %>% dplyr::mutate(
      "fullseqfasta" = data$fullseqfasta
    )
  }
  if (alignment) {
    df <- df %>% dplyr::mutate(
      "alignment" = data$alignment
    )
  }
  class(df) <- c("HMMER_data_tbl", class(df))
  return(df)
}