taxa2hmmer: Get more out of homologous sequence search with HMMER

Documented in read_hmmer_from_xml

#' Read the results of a search with HMMER from an XML file and, optionally, from a fullseq-fasta file and/or an alignment.
#'
#' @param xml_file_paths A a character vector the containing filepaths to XML file.
#' @param fullseq_fasta_paths If desired, a character vector containing the paths to the fullseq-fasta file. By default, `NULL`.
#' @param alignment_fasta_paths If desired, a character vector containing the paths to the fasta alignment file. By default, `NULL`.
#'
#' @return A nested DataFrame with columns `hits`, `stats`, `domains` and, if selected,
#'   `fullseq.fasta.`
#'
#' @section Stats data structure:
#' * `nhits`: 	The number of hits found above reporting thresholds.
#' * `Z`: 	The number of sequences or models in the target database.
#' * `domZ`: 	The number of hits in the target database.
#' * `nmodels` 	The number of models in this search.
#' * `nincluded` 	The number of sequences or models scoring above the significance threshold.
#' * `nreported` 	The number of sequences or models scoring above the reporting threshold
#'
#' @section Sequence data structure:
#' The hits array contains one or more sequences. Tthe redundant sequence information will also be included.
#' * `name`: 	Name of the target (sequence for phmmer/hmmsearch, HMM for hmmscan).
#' * `acc`: 	Accession of the target.
#' * `acc2`: 	Secondary accession of the target.
#' * `id`: 	Identifier of the target.
#' * `desc`: 	Description of the target.
#' * `score`: 	Bit score of the sequence (all domains, without correction)
#' * `pvalue`: 	P-value of the score.
#' * `evalue`: 	E-value of the score.
#' * `nregions`: 	Number of regions evaluated.
#' * `nenvelopes`: 	Number of envelopes handed over for domain definition, null2, alignment, and scoring.
#' * `ndom`: 	Total number of domains identified in this sequence.
#' * `nreported`: 	Number of domains satisfying reporting thresholding.
#' * `nincluded`: 	Number of domains satisfying inclusion thresholding.
#' * `taxid`: 	The NCBI taxonomy identifier of the target (if applicable).
#' * `species`: 	The species name of the target (if applicable).
#' * `kg`: 	The kingdom of life that the target belongs to - based on placing in the NCBI taxonomy tree (if applicable).
#'
#' @section Domain data structure:
#' The domain contains the details of the match, in particular the alignment between the query and the target.
#' * `ienv`: 	Envelope start position.
#' * `jenv`: 	Envelope end position.
#' * `iali`: 	Alignment start position.
#' * `jali`: 	Alignment end position.
#' * `bias`: 	null2 score contribution.
#' * `oasc`: 	Optimal alignment accuracy score.
#' * `bitscore`: 	Overall score in bits, null corrected, if this were the only domain in seq.
#' * `cevalue`: 	Conditional E-value based on the domain correction.
#' * `ievalue`: 	Independent E-value based on the domain correction.
#' * `is_reported`: 	1 if domain meets reporting thresholds.
#' * `is_included`: 	1 if domain meets inclusion thresholds.
#' * `alimodel`: 	Aligned query consensus sequence phmmer and hmmsearch, target hmm for hmmscan.
#' * `alimline`: 	Match line indicating identities, conservation +’s, gaps.
#' * `aliaseq`: 	Aligned target sequence for phmmer and hmmsearch, query for hmmscan.
#' * `alippline`: 	Posterior probability annotation.
#' * `alihmmname`: 	Name of HMM (query sequence for phmmer, alignment for hmmsearch and target hmm for hmmscan).
#' * `alihmmacc`: 	Accession of HMM.
#' * `alihmmdesc`: 	Description of HMM.
#' * `alihmmfrom`: 	Start position on HMM.
#' * `alihmmto`: 	End position on HMM.
#' * `aliM`: 	Length of model.
#' * `alisqname`: 	Name of target sequence (phmmer, hmmscan) or query sequence(hmmscan).
#' * `alisqacc`: 	Accession of sequence.
#' * `alisqdesc`: 	Description of sequence.
#' * `alisqfrom`: 	Start position on sequence.
#' * `alisqto`: 	End position on sequence.
#' * `aliL`: 	Length of sequence.
#'
#' @export
#'
read_hmmer_from_xml <- function(xml_file_paths,
                                fullseq_fasta_paths = NULL,
                                alignment_fasta_paths = NULL){
  # Check
  if (!requireNamespace("XML", quietly = TRUE)) {
    stop(
      "Package \"XML\" must be installed to use this function.",
      call. = FALSE
    )
  }
  if(!all(file.exists(xml_file_paths)))
    stop("`xml_file_paths` must be a character vector of paths to XML files.")

  if (!is.null(fullseq_fasta_paths)) {
    if (!requireNamespace("Biostrings", quietly = TRUE)) {
      stop(
        "Package \"Biostrings\" must be installed to use this function if fullseqfasta.",
        call. = FALSE
      )
    }
    if (!all(file.exists(fullseq_fasta_paths))) {
      stop("`fullseq_fasta_paths` must be a character vector of paths to fullseq-fasta files.")
    }
    if (length(fullseq_fasta_paths) != length(xml_file_paths)) {
      stop("`fullseq_fasta_paths` and `xml_file_paths` must have the same length.")
    }
  }
  if (!is.null(alignment_fasta_paths)) {
    if (!requireNamespace("Biostrings", quietly = TRUE)) {
      stop(
        "Package \"Biostrings\" must be installed to use this function if alignment_fasta_paths.",
        call. = FALSE
      )
    }
    if (!all(file.exists(alignment_fasta_paths))) {
      stop("`alignment_fasta_paths` must be a character vector of paths to alignment fasta files.")
    }
    if (length(alignment_fasta_paths) != length(xml_file_paths)) {
      stop("`alignment_fasta_paths` and `xml_file_paths` must have the same length.")
    }
  }


  df <- purrr::map_dfr(xml_file_paths,
    ~{
      xml <- XML::xmlParse(.x)
      tibble::tibble(
        "stats" = list(parse_hash_xml(xml, "///stats")),
        "hits" = list(parse_hash_xml(xml, "///hits")),
        "domains" = list(parse_hash_xml(xml, "///domains"))
      )
    })

  if (!is.null(fullseq_fasta_paths)) {
    fastas <- purrr::map(fullseq_fasta_paths,
      ~{Biostrings::readAAStringSet(.x)})
    df <- df %>% dplyr::mutate(
      "fullseqfasta" = fastas)
  }
  if (!is.null(alignment_fasta_paths)) {
    fastas <- purrr::map(alignment_fasta_paths,
                         ~{Biostrings::readAAMultipleAlignment(.x)})
    df <- df %>% dplyr::mutate(
      "alignment" = fastas)
  }
  class(df) <- c("HMMER_data_tbl", class(df))
  return(df)
}