#' Read the results of a search with HMMER from an XML file and, optionally, from a fullseq-fasta file and/or an alignment.
#'
#' @param xml_file_paths A a character vector the containing filepaths to XML file.
#' @param fullseq_fasta_paths If desired, a character vector containing the paths to the fullseq-fasta file. By default, `NULL`.
#' @param alignment_fasta_paths If desired, a character vector containing the paths to the fasta alignment file. By default, `NULL`.
#'
#' @return A nested DataFrame with columns `hits`, `stats`, `domains` and, if selected,
#' `fullseq.fasta.`
#'
#' @section Stats data structure:
#' * `nhits`: The number of hits found above reporting thresholds.
#' * `Z`: The number of sequences or models in the target database.
#' * `domZ`: The number of hits in the target database.
#' * `nmodels` The number of models in this search.
#' * `nincluded` The number of sequences or models scoring above the significance threshold.
#' * `nreported` The number of sequences or models scoring above the reporting threshold
#'
#' @section Sequence data structure:
#' The hits array contains one or more sequences. Tthe redundant sequence information will also be included.
#' * `name`: Name of the target (sequence for phmmer/hmmsearch, HMM for hmmscan).
#' * `acc`: Accession of the target.
#' * `acc2`: Secondary accession of the target.
#' * `id`: Identifier of the target.
#' * `desc`: Description of the target.
#' * `score`: Bit score of the sequence (all domains, without correction)
#' * `pvalue`: P-value of the score.
#' * `evalue`: E-value of the score.
#' * `nregions`: Number of regions evaluated.
#' * `nenvelopes`: Number of envelopes handed over for domain definition, null2, alignment, and scoring.
#' * `ndom`: Total number of domains identified in this sequence.
#' * `nreported`: Number of domains satisfying reporting thresholding.
#' * `nincluded`: Number of domains satisfying inclusion thresholding.
#' * `taxid`: The NCBI taxonomy identifier of the target (if applicable).
#' * `species`: The species name of the target (if applicable).
#' * `kg`: The kingdom of life that the target belongs to - based on placing in the NCBI taxonomy tree (if applicable).
#'
#' @section Domain data structure:
#' The domain contains the details of the match, in particular the alignment between the query and the target.
#' * `ienv`: Envelope start position.
#' * `jenv`: Envelope end position.
#' * `iali`: Alignment start position.
#' * `jali`: Alignment end position.
#' * `bias`: null2 score contribution.
#' * `oasc`: Optimal alignment accuracy score.
#' * `bitscore`: Overall score in bits, null corrected, if this were the only domain in seq.
#' * `cevalue`: Conditional E-value based on the domain correction.
#' * `ievalue`: Independent E-value based on the domain correction.
#' * `is_reported`: 1 if domain meets reporting thresholds.
#' * `is_included`: 1 if domain meets inclusion thresholds.
#' * `alimodel`: Aligned query consensus sequence phmmer and hmmsearch, target hmm for hmmscan.
#' * `alimline`: Match line indicating identities, conservation +’s, gaps.
#' * `aliaseq`: Aligned target sequence for phmmer and hmmsearch, query for hmmscan.
#' * `alippline`: Posterior probability annotation.
#' * `alihmmname`: Name of HMM (query sequence for phmmer, alignment for hmmsearch and target hmm for hmmscan).
#' * `alihmmacc`: Accession of HMM.
#' * `alihmmdesc`: Description of HMM.
#' * `alihmmfrom`: Start position on HMM.
#' * `alihmmto`: End position on HMM.
#' * `aliM`: Length of model.
#' * `alisqname`: Name of target sequence (phmmer, hmmscan) or query sequence(hmmscan).
#' * `alisqacc`: Accession of sequence.
#' * `alisqdesc`: Description of sequence.
#' * `alisqfrom`: Start position on sequence.
#' * `alisqto`: End position on sequence.
#' * `aliL`: Length of sequence.
#'
#' @export
#'
read_hmmer_from_xml <- function(xml_file_paths,
fullseq_fasta_paths = NULL,
alignment_fasta_paths = NULL){
# Check
if (!requireNamespace("XML", quietly = TRUE)) {
stop(
"Package \"XML\" must be installed to use this function.",
call. = FALSE
)
}
if(!all(file.exists(xml_file_paths)))
stop("`xml_file_paths` must be a character vector of paths to XML files.")
if (!is.null(fullseq_fasta_paths)) {
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop(
"Package \"Biostrings\" must be installed to use this function if fullseqfasta.",
call. = FALSE
)
}
if (!all(file.exists(fullseq_fasta_paths))) {
stop("`fullseq_fasta_paths` must be a character vector of paths to fullseq-fasta files.")
}
if (length(fullseq_fasta_paths) != length(xml_file_paths)) {
stop("`fullseq_fasta_paths` and `xml_file_paths` must have the same length.")
}
}
if (!is.null(alignment_fasta_paths)) {
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop(
"Package \"Biostrings\" must be installed to use this function if alignment_fasta_paths.",
call. = FALSE
)
}
if (!all(file.exists(alignment_fasta_paths))) {
stop("`alignment_fasta_paths` must be a character vector of paths to alignment fasta files.")
}
if (length(alignment_fasta_paths) != length(xml_file_paths)) {
stop("`alignment_fasta_paths` and `xml_file_paths` must have the same length.")
}
}
df <- purrr::map_dfr(xml_file_paths,
~{
xml <- XML::xmlParse(.x)
tibble::tibble(
"stats" = list(parse_hash_xml(xml, "///stats")),
"hits" = list(parse_hash_xml(xml, "///hits")),
"domains" = list(parse_hash_xml(xml, "///domains"))
)
})
if (!is.null(fullseq_fasta_paths)) {
fastas <- purrr::map(fullseq_fasta_paths,
~{Biostrings::readAAStringSet(.x)})
df <- df %>% dplyr::mutate(
"fullseqfasta" = fastas)
}
if (!is.null(alignment_fasta_paths)) {
fastas <- purrr::map(alignment_fasta_paths,
~{Biostrings::readAAMultipleAlignment(.x)})
df <- df %>% dplyr::mutate(
"alignment" = fastas)
}
class(df) <- c("HMMER_data_tbl", class(df))
return(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.