taxa2hmmer: Get more out of homologous sequence search with HMMER

Documented in extract_from_HMMER_data_tbl

#' A function that extracts information from `HMMER_data_tbl`, allows to perform some common tasks and returns a Tidy DataFrame.
#'
#' @param HMMER_data_tbl A DataFrame of subclass `HMMER_data_tbl`, obtained through the `search` family of functions or `read_hmmer_from_xml`.
#' @param id_column A character vector with as many elements as rows in HMMER_data_tbl and that will be used to create the `id` column.
#' @param remove_redundancy A Boolean, if TRUE then entries corresponding to 100% redundant sequences will be removed in sequence.
#' @param annotate_taxonomic A character vector of one, if "local" is selected, an annotation will be made using a local database, while if "remote" is selected, it will be made using remote resources (slower).
#' @param evalue A numeric vector of one, by default NULL. If a number is given, hits will be filtered according to that evalue.
#' @param calculate_physicochemical_properties A Boolean, if TRUE a series of numerical descriptors will be calculated for each sequence.
#'
#' @return A DataFrame of subclass `HMMER_data_tbl_tidy`. It has as many rows as identified domains.
#' @section Sequence data structure:
#' Variables beginning with "hits." have been extracted from the hash
#'    corresponding to the sequences. Remember that there are as many rows as
#'    there are domains identified, not per sequence. Therefore, these
#'    variables will be duplicated, for example, for a sequence in which 2
#'    domains are identified.
#'
#' * `hits.name`: 	Name of the target (sequence for phmmer/hmmsearch, HMM for hmmscan).
#' * `hits.acc`: 	Accession of the target.
#' * `hits.acc2`: 	Secondary accession of the target.
#' * `hits.id`: 	Identifier of the target.
#' * `hits.desc`: 	Description of the target.
#' * `hits.score`: 	Bit score of the sequence (all domains, without correction)
#' * `hits.pvalue`: 	P-value of the score.
#' * `hits.evalue`: 	E-value of the score.
#' * `hits.nregions`: 	Number of regions evaluated.
#' * `hits.nenvelopes`: 	Number of envelopes handed over for domain definition, null2, alignment, and scoring.
#' * `hits.ndom`: 	Total number of domains identified in this sequence.
#' * `hits.nreported`: 	Number of domains satisfying reporting thresholding.
#' * `hits.nincluded`: 	Number of domains satisfying inclusion thresholding.
#' * `hits.taxid`: 	The NCBI taxonomy identifier of the target (if applicable).
#' * `hits.species`: 	The species name of the target (if applicable).
#' * `hits.kg`: 	The kingdom of life that the target belongs to - based on placing in the NCBI taxonomy tree (if applicable).
#'
#' @section Domain data structure:
#' Variables beginning with "domains." have been extracted from the hash domains.
#' * `domains.ienv`: 	Envelope start position.
#' * `domains.jenv`: 	Envelope end position.
#' * `domains.iali`: 	Alignment start position.
#' * `domains.jali`: 	Alignment end position.
#' * `domains.bias`: 	null2 score contribution.
#' * `domains.oasc`: 	Optimal alignment accuracy score.
#' * `domains.bitscore`: 	Overall score in bits, null corrected, if this were the only domain in seq.
#' * `domains.cevalue`: 	Conditional E-value based on the domain correction.
#' * `domains.ievalue`: 	Independent E-value based on the domain correction.
#' * `domains.is_reported`: 	1 if domain meets reporting thresholds.
#' * `domains.is_included`: 	1 if domain meets inclusion thresholds.
#' * `domains.alimodel`: 	Aligned query consensus sequence phmmer and hmmsearch, target hmm for hmmscan.
#' * `domains.alimline`: 	Match line indicating identities, conservation +’s, gaps.
#' * `domains.aliaseq`: 	Aligned target sequence for phmmer and hmmsearch, query for hmmscan.
#' * `domains.alippline`: 	Posterior probability annotation.
#' * `domains.alihmmname`: 	Name of HMM (query sequence for phmmer, alignment for hmmsearch and target hmm for hmmscan).
#' * `domains.alihmmacc`: 	Accession of HMM.
#' * `domains.alihmmdesc`: 	Description of HMM.
#' * `domains.alihmmfrom`: 	Start position on HMM.
#' * `domains.alihmmto`: 	End position on HMM.
#' * `domains.aliM`: 	Length of model.
#' * `domains.alisqname`: 	Name of target sequence (phmmer, hmmscan) or query sequence(hmmscan).
#' * `domains.alisqacc`: 	Accession of sequence.
#' * `domains.alisqdesc`: 	Description of sequence.
#' * `domains.alisqfrom`: 	Start position on sequence.
#' * `domains.alisqto`: 	End position on sequence.
#' * `domains.aliL`: 	Length of sequence.
#'
#' @section Taxonomic data structure:
#' If annotate_taxonomic = "local" or annotate_taxonomic = "remote" is selected,
#'  as many columns as available taxonomic categories will be included. All of them will have in common that they will start with "taxa".
#' @section Theoretical physicochemical properties data structure:
#' If calculate_physicochemical_properties = TRUE the following columns will be included.
#'  All of them will have in common that they will start with "properties".
#' * `properties.molecular.weight`: sum of the masses of each atom constituting a
#'    molecule (Da) using the same formulas and weights as ExPASy's.
#' *  `properties.charge`: net charge of a protein sequence based on the
#'    Henderson-Hasselbalch equation using Lehninger pKa scale.
#' *  `properties.pI`:  isoelectric point calculated as in EMBOSS PEPSTATS.
#' * `properties.mz`: mass over charge ratio (m/z) for peptides, as measured in mass spectrometry.
#' * `properties.aIndex`: aliphatic index of a protein. The aindex is defined as the
#'    relative volume occupied by aliphatic side chains (Alanine, Valine,
#'    Isoleucine, and Leucine). It may be regarded as a positive factor
#'    for the increase of thermostability of globular proteins.
#' * `properties.boman`: potential protein interaction index . The index is equal to the
#'  sum of the solubility values for all residues in a sequence, it might give
#'  an overall estimate of the potential of a peptide to bind to membranes or
#'  other proteins as receptors, to normalize it is divided by the number of
#'  residues. A protein have high binding potential if the index value is
#'  higher than 2.48.
#' * `properties.hydrophobicity`: GRAVY hydrophobicity index of an amino acids sequence using
#'   KyteDoolittle hydophobicity scale.
#' * `properties.instaIndex`: Guruprasad's instability index. This index predicts the stability
#'    of a protein based on its amino acid composition, a protein whose instability
#'     index is smaller than 40 is predicted as stable, a value above 40 predicts
#'      that the protein may be unstable.
#' * `properties.STYNQW`: Percentage of amino acids (S + T + Y + N + Q + W)
#' * `properties.Tiny`: Percentage of amino acids (A + C + G + S + T)
#' * `properties.Small`: Percentage of amino acids (A + B + C + D + G + N + P + S + T + V)
#' * `properties.Aliphatic`: Percentage of amino acids (A + I + L + V)
#' * `properties.Aromatic`: Percentage of amino acids (F + H + W + Y)
#' * `properties.Non-polar`: Percentage of amino acids (A + C + F + G + I + L + M + P + V + W + Y)
#' * `properties.Polar`: Percentage of amino acids (D + E + H + K + N + Q + R + S + T + Z)
#' * `properties.Charged`: Percentage of amino acids (B + D + E + H + K + R + Z)
#' * `properties.Basic`: Percentage of amino acids (H + K + R)
#'
#' @export
#'
#' @examples
#' \dontrun{
#' data <- search_phmmer(seqs = "FQTWEEFSRAAEKLYLADPMKVRVVLKYRH",
#'    dbs = c("swissprot", "uniprotrefprot", "ensembl"),
#'    verbose = TRUE)
#' df <- extract_from_HMMER_data_tbl(
#'    data,
#'    id_column = c("Swissprot", "Reference proteomes", "Ensembl"),
#'    remove_redundancy = TRUE,
#'    annotate_taxonomic = "local")
#' }
extract_from_HMMER_data_tbl <- function(
  HMMER_data_tbl,
  id_column = NULL,
  remove_redundancy = FALSE,
  annotate_taxonomic = NULL,
  evalue = NULL,
  calculate_physicochemical_properties = NULL
  ){

  if (!inherits(HMMER_data_tbl, "HMMER_data_tbl"))
    stop("extract_from_HMMER_data_tbl requires a 'HMMER_data_tbl' object")
  if (!all(c("hits", "stats", "domains") %in% colnames(HMMER_data_tbl))) {
    stop("`HMMER_data_tbl` is incorrectly formatted. ")
  }
  if (!is.null(id_column) &&
      (length(rownames(HMMER_data_tbl)) != length(id_column))) {
    stop("If added, id_column must have as many elements as rows in `HMMER_data_tbl.`")
  }
  if (any(is.na(c(HMMER_data_tbl$hits, HMMER_data_tbl$domains)))) {
    stop("NA has been found in `HMMER_data_tbl.` Have you filtered the rows with NA results before using this function?")
  }
  names(HMMER_data_tbl$hits) <- id_column
  names(HMMER_data_tbl$domains) <- id_column

  df <- purrr::map2_dfr(
    .x = HMMER_data_tbl$hits,
    .y = HMMER_data_tbl$domains,
    .id = "id",
    .f = ~{
      hits <- .x %>%
        dplyr::rename_with( ~ paste0("hits.", .))
      domains <- .y %>%
        dplyr::rename_with( ~ paste0("domains.", .))
      row <- hits %>%
        dplyr::left_join(domains,
                         by = c("hits.name" = "domains.alisqname"))
      })
    if ("fullseqfasta" %in% names(HMMER_data_tbl)) {
      seqs <- extract_sequences(HMMER_data_tbl,
                                seq_column = HMMER_data_tbl$fullseqfasta,
                                column_name = "fullseqfasta",
                                id_column = id_column)
      df <- df %>%
        dplyr::left_join(seqs,
        by = c("id" = "id", "hits.name" = "hits.name"))
    }
  if (is.numeric(evalue)) {
    df <- df %>%
      dplyr::filter(.data$hits.evalue < evalue)
  }

  if (remove_redundancy) {
    df <- df %>%
      dplyr::distinct(.data$hits.fullseqfasta, .keep_all = TRUE)
  }

  if (!is.null(annotate_taxonomic)) {
    df <- annotate_with_NCBI_taxid(
      taxid = df$hits.taxid,
      mode = annotate_taxonomic
      ) %>%
      dplyr::rename_with( ~ paste0("taxa.", .)) %>%
      dplyr::right_join(df, by = c("taxa.taxid" = "hits.taxid"))
  }

  if (!is.null(calculate_physicochemical_properties)) {
    df <- calculate_physicochemical_properties(
      seqs = df$hits.fullseqfasta) %>%
      dplyr::rename_with( ~ paste0("properties.", .)) %>%
      dplyr::right_join(df,
          by = c("properties.seqs" = "hits.fullseqfasta")) %>%
      dplyr::select(-c("properties.id")) %>%
      dplyr::rename("hits.fullseqfasta" = "properties.seqs")

  }


  class(df) <- c("HMMER_tidy_tbl", class(df))
  return(df)
}


extract_sequences <- function(HMMER_data_tbl, seq_column,
                              column_name, id_column){
  names(seq_column) <- id_column
  purrr::map2_dfr(
    .x = HMMER_data_tbl$hits,
    .y = seq_column,
    .id = "id",
    purrr::possibly(
      otherwise = tibble::tibble("hits.name" = NA, "column_name" = NA),
      .f = ~{
        fasta <- .y
        hits <- .x %>%
          dplyr::filter(.data$name %in% names(fasta))
        hits.name <- hits$name
        tibble::tibble(
          "hits.name" = hits.name,
          "column_name" = as.character(fasta[hits.name])
        )
      }
    )
  ) %>%
    magrittr::set_colnames(c("id",
                             "hits.name",
                             paste0("hits.", column_name)))
}

#' @export
summary.HMMER_tidy_tbl <- function(object, ...) {
  object %>%
    dplyr::group_by(.data$id, .data$hits.name, .data$hits.acc) %>%
    dplyr::mutate("best.ievalue" = min(.data$domains.ievalue)) %>%
    dplyr::ungroup() %>%
    dplyr::select(
      c("hits.evalue",
        "domains.cevalue",
        "domains.ievalue",
        "best.ievalue")) %>%
    as.data.frame() %>%
    summary()

}


#' @importFrom ggplot2 autoplot
#' @export
autoplot.HMMER_tidy_tbl <- function(object, threshold = 0.01, ...) {
  df <- object %>%
    dplyr::group_by(.data$id, .data$hits.name, .data$hits.acc) %>%
    dplyr::mutate("best.ievalue" = min(.data$domains.ievalue)) %>%
    dplyr::ungroup()
  p <- df %>%
    ggplot2::ggplot()+
    ggplot2::geom_segment(
      ggplot2::aes(
        x = .data$hits.name,
        xend = .data$hits.name,
        y = -log(.data$hits.evalue),
        yend = -log(.data$best.ievalue)),
      color="grey"
    ) +
    ggplot2::geom_point(
      ggplot2::aes(
        x = .data$hits.name,
        y = -log(.data$hits.evalue)),
      color = "green",  size = 1, alpha = 0.7
    ) +
    ggplot2::geom_point(
      ggplot2::aes(
        x = .data$hits.name,
        y = -log(.data$domains.ievalue)),
      color = "red", size = 1
    ) +
    ggplot2::coord_flip() +
    ggplot2::theme(
      panel.grid.major.x = ggplot2::element_blank(),
      panel.border = ggplot2::element_blank(),
      axis.ticks.x = ggplot2::element_blank(),
      axis.text.y = ggplot2::element_blank(),
      strip.text.y = ggplot2::element_text(angle = 0),
      strip.text.x = ggplot2::element_text(angle = 0)
    ) +
    ggplot2::xlab("Sequences hits") +
    ggplot2::ylab("-log(E-value)")+
    ggplot2::geom_hline(yintercept = -log(threshold), alpha = 0.5)
  return(p)
}

#' @importFrom graphics plot
#' @export
plot.HMMER_tidy_tbl <- function(x, ...) {
  print(autoplot(x, ...))
}