R/predict_topology.R

Defines functions predict_topology

Documented in predict_topology

#' Predict the topology of proteins from file
#'
#' Predict the topology of zero, one or more proteins,
#' of which the names and sequences are stored in the FASTA format
#' @inheritParams default_params_doc
#' @return a \link[tibble]{tibble} with the columns 'name'
#' and 'topology', where the 'name' column hold all the proteins' names,
#' and 'topology' contains all respective topologies.
#' @seealso use \link{mock_predict_topology} to do a mock prediction,
#'   as can be useful in testing
#' @note unlike PureseqTM, the topologies predicted are returned in
#'   the same order as the original sequences.
#'   A bugreport is posted at the PureseqTM GitHub repository
#'   at \url{https://github.com/PureseqTM/PureseqTM_Package/issues/11}
#' @examples
#' if (is_pureseqtm_installed()) {
#'   fasta_filename <- get_example_filename("1bhaA.fasta")
#'   predict_topology(fasta_filename)
#' }
#' @author Richèl J.C. Bilderbeek
#' @export
predict_topology <- function(
  fasta_filename,
  folder_name = get_default_pureseqtm_folder(),
  topology_filename = tempfile(fileext = ".top")
) {
  # Empty FASTA file returns an empty tibble
  if (length(readr::read_lines(fasta_filename)) == 0) {
    return(
      tibble::tibble(
        name = character(0),
        topology = character(0)
      )
    )
  }
  # PureseqTM does not preserve the order of genes in the FASTA file.
  # This is bug in PureseqTM, reported at
  # https://github.com/PureseqTM/PureseqTM_Package/issues/11
  #
  # Up until then, pureseqtmr needs to correct for this:
  #
  # Use the tibble with the correctly ordered genes
  # Contains the protein names and sequences
  t_fasta <- pureseqtmr::load_fasta_file_as_tibble(fasta_filename)
  pureseqtmr::check_protein_sequences(t_fasta$sequence)
  # Contains the protein names and topologies in the same
  # sequence as t_fasta
  t_topology <- tibble::tibble(
    name = t_fasta$name,
    topology = NA
  )

  # Go through the sequence one-by-one due to PureseqTM quirk
  for (i in seq_len(nrow(t_fasta))) {
    # Create a FASTA file only for the sequence
    this_fasta_filename <- tempfile(
      pattern = paste0("pureseqtmr_", i, "_"),
      fileext = ".fasta"
    )
    pureseqtmr::save_tibble_as_fasta_file(
      t = t_fasta[i, ],
      fasta_filename = this_fasta_filename
    )
    pureseqtm_proteome_text <- pureseqtmr::run_pureseqtm_proteome(
      fasta_filename = this_fasta_filename,
      folder_name = folder_name,
      topology_filename = topology_filename
    )
    t_this_topology <- pureseqtmr::parse_pureseqtm_proteome_text(
      pureseqtm_proteome_text
    )
    testthat::expect_equal(1, nrow(t_this_topology))
    t_topology$topology[i] <- t_this_topology$topology[1]
  }
  testthat::expect_equal(nrow(t_fasta), nrow(t_topology))
  testthat::expect_equal(
    t_fasta$name,
    t_topology$name
  )

  if (!all(nchar(t_fasta$sequence) == nchar(t_topology$topology))) {
    bad_indices <- which(nchar(t_fasta$sequence) != nchar(t_topology$topology))
    warning(
      paste0(
        "Protein sequence and topology has a different length for indices: ",
        paste(bad_indices, collapse = " ")
      )
    )
    for (i in bad_indices) {
      warning(
        "index ", i, ": ",
        "sequence length: ", nchar(t_fasta$sequence[i]),
        ", topology length: ", nchar(t_topology$topology[i])
      )
    }

  }
  t_topology
}

Try the pureseqtmr package in your browser

Any scripts or data that you put into this service are public.

pureseqtmr documentation built on April 6, 2023, 5:20 p.m.