R/make_fasta.R

Defines functions check_uniprot_release sub_crap check_uniprot_job make_fasta

Documented in check_uniprot_release make_fasta sub_crap

#' Make a FASTA using UniProt accessions
#'
#' @description Given a vector of UniProt accessions, this function will:
#' 1. Download the sequences
#' 2. Print the current UniProt release (you should put this in the FASTA file name)
#' 3. (Optional) add cRAP numbers to the FASTA headers for each sequence
#' 4. Save the sequences in a FASTA file
#'
#' @param accessions `character vector`, the UniProt accessions to use
#' @param file `character`, filepath to save the fasta to e.g. `"crap.fasta"`
#' @param is_crap `logical`, Is the output going to be a cRAP database? If `TRUE`
#' cRAP001, cRAP002, etc. is appended to the sequence headers in the FASTA file.
#' Default is `FALSE`
#' @param overwrite `logical`, if the FASTA file already exists should it be
#' overwritten? Default is `FALSE`
#' @param verbose `logical`, should the function send messages to the console?
#' Default is `TRUE`
#'
#' @return Returns a FASTA file saved to disk at the specified file path.
#' @examples
#' # specify some UniProt accessions
#' crap <- get_ccp_crap()
#'
#' \dontrun{
#' make_fasta(crap, "2021-01_cRAP.fasta")
#' }
#'
#' @export
make_fasta <- function(accessions, file, is_crap = FALSE, overwrite = FALSE, verbose = TRUE) {
  # define payload used to query uniprot
  payload <- list(
    ids = paste(accessions, collapse = " "),
    from = "UniProtKB_AC-ID",
    to = "UniProtKB"
  )

  # submit job
  post_payload <- httr::POST(url = "https://rest.uniprot.org/idmapping/run",
                             body = payload, encode = "multipart", httr::accept_json())

  post_resp <- httr::content(post_payload, as = "parsed")

  # print UniProtKB release
  if (verbose) message(paste("Downloading from UniProtKB release:",
                             post_payload$headers$`x-uniprot-release`))

  # check status of job
  if (isTRUE(check_uniprot_job(post_resp[["jobId"]]))) {
    result_url <- paste("https://rest.uniprot.org/idmapping/uniprotkb/results/stream/", post_resp[["jobId"]], sep = "")

    # query uniprot and save FASTA to disk
    # security level fix for Ubuntu 20.04
    # see https://msmith.de/2020/10/02/httr-curl-ubuntu-20-04.html
    httr_config <- switch(
      Sys.info()["sysname"],
      "Linux" = httr::config(ssl_cipher_list = "DEFAULT@SECLEVEL=1"),
      httr::config()
    )

    response <- httr::with_config(
      config = httr_config,
      httr::GET(
        url = paste0(result_url, "?format=fasta"),
        config = httr::write_disk(path = file, overwrite = overwrite)
      )
    )

    # basic http error handling
    httr::stop_for_status(response, "query UniProt")
  }

  if (is_crap) {
    crap <- Biostrings::readAAStringSet(filepath = file)
    names(crap) <- sub_crap(names(crap))
    Biostrings::writeXStringSet(crap, filepath = file)
  }
}

check_uniprot_job <- function(jobid) {
  polling_interval <- 5 # seconds
  n_tries <- 20

  for (i in n_tries) {
    status_url <- paste("https://rest.uniprot.org/idmapping/status/", jobid, sep = "")
    status <- httr::content(
      httr::GET(url = status_url, httr::accept_json()),
      as = "parsed"
    )

    if (!is.null(status[["results"]]) || !is.null(status[["failedIds"]])) {
      return(TRUE)
    }
    if (!is.null(status[["messages"]])) {
      print(status[["messages"]])
      return (FALSE)
    }
    Sys.sleep(polling_interval)
  }

  return(FALSE)
}

#' Insert cRAP numbers into a character vector
#'
#' @description This function takes a character vector where each element is
#' roughly in the form of a UniProt header e.g. `"sp|XXXXXX|YYYY_YYYY Text goes here"`
#' and substitutes a cRAP number in place of the first `|` symbol, e.g.
#' `"sp|cRAP001|XXXXXX|YYYY_YYYY Text goes here"`.
#'
#' @param x `character vector`, each element must have two `|` symbols with
#' some text in between e.g. `|sometext|`.
#' @param start `numeric`, the number to increment from, default is `1`
#' @param width `numeric`, how many digits the cRAP number should be, default is `3`
#'
#' @return Returns a `character vector` the same length as x.
#'
#' @examples
#' # basic use
#' sub_crap(c("|sometext|", "|moretext|"))
#'
#' # start from a different number
#' sub_crap(c("|sometext|", "|moretext|"), start = 88)
#'
#' # increase number width
#' sub_crap(c("|sometext|", "|moretext|"), start = 1111, width = 4)
#'
#' @export
sub_crap <- function(x, start = 1, width = 3) {
  # check input
  if (!all(grepl("\\|.*\\|", x))) {
    stop("This function only works when each element of x is a character of the
         form '|anything|'")
  }

  output <- mapply(
    function(input, num) {sub("\\|", paste0("|cRAP", num, "|"), input)},
    x,
    formatC(seq.int(from = start, to = start + length(x) - 1, by = 1),
            width = width, format = "d", flag = "0"),
    USE.NAMES = FALSE
  )
  output
}

#' Check the current UniProt release
#'
#' @description UniProt releases are published approximately every 8 weeks.
#' This function checks what the current UniProt release is.
#'
#' @return Returns a `character`, the current release number in the format
#' YYYY_XX where YYYY is the calendar year and XX a 2-digit number that
#' is incremented for each release of a given year, e.g. 2010_01, 2010_02, etc.
#'
#' @examples
#' # print release number to console
#' check_uniprot_release()
#'
#' # save release number and use in e.g. a file name
#' rls <- check_uniprot_release()
#'
#' paste0("folder/filename_", rls, ".fasta")
#'
#' @export
check_uniprot_release <- function() {
  # query H. sapiens actin
  # security level fix for Ubuntu 20.04
  # see https://msmith.de/2020/10/02/httr-curl-ubuntu-20-04.html
  httr_config <- switch(
    Sys.info()["sysname"],
    "Linux" = httr::config(ssl_cipher_list = "DEFAULT@SECLEVEL=1"),
    httr::config()
  )

  response <- httr::with_config(
    config = httr_config,
    httr::HEAD(
      url = "https://rest.uniprot.org/uniprot/P60709.fasta"
    )
  )

  # basic http error handling
  httr::stop_for_status(response, "query UniProt")

  # get release
  response$headers$`x-uniprot-release`
}
CambridgeCentreForProteomics/camprotR documentation built on July 7, 2024, 2:13 a.m.