R/uniprot-access.R

Defines functions uniprot_add_idmap use_recommended_prokaryotes uniprot_sample_prokaryotes uniprot_map2pfam uniprot_strata uniprot_fill_strata uniprot_retrieve_proteome_table uniprot_retrieve_proteome uniprot_weight_by_ref extract_uniprot_id_from_fasta list_content uniprot_organelle_ids uniprot_downstream_ids query_sparql

Documented in extract_uniprot_id_from_fasta query_sparql uniprot_add_idmap uniprot_downstream_ids uniprot_fill_strata uniprot_map2pfam uniprot_organelle_ids uniprot_retrieve_proteome uniprot_retrieve_proteome_table uniprot_sample_prokaryotes uniprot_strata uniprot_weight_by_ref use_recommended_prokaryotes

#' Execute a SPARQL query
#'
#' @param template filename A SPARQL query file that may contain variables that will be replaced with values from `macros`
#' @param macros named vector where names are macros that occur in the template SPARQL query and values are what will replace the macro
#' @return table containing the rows of data returned from the query
query_sparql <- function(template, macros){
  query <- readLines(template) %>%
    stringr::str_replace_all(macros) %>%
    paste0(collapse="\n") %>%
    URLencode %>%
  {
    httr::content(httr::POST(
      url = 'https://sparql.uniprot.org/sparql',
      httr::accept('text/csv'),
      httr::content_type("application/x-www-form-urlencoded"),
      body = glue::glue("query={.}")
    ), show_col_types = FALSE)
  }
}

#' Get the uniprot ids downstream of a node
#'
#' @param taxid Ancestral node of clade of interest
#' @return A numeric vector of NCBI taxon ids listing all species in this clade
#' for which Uniprot has a complete proteome.
#' @export
uniprot_downstream_ids <- function(taxid){

  template <- system.file("sparql", "get-representative-proteome-taxids.rq", package="phylostratr")

  rows <- query_sparql(template, macros=c("TAXID" = as.character(taxid)))

  # The taxon ids are stored as UIDs, e.g., "http://purl.uniprot.org/taxonomy/3711"
  # So we need to strip out everything but the number and convert to int
  sub("http.*/", "", rows$taxon) %>% as.integer
}

#' Get the uniprot ids for organelle proteins of a given taxid
#'
#' @param taxid Taxon ID for species of interest 
#' @param organelle string, one of 'mitochondrion', 'chloroplast'
#' @return A numeric vector of Uniprot ids
#' @export
uniprot_organelle_ids <- function(taxid, organelle='Mitochondrion'){
  template <- system.file("sparql", "get-all-mitochondrial-uniprot-ids.rq", package="phylostratr")
  macros <- c(TAXID=taxid, ORGANELLE=organelle)
  rows <- query_sparql(template, macros)
  sub("http.*/", "", rows$protein)
}


list_content <- function(con){
  stringi::stri_split_lines(stringi::stri_trim_both(httr::content(con, encoding="UTF-8")))[[1]]
}

#' Parse a UniProt ID from a FASTA file
#'
#' UniProt FASTA files have the initial pattern:
#'   <db>|<UniProt ID>|<alternative ID> <desc>'
#'
#' Here we parse out the UniProt ID. If the FASTA header does not match the
#' pattern, we return NULL
#'
#' @param fastafile A protein FASTA file, assumed to be UniProt format
#' @return character vector of UniProt IDs IF this is a UniProt FASTA, NULL otherwise
#' @export
extract_uniprot_id_from_fasta <- function(fastafile){
  headers <- names(Biostrings::readAAStringSet(fastafile))
  if(all(grepl('^..\\|', headers))){
    sub('^...([^\\|]+).*', '\\1', headers) 
  } else {
    # This is not a UniProt FASTA file
    NULL
  }
}

#' Make reference-species preferring weight vector for diverse_subtree
#'
#' @param weight How much to prefer the reference species. The default, 1.05,
#' will weakly prefer them, acting mostly as a tie-breaker. Higher weights
#' could lead to reduced diversity.
#' @param clade Weight taxa descending from this clade. The default is 2759
#' (Eukaryota).
#' @export
uniprot_weight_by_ref <- function(weight=1.05, clade=2759){
  refs <- uniprot_downstream_ids(clade)
  weights <- rep(1.1, length(refs))
  names(weights) <- refs
  weights 
}

#' Retrieve UniProt proteome
#'
#' The proteome is written to a file with the name '<taxid>.faa', for example,
#' Arabidopsis thaliana, which has the id 3702, will be written the '3702.faa'.
#'
#' @param taxid An NCBI taxonomy id
#' @param dir Directory in which to write all FASTA files
#' @param verbose Be super chatty
#' @export
#' @examples
#' \dontrun{
#' # uniprot_retrieve_proteome(3702)
#' }
uniprot_retrieve_proteome <- function(taxid, dir = 'uniprot-seqs', verbose = TRUE){
  if(!dir.exists(dir)){
    dir.create(dir, recursive=TRUE)
  }
  fastafile <- file.path(dir, paste0(taxid, '.faa'))
  if(file.exists(fastafile)){
    maybe_message("Skipping %s - already retrieved", verbose, taxid)
  } else {
    uniprot_retrieve_proteome_table(taxid) %>%
    {
      paste0(">", .$uniprot_uid, "\n", .$aa_sequence, collapse="\n")
    } %>%
      writeLines(fastafile)
  }
  fastafile
}

#' Retrieve UniProt proteome table
#'
#' Tries to retrieve all proteins in a "representative" proteome for the given
#' species. If the species does not have a representative proteome, retrieve
#' all proteins for the species.
#'
#' @param taxid An NCBI taxonomy id
#' @export
#' @examples
#' \dontrun{
#' x <- uniprot_retrieve_proteome_table(3702)
#' }
uniprot_retrieve_proteome_table <- function(taxid){
  template <- system.file("sparql", "get-representative-proteome-sequences.rq", package="phylostratr")
  macros <- c(TAXID=as.character(taxid))
  rows <- query_sparql(template, macros)

  if(nrow(rows) == 0){
    warning(glue::glue("Could not find representative proteome for {taxid}, retrieving all records, may be incomplete or contain redundant entries"))
    template <- system.file("sparql", "get-all-taxid-sequences.rq", package="phylostratr")
    macros <- c(TAXID=as.character(taxid))
    rows <- query_sparql(template, macros)
  }

  rows$uniprot_uid <- sub("http.*/", "", rows$uniprot_uid)
  rows
}


#' Download sequence data for each species in a UniProt-based strata
#'
#' @param strata Strata object where all species are represented in UniProt.
#' @param ... Additional arguments for \code{uniprot_retrieve_proteome}
#' @return Strata object where 'data' slot is filled with protein FASTA files
#' @export
uniprot_fill_strata <- function(strata, ...){
  species <- strata@tree$tip.label
  strata@data$faa <- lapply(strata@tree$tip.label, uniprot_retrieve_proteome, ...)
  names(strata@data$faa) <- strata@tree$tip.label
  strata
}

#' Given a focal taxid, build a UniProt-based Strata object
#'
#' @param taxid The focal species NCBI taxon id
#' @param from stratum to begin from, where 1 is 'cellular organisms'
#' @return Strata object
#' @export
uniprot_strata <- function(taxid, from=2){

  # ensure the focal gene is included, even if not in uniprot (fix #10)
  add_focal <- function(xs){
    if(!(taxid %in% xs)){
      message("The focal species is not present in UniProt. ",
              "You may add it after retrieving uniprot sequences ",
              "(i.e. with 'uniprot_fill_strata') with a command such as: ",
              "strata_obj@data$faa[[focal_taxid]] <- '/path/to/your/focal-species.faa'")
      xs <- c(taxid, xs)
    }
    xs
  }

  taxizedb::classification(taxid)[[1]]$id[from] %>%
    uniprot_downstream_ids %>%
    add_focal %>%
    taxizedb::classification() %>%
    Filter(f=is.data.frame) %>%
    lineages_to_phylo(clean=TRUE) %>%
    Strata(
      focal_species = taxid,
      tree       = .,
      data       = list()
    )
}

#' Map UniProt IDs for an organism to PFAM IDs
#'
#' @param taxid Species NCBI taxonomy ID
#' @return A data.frame with columns 'uniprotID' and 'pfamID'
#' @export
uniprot_map2pfam <- function(taxid){
  template <- system.file("sparql", "get-representative-proteome-pfam-map.rq", package="phylostratr")
  query_sparql(template, macros=c("TAXID" = as.character(taxid))) %>%
    dplyr::mutate(
      uniprotID = sub("http.*/", "", uniprotID),
      pfamID = sub("http.*/", "", pfamID)
    )
}

#' Randomly sample prokaryotic representatives
#'
#' This function was used to select the prokaryotes used in the paper. I do not
#' recommend using it now though. It can select unclassified taxa.
#'
#' @param downto the lowest phylogenetic rank that shoul be sampled
#' @param remake whether to replace an existing file
#' @return phylo object containing the prokaryptic sample tree
uniprot_sample_prokaryotes <- function(downto='class', remake=FALSE){

  old.file <- system.file('extdata', 'prokaryote_sample.rda', package='phylostratr')
  if(!remake && file.exists(old.file)){
    return(readRDS(old.file))
  }

  # Get all bacterial and Archael classes (class is one level below phylum)
  prokaryote_classes <- taxizedb::downstream(c('eubacteria', 'Archaea'), downto=downto, db='ncbi')

  # Get all uniprot reference genomes for each bacterial class
  bacteria_class_reps <- lapply(
      prokaryote_classes$eubacteria$childtaxa_id,
      uniprot_downstream_ids
    )
  names(bacteria_class_reps) <- prokaryote_classes$eubacteria$childtaxa_id

  # Get all uniprot reference genomes for each bacterial class
  archaea_class_reps <- lapply(
      prokaryote_classes$Archaea$childtaxa_id,
      uniprot_downstream_ids
    )
  names(archaea_class_reps) <- prokaryote_classes$Archaea$childtaxa_id

  # From each class, randomly select a single uniprot reference genome
  sample_taxids <- function(x, ...){
      # This is required, because R is evil. If x is of length 1 and is numeric,
      # then `sample` treats it as the upperbound of a discrete distribution
      # between 1 and x. Otherwise it is treated as a set to be sampled from.
      sample(as.character(x), ...) %>% as.integer
  }
  bacteria_taxids <- bacteria_class_reps %>%
      Filter(f = function(x){ length(x) > 0}) %>%
      lapply(sample_taxids, size=1) %>% unlist
  archaea_taxids <- archaea_class_reps %>%
      Filter(f = function(x){ length(x) > 0}) %>%
      lapply(as.character) %>%
      lapply(sample_taxids, size=1) %>% unlist

  c(bacteria_taxids, archaea_taxids) %>%
    taxizedb::classification() %>%
    lineages_to_phylo
}

#' Use the prokaryotic species that I use
#'
#' This set of species is currently not all that great. It is a sample of
#' bacterial and archaeal species from each class in each domain.
#'
#' @param x Strata object
#' @return x Strata object with the basal stratum (ID=131567) replaced
#' @export
use_recommended_prokaryotes <- function(x){
  prokaryote_sample <- readRDS(system.file(
    'extdata',
    'prokaryote_sample.rda',
    package='phylostratr'
  ))
  # extract the Eukaryota branch
  x@tree <- subtree(x@tree, '2759', type='name')
  # get 'cellular_organism -> Eukaryota' tree (just these two nodes)
  root <- taxizedb::classification(2759)[[1]]$id %>% lineage_to_ancestor_tree
  # bind the prokaryotes to 'cellular_organism'
  y <- ape::bind.tree(root, prokaryote_sample)
  # bind the eukaryote input tree to 'Eukaryota'
  y <- ape::bind.tree(y, x@tree, where=which(tree_names(y) == '2759'))
  x@tree <- y
  x
}

#' Extract an ID map from uniprot fasta files
#'
#' This assumes the fasta files respect the UniProt fasta header format.
#'
#' @param strata Strata object 
#' @return Strata object with 'idmap' data field
#' @export
uniprot_add_idmap <- function(strata){
  is_valid_strata(strata, required='faa')
  strata@data$idmap <- lapply(strata@data$faa, function(x){
     Biostrings::readAAStringSet(x) %>%
       names %>%
       strsplit('\\|') %>%
       purrr::transpose() %>%
       lapply(unlist) %>%
       {tibble::data_frame(
         db = .[[1]],
         uniprot_id = .[[2]],
         other_id = sub(" .*", "", .[[3]])
       )}
  })
  names(strata@data$idmap) <- names(strata@data$faa)
  strata
}
arendsee/phylostratr documentation built on Dec. 31, 2022, 10:22 a.m.