## annotation_uniprot.r: Some functions to simplify working with uniprot web
## services and text files.
## Recently I noticed that uniprot drastically changed the ways in which one may
## access their data. I think these changes will make it much easier to gather
## data from them, but I have only poked at it a little thus far.
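## As a minimal sketch of the new interface (using the same rest.uniprot.org
## proteomes endpoint as the code below), a single URL now returns plain TSV
## which readr can consume directly:
## hits <- readr::read_tsv(
##   paste0("https://rest.uniprot.org/proteomes/stream?fields=upid%2C",
##          "organism%2Corganism_id&format=tsv&query=coli"),
##   show_col_types = FALSE)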
#' Download the uniprot annotation data for a given accession/species.
#'
#' Uniprot is an astonishing resource, but man is it a pain to use. Hopefully
#' this function will help. It takes either a uniprot accession, taxonomy ID,
#' or species name and does its best to find and return the appropriate
#' uniprot annotation data.
#'
#' @param accession Which accession to grab?
#' @param species Or perhaps species?
#' @param taxonomy Query for a specific taxonomy ID rather than species/accession?
#' @param all If there is more than one hit, grab them all? (currently only
#'  handled by the older, commented-out code path)
#' @param first Or perhaps just grab the first hit?
#' @return Data frame of the annotation data for the matching proteome.
#' @seealso [xml2] [rvest]
#' @examples
#' \dontrun{
#' sc_uniprot_annot <- load_uniprot_annotations(species = "Saccharomyces cerevisiae S288c")
#' dim(sc_uniprot_annot)
#' }
#' @export
load_uniprot_annotations <- function(accession = NULL, species = "H37Rv",
                                     taxonomy = NULL, all = FALSE, first = FALSE) {
  final_species <- ""
  if (!is.null(taxonomy)) {
    request_url <- glue::glue("https://www.uniprot.org/proteomes/?query=taxonomy%3A{xml2::url_escape(taxonomy)}")
    destination <- glue::glue("{taxonomy}.txt.gz")
    if (!file.exists(destination)) {
      ## tt <- download.file(url = request_url, destfile = destination, method = "wget", quiet = TRUE)
      tt <- download.file(url = request_url, destfile = destination, quiet = TRUE)
    }
    result <- xml2::read_html(destination)
    result_html <- rvest::html_nodes(result, "tr")
    accessions_text <- rvest::html_attr(result_html, "id")
    ## The first two elements are headers.
    accessions_text <- accessions_text[3:length(accessions_text)]
    accessions <- gsub(x = accessions_text, pattern = "^(UP[0-9]+)(.*$)", replacement = "\\1")
    species_text <- rvest::html_nodes(result, "td") %>%
      rvest::html_nodes("span") %>%
      rvest::html_text()
    final_species <- species_text[species_text != ""]
    if (length(accessions) == 1) {
      accession <- accessions
    } else {
      accession <- accessions[1]
      name <- final_species[1]
    }
  } else {
    if (is.null(accession) && is.null(species)) {
      message("Defaulting to the Mycobacterium tuberculosis H37Rv strain.")
      accession <- "UP000001584"
    } else if (is.null(accession)) {
      message("Querying uniprot for the accession matching: ", species, ".")
      ## destination <- glue::glue("{tempfile()}.txt.gz")
      ## request_url <- glue::glue("https://www.uniprot.org/proteomes/?query={xml2::url_escape(species)}")
      ## Uniprot changed their web server in a few interesting ways which will
      ## ultimately make downloading annotations easier/better, but which also
      ## require rewriting a bunch of this code...
      ## The easiest short-term workaround is to use the website's 'download
      ## rest link' button to get a new URL which returns TSV text that can be
      ## dumped straight into a tbl.
request_url <- glue("https://rest.uniprot.org/proteomes/stream?compressed=false&fields=upid%2Corganism%2Corganism_id%2Cprotein_count%2Cbusco%2Ccpd&format=tsv&query={xml2::url_escape(species)}")
##if (!file.exists(destination)) {
## ## tt <- download.file(url = request_url, destfile = destination, method = "wget", quiet = TRUE)
## tt <- download.file(url = request_url, destfile = destination)
##}
##result <- xml2::read_html(destination)
##result_html <- rvest::html_nodes(result, "tr")
##accessions_text <- rvest::html_attr(result_html, "id")
#### The first two elements are headers
##accessions_text <- accessions_text[3:length(accessions_text)]
##accessions <- gsub(x = accessions_text, pattern = "^(UP[0-9]+)(.*$)", replacement = "\\1")
##species_text <- rvest::html_nodes(result, "td") %>%
## rvest::html_nodes("span") %>%
## rvest::html_text()
##final_species <- species_text[species_text != ""]
##removed <- file.remove(destination)
##if (length(accessions) == 1) {
## accession <- accessions
##} else if (isTRUE(all)) {
## for (a in 1:length(accessions)) {
## name <- final_species[a]
## accession <- accessions[a]
## message("Downloading the proteome for ", name, ".")
## tmp <- download_uniprot_proteome(accession = accession)
## Sys.sleep(time = 3)
## }
##} else if (isTRUE(first)) {
## accession <- accessions[1]
## name <- final_species[1]
## message("Downloading the proteome for ", name, ".")
## tmp <- download_uniprot_proteome(accession = accession)
##} else {
## message("Here are the species found, please choose one and try again.")
## for (a in 1:length(accessions)) {
## name <- final_species[a]
## accession <- accessions[a]
## message(a, ") ", accession, ": ", name)
## }
## message(toString(final_species))
## return(NULL)
##}
      accession_tbl <- as.data.frame(readr::read_tsv(request_url, show_col_types = FALSE))
      message("This query returned ", nrow(accession_tbl), " hits.")
      message("Arbitrarily downloading the first: ", accession_tbl[1, "Organism"], ".")
      ## TODO: honor the all/first arguments here as the commented code above did.
      accession <- as.character(accession_tbl[1, "Proteome Id"])
    }
  }
  ## request_url <- glue::glue(
  ##   "https://www.uniprot.org/uniprot/?query=proteome:\\
  ##   {accession}&compress=yes&force=true&format=txt")
  request_url <- paste0(
    "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2C",
    "protein_name%2Cgene_names%2Corganism_name%2Cabsorption%2Cft_act_site%2C",
    "ft_binding%2Ccc_catalytic_activity%2Ccc_cofactor%2Cft_dna_bind%2Cec%2C",
    "cc_activity_regulation%2Ccc_function%2Ckinetics%2Ccc_pathway%2Cph_dependence%2C",
    "redox_potential%2Crhea%2Cft_site%2Ctemp_dependence%2Cft_var_seq%2C",
    "cc_alternative_products%2Cerror_gmodel_pred%2Cfragment%2Corganelle%2Ccc_mass_spectrometry%2C",
    "length%2Cmass%2Cft_variant%2Cft_non_cons%2Cft_non_std%2Cft_non_ter%2Ccc_polymorphism%2C",
    "cc_rna_editing%2Csequence%2Ccc_sequence_caution%2Cft_conflict%2Cft_unsure%2C",
    "sequence_version%2Cgene_oln%2Cgene_orf%2Cgene_primary%2Cgene_synonym%2Corganism_id%2C",
    "xref_proteomes%2Clineage%2Clineage_ids%2Cvirus_hosts%2Cannotation_score%2Ccc_caution%2C",
    "keyword%2Ckeywordid%2Ccc_miscellaneous%2Cprotein_existence%2Ctools%2Cuniparc_id%2C",
    "comment_count%2Cfeature_count%2Ccc_interaction%2Ccc_subunit%2Ccc_developmental_stage%2C",
    "cc_induction%2Ccc_tissue_specificity%2Cgo_p%2Cgo%2Cgo_c%2Cgo_f%2Cgo_id%2Ccc_allergen%2C",
    "cc_disruption_phenotype%2Ccc_biotechnology%2Cft_mutagen%2Ccc_disease%2Ccc_pharmaceutical%2C",
    "cc_toxic_dose%2Cft_intramem%2Ccc_subcellular_location%2Cft_topo_dom%2Cft_transmem%2C",
    "ft_chain%2Cft_crosslnk%2Cft_disulfid%2Cft_carbohyd%2Cft_init_met%2Cft_lipid%2Cft_mod_res%2C",
    "ft_peptide%2Ccc_ptm%2Cft_propep%2Cft_signal%2Cft_transit%2Cstructure_3d%2Cft_strand%2C",
    "ft_helix%2Cft_turn%2Clit_pubmed_id%2Cft_coiled%2Cft_compbias%2Ccc_domain%2Cft_domain%2C",
    "ft_motif%2Cprotein_families%2Cft_region%2Cft_repeat%2Cft_zn_fing&format=tsv&query=%28",
    "proteome%3A", accession, "%29")
  ## Each %2C separates two fields, so the number of columns is the count + 1.
  num_columns <- stringr::str_count(request_url, "%2C") + 1
  ## read_tsv() wants a single compact string like "ccc...", not a vector.
  column_spec <- paste0(rep("c", num_columns), collapse = "")
  ## request_url <- paste0(
  ##   "https://rest.uniprot.org/uniprotkb/stream?compressed=false&fields=accession%2C",
  ##   "lineage%2Cvirus_hosts%2Clineage_ids%2Cgene_synonym%2Corganism_name%2C",
  ##   "organism_id%2Cprotein_name%2Cgene_orf%2Cgene_oln%2Cgene_names%2Cid%2C",
  ##   "gene_primary%2Cxref_proteomes%2Cabsorption%2Cft_act_site%2Cft_binding%2C",
  ##   "ft_ca_bind%2Ccc_catalytic_activity%2Ccc_cofactor%2Cft_dna_bind%2Cec%2C",
  ##   "cc_activity_regulation%2Ccc_function%2Ckinetics%2Cft_metal%2Cft_np_bind%2C",
  ##   "cc_pathway%2Cph_dependence%2Credox_potential%2Crhea%2Cft_site%2Ctemp_dependence%2C",
  ##   "annotation_score%2Ccc_caution%2Ckeyword%2Ckeywordid%2Cprotein_existence%2C",
  ##   "cc_miscellaneous%2Creviewed%2Ctools%2Cuniparc_id%2Ccomment_count%2Cfeature_count%2C",
  ##   "cc_interaction%2Ccc_subunit%2Cgo_p%2Cgo_c%2Cgo%2Cgo_f%2Cgo_id%2Ccc_allergen%2C",
  ##   "cc_biotechnology%2Ccc_disruption_phenotype%2Ccc_disease%2Cft_mutagen%2C",
  ##   "cc_pharmaceutical%2Ccc_toxic_dose%2Cft_intramem%2Ccc_subcellular_location%2C",
  ##   "ft_topo_dom%2Cft_transmem%2Cft_chain%2Cft_crosslnk%2Cft_init_met%2Cft_lipid%2C",
  ##   "cc_ptm%2Cft_propep%2Cstructure_3d%2Cft_strand%2Cft_helix%2Cft_turn%2C",
  ##   "lit_pubmed_id%2Cft_coiled%2Cft_compbias%2Cft_domain%2Cft_motif%2Cft_region%2C",
  ##   "ft_repeat&format=tsv&query=proteome%3A", accession, "%29")
  ## tt <- download.file(url = request_url, destfile = "test.tsv")
  ## Some uniprot results lead to a complaint from readr which looks like:
  ##     row   col expected           actual                                    file
  ##   <int> <int> <chr>              <chr>                                     <chr>
  ## 1    99    19 1/0/T/F/TRUE/FALSE "BIOPHYSICOCHEMICAL PROPERTIES: Redox p…  ""
  ## 2   219    85 1/0/T/F/TRUE/FALSE "CARBOHYD 49; /note=\"O-linked (Man...) … ""
  ## 3   700    85 1/0/T/F/TRUE/FALSE "CARBOHYD 48; /note=\"O-linked (Man...) … ""
  ## When I first read this, I assumed that the column-type inference had failed
  ## and that I could just tell readr every column is a character (hence the
  ## cheesy counting of %2Cs above). It turns out this was not the problem,
  ## though I am not certain what the actual problem is.
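  ## A less brittle way to force every column to character, should the counting
  ## above ever drift out of sync with the field list, would be readr's default
  ## column specification:
  ## retdf <- readr::read_tsv(request_url, col_types = readr::cols(.default = "c"))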
  retdf <- suppressWarnings(readr::read_tsv(request_url, col_types = column_spec))
  return(retdf)
}
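## A quick interactive sketch of the result (assuming network access, and
## assuming uniprot's TSV headers such as "Reviewed" are unchanged):
## mtb_annot <- load_uniprot_annotations(accession = "UP000001584")
## dim(mtb_annot)
## table(mtb_annot[["Reviewed"]])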
#' Extract ontology information from a uniprot dataframe.
#'
#' @param ... Arguments passed along to load_uniprot_annotations().
#' @return Data frame of uniprot accessions, GO IDs, and sequence lengths.
#' @seealso [load_uniprot_annotations()] [stringr] [tidyr]
#' @examples
#' \dontrun{
#' sc_uniprot_go <- load_uniprot_go(species = "Saccharomyces cerevisiae S288c")
#' head(sc_uniprot_go)
#' }
#' @export
load_uniprot_go <- function(...) {
  input <- load_uniprot_annotations(...)
  ## NB: this column selection assumes the annotation table has been renamed
  ## from uniprot's raw TSV headers; adjust it to match colnames(input) if the
  ## extraction fails.
  kept <- input[, c("uniprot_accessions", "go", "aa_length")] %>%
    tidyr::separate_rows("uniprot_accessions")
  ## str_extract_all() returns a list of GO IDs per row; as.character() then
  ## renders each element as a string like 'c("GO:0000001", "GO:0000002")',
  ## which separate_rows() and the gsub() calls below split and clean up.
  kept[["go"]] <- kept[["go"]] %>%
    stringr::str_extract_all(pattern = "GO:\\d+") %>%
    as.character()
  kept <- kept %>%
    tidyr::separate_rows("go", sep = ",")
  kept[["go"]] <- gsub(pattern = '"', replacement = "", x = kept[["go"]])
  kept[["go"]] <- gsub(pattern = "\\)", replacement = "", x = kept[["go"]])
  kept[["go"]] <- gsub(pattern = "c\\(", replacement = "", x = kept[["go"]])
  kept[["go"]] <- gsub(pattern = "\\s+", replacement = "", x = kept[["go"]])
  ## Rows without any GO ID come through as 'character(0)'; drop them.
  kept <- kept[grepl(x = kept[["go"]], pattern = "^GO:"), ]
  colnames(kept) <- c("ID", "GO", "length")
  return(kept)
}
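## A minimal follow-on sketch (the variable names here are hypothetical): the
## long ID/GO table collapses into a per-gene list of GO IDs with base R:
## sc_go <- load_uniprot_go(species = "Saccharomyces cerevisiae S288c")
## gene2go <- split(sc_go[["GO"]], sc_go[["ID"]])
## head(gene2go, n = 2)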
## EOF