R/uniprot.R

#' Add uniprot information
#'
#' @export
tor_add_uniprot_info <- function(data, uniprot_data, quiet = FALSE) {

  if (missing(data)) stop("no data frame supplied", call. = FALSE)
  if (missing(uniprot_data)) stop("no uniprot data supplied, use tor_fetch_uniprot_proteins", call. = FALSE)

  if ("uniprot_id" %in% names(data) && "uniprot_id" %in% names(data)) {
    join_by <- "uniprot_id"
    if ("prot_id" %in% names(data)) {
      # if prot id is in the dataset, don't take it from uniprot
      uniprot_data <- select(uniprot_data, -prot_id)
    }
  } else if ("prot_id" %in% names(data) && "prot_id" %in% names(data)) {
    join_by <- "prot_id"
  } else
    stop("don't know what to join these data by, missing uniprot_id and prot_id", call. = FALSE)

  if (!quiet)
    glue("Info: adding uniprot information to mass spec data, joining by '{join_by}'...") %>%
      message(appendLF = FALSE)

  # combine
  data_combined <-
    right_join(uniprot_data, data, by = join_by) %>%
    mutate(missing_uniprot = is.na(prot_name))

  # info
  if (!quiet)
    glue("{nrow(filter(data_combined, !missing_uniprot))} records joined successfully, ",
         "{nrow(filter(data_combined, missing_uniprot))} could not be matched with uniprot records") %>%
    message()

  return(data_combined)
}

#' Fetch uniprot species
#'
#' Searches the uniprot data base for bacterial species based on the seacrh term.
#'
#' @param search_term taxa search term
#' @export
tor_fetch_uniprot_species <- function(search_term, quiet = FALSE) {

  if (missing(search_term))
    stop("no species search term supplied", call. = FALSE)

  query <- glue('
PREFIX up:<http://purl.uniprot.org/core/>
PREFIX taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
SELECT ?taxon_id ?taxon_name
WHERE
{
	?taxon a up:Taxon .
	?taxon up:scientificName ?taxon_name .
    # bacteria only
	?taxon rdfs:subClassOf taxon:2 .
  	BIND (STRAFTER(STR(?taxon),"taxonomy/") AS ?taxon_id) .
  	FILTER regex(?taxon_name,"[search_term]","i")
}', .open = '[', .close = ']')

  # run query
  taxa <- run_uniprot_csv_query(
    query = query,
    message = glue("taxa with '{search_term}' in the name"),
    read_cache = FALSE,
    save_file_name = glue("uniprot_last_taxa_search.csv"),
    col_types = cols(
      taxon_id = col_integer(),
      taxon_name = col_character()
    ),
    quiet = quiet)

  if (nrow(taxa) == 0) {
    glue("no taxa match the search term, please try a different search") %>%
      warning(immediate. = TRUE, call. = FALSE)
  }

  return(taxa)
}



#' Fetch uniprot information
#'
#' @param taxon the ID of the species
#' @export
tor_fetch_uniprot_proteins <- function(taxon, read_cache = TRUE, cache_dir = "cache", quiet = FALSE) {

  if (missing(taxon))
    stop("no taxon provided to search for", call. = FALSE)

  # fetch from server
  query <- glue('
PREFIX up:<http://purl.uniprot.org/core/>
PREFIX taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
SELECT ?prot_id, ?uniprot_id, ?gene, ?prot_name, ?prot_mw
WHERE {
  	?protein a up:Protein .
    # organism filter
    ?protein up:organism taxon:[taxon] .
    ##  	?protein up:organism/up:scientificName ?organism .
    # mnemonic (for mapping to massacre output)
    ?protein up:mnemonic ?prot_id .
    # recommended protein name
  	OPTIONAL { ?protein up:recommendedName/up:fullName ?prot_name } .
    ## maybe: recommended EC number - but can have multiple
    ## OPTIONAL { ?protein up:recommendedName/up:ecName ?rec_ec_nr } .
    ## maybe? short name (NOTE: can be multiple, i.e. lead to column duplicates)
    ##   OPTIONAL { ?protein up:recommendedName/up:shortName ?rec_prot_short } .
    # gene
    OPTIONAL { ?protein up:encodedBy/skos:prefLabel ?gene } .
    ## maybe? locus name, can be multipe i.e. lead to column duplicates
    ##   OPTIONAL { ?protein up:encodedBy/up:locusName ?locus } .
    ## maybe? amino acid sequence - can be multiple, i.e. lead to column duplicates
    ##   ?protein up:sequence/rdf:value ?seq .
    # protein mass in Da
    ?protein up:sequence/up:mass ?mass_int .
    BIND(STR(?mass_int) AS ?prot_mw) .
    # protein id
  	BIND (STRAFTER(STR(?protein),"uniprot/") AS ?uniprot_id) .
    ## maybe: metacyc (if available)
    # 	OPTIONAL {
    #    	?protein rdfs:seeAlso ?metacyc .
    # 		?metacyc up:database <http://purl.uniprot.org/database/BioCyc> .
    #     BIND(STRAFTER(STR(?metacyc), "MetaCyc:") AS ?metacyc_id) .
    #   	FILTER regex(?metacyc,"MetaCyc","i")
    #   } .
    ## filter for testing purposes
    ##   #?protein up:mnemonic ?mnemonic .
    ##  	#FILTER regex(?mnemonic,"^cysk","i")
}
', .open = '[', .close = ']')

  # run query
  proteins <- run_uniprot_csv_query(
    query = query,
    message = glue("proteins of taxon {taxon}"),
    read_cache = read_cache, cache_dir = cache_dir,
    save_file_name = glue("uniprot_{taxon}_proteins.csv"),
    col_types = cols(
      prot_id = col_character(),
      uniprot_id = col_character(),
      gene = col_character(),
      prot_name = col_character(),
      prot_mw = col_integer()
    ),
    quiet = quiet)

  # safety checks
  reps <- proteins %>% group_by(uniprot_id) %>% mutate(n = n()) %>% filter(n > 1)
  if (nrow(reps) > 0) {
    glue("{nrow(reps)} of the {nrow(proteins)} records have more than one entry",
         " - care must be taken during merging with protein data") %>%
      warning(immediate. = TRUE, call. = FALSE)
  }
  if (nrow(proteins) == 0) {
    glue("no protein records recovered, please double check the taxon ID ({taxon}) is valid") %>%
      warning(immediate. = TRUE, call. = FALSE)
  }

  return(proteins)
}


# run a uniprot SPARQL query requesting csv data
# @param ... parameters to read_csv
run_uniprot_csv_query <- function(query, message = NULL,
                                  read_cache = TRUE, cache_dir = "cache",
                                  save_file_name = "uniprot.csv", quiet = FALSE, ...) {

  # uniprot SPARQL endpoint
  endpoint <- "http://sparql.uniprot.org/sparql"

  # caching
  if (!dir.exists(cache_dir)) dir.create(cache_dir, recursive = TRUE)
  cache_path <- file.path(cache_dir, save_file_name)

  # run query
  if (!read_cache || !file.exists(cache_path)) {

    if (!quiet) {
      if (!is.null(message))
        glue("Info: querying uniprot database for {message}... ") %>%
        message(appendLF = FALSE)
      else
        message("Info: querying uniprot database... ", appendLF = FALSE)
    }

    # retrieve data
    csv_data <- query %>%
      # URL encode query
      URLencode(reserved = TRUE) %>%
      # + replacement
      str_replace("\\+", "%2B") %>%
      # generate query
      { str_c(endpoint, "?query=", .) } %>%
      # request data (different format options)
      #getURL(httpheader = c(Accept = "application/sparql-results+xml"))
      #getURL(httpheader = c(Accept = "Accept: application/json"))
      getURL(httpheader = c(Accept = "Accept: text/csv"))

    # store in file
    csv_data %>% cat(file = cache_path)

    # user info
    if (!quiet) {
      glue("retrieved {str_count(csv_data, '\\n') - 1L} records"
         #  " - data stored in '{cache_path}'"
         ) %>%
        message()
    }

    data <- read_csv(cache_path, ...)

  } else {
    # read from cache
    if (!quiet) {
      if (!is.null(message))
        glue("Info: reading uniprot {message} from cached file ",
             "(use read_cache = FALSE to disable)... ") %>%
        message(appendLF = FALSE)
      else
        glue("Info: reading uniprot data from cached file ",
             "(use read_cache = FALSE to disable)... ") %>%
        message(appendLF = FALSE)
    }

    data <- read_csv(cache_path, ...)

    if (!quiet)
      glue("retrieved {nrow(data)} records") %>% message()
  }

  return(data)
}
KopfLab/turnoveR documentation built on May 28, 2019, 3:33 p.m.