Nothing
## Package-level constants (referred to by many functions).
## The list of supported key types is maintained in a file in extdata;
## it is read here without a header row.
keytypeKeysDat <- read.delim(
    file = system.file("extdata", "keytypes.txt", package = "UniProt.ws"),
    header = FALSE,
    stringsAsFactors = FALSE
)
## write.table(keytypeKeysDat, file="keytypes2.txt", quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
## The list of supported additional columns (things that can be retrieved
## but not used as keys) is kept in a file called extraCols.txt, also read
## without a header row.
extraColsDat <- read.delim(
    file = system.file("extdata", "extraCols.txt", package = "UniProt.ws"),
    header = FALSE,
    stringsAsFactors = FALSE
)
## FOR NOW: we are not supporting the following 4 cols (they give us the 505)
## Also remember: adjust/comment these in man page...
## keytypeKeysDat <- keytypeKeysDat[-c(37L,38L),]
## Turn the tab-delimited response text into a two-column data.frame.
##
## str      : character; the response text. Its first line is consumed as
##            the header row by read.delim().
## from, to : names given to the first and second columns of the result.
##
## Returns a data.frame holding the first two parsed columns, renamed to
## c(from, to).
.cleanup <- function(str, from, to){
    ## Split the response into lines. The text connection is closed
    ## explicitly: left open, it lingers until the garbage collector reaps
    ## it and R emits a "closing unused connection" warning.
    con <- textConnection(str)
    lines <- tryCatch(readLines(con), finally = close(con))
    ## Collapse runs of tabs into a single tab before parsing.
    lines <- gsub("[\t]+", "\t", lines, perl = TRUE)
    res <- read.delim(text = lines, sep = "\t")
    res <- res[, c(1, 2)]
    colnames(res) <- c(from, to)
    res
}
## Submit a form request, retrying on failure.
##
## url      : address the request is sent to.
## params   : named character vector of form parameters.
## attempts : how many times to try before giving up (default 5).
## wait     : seconds to pause between attempts (default 10).
##
## Returns whatever getForm() returns on the first successful attempt.
## getForm() is not defined in this file (presumably RCurl's -- confirm).
## Signals an error once every attempt has failed.
.tryGetResult <- function(url, params, attempts = 5L, wait = 10) {
    for (i in seq_len(attempts)) {
        result <- tryCatch({
            getForm(url, .params=params, .opts=list(FOLLOWLOCATION=TRUE))
        }, error=function(err) {
            ## Report why the attempt failed instead of discarding it.
            message(
                "request to",
                "\n ", url,
                "\nfailed on attempt ", i, " of ", attempts, ":",
                "\n ", conditionMessage(err)
            )
            NULL
        })
        if (!is.null(result)) return(result)
        ## Only pause when another attempt follows; sleeping after the
        ## last failure would just delay the error.
        if (i < attempts) Sys.sleep(wait)
    }
    stop("no results after ", attempts, " attempts; please try again later")
}
## Map one batch of identifiers from one key type to another.
##
## query    : character vector of identifiers.
## from, to : key type codes sent to the mapping service; also used as the
##            column names of the result.
##
## Returns the data.frame produced by .cleanup() from the response.
.mapUni <- function(query, from, to) {
    ## The service expects the identifiers as one space-separated string.
    ids <- paste(query, collapse=" ")
    request <- c(from=from, to=to, format="tab", query=ids)
    ## The endpoint is constant.
    response <- .tryGetResult("https://www.uniprot.org/mapping/", request)
    .cleanup(response, from, to)
}
## Assign every element of `query` to a chunk of at most `chnkSize` items.
##
## chnkSize : maximum number of elements per chunk (a single number >= 1).
## query    : vector to be chunked.
##
## Returns a numeric vector as long as `query` whose i-th value is the
## (1-based) chunk number of query[i]; zero-length when `query` is empty.
.makeChunkVector <- function(chnkSize, query){
    stopifnot(length(chnkSize) == 1L, !is.na(chnkSize), chnkSize >= 1)
    ## Computing the chunk number directly from the position covers all
    ## cases uniformly, including a query whose length is exactly
    ## chnkSize (previously that case produced an empty vector, so no
    ## chunk was created at all).
    (seq_along(query) - 1L) %/% chnkSize + 1L
}
## Make one pass over the chunks that have no result yet.
##
## res : list with one slot per chunk; a slot holding NA is still pending.
## qs  : list of query chunks, parallel to `res`.
## FUN : getter called as FUN(chunk, ...).
## ... : further arguments forwarded to FUN.
##
## Returns `res` with the pending slots filled where FUN succeeded. A slot
## whose call failed keeps NA (a message is emitted), so the caller can
## see it is still pending and try again.
.tryToGetAllChunks <- function(res, qs, FUN, ...) {
    for (i in which(is.na(res))) {
        ## Assign with `res[i] <- list(...)` rather than `res[[i]] <- ...`:
        ## `[[<-` with NULL deletes the element, which used to shrink
        ## `res` and silently drop a failing trailing chunk.
        res[i] <- list(tryCatch({
            FUN(qs[[i]], ...) ## call the getter method
        }, error = function(err) {
            message(
                "error while trying to retrieve data in chunk ", i, ":",
                "\n ", conditionMessage(err),
                "\ncontinuing to try"
            )
            NA
        }))
    }
    res
}
## Run FUN over `query` in chunks and row-bind the per-chunk results.
##
## query    : vector of identifiers.
## FUN      : getter called once per chunk (via .tryToGetAllChunks).
## chnkSize : maximum number of identifiers per chunk.
## ...      : further arguments forwarded to FUN.
##
## Keeps making passes until no chunk result is NA, then returns
## do.call(rbind, <chunk results>).
dataNibbler <- function(query, FUN, chnkSize=400, ...){
    ## chunk number for every element, then split on it
    chunkIds <- .makeChunkVector(chnkSize, query)
    chunks <- split(query, as.factor(chunkIds))
    ## every chunk starts out pending (NA)
    results <- lapply(seq_along(chunks), function(i) NA)
    repeat {
        ## stop once every chunk has an answer
        if (!anyNA(results)) break
        results <- .tryToGetAllChunks(results, chunks, FUN, ...)
    }
    ## combined results
    do.call(rbind, results)
}
## Map identifiers from key type `from` to key type `to`.
##
## Queries much longer than 400 identifiers tend to get a "bad request"
## response, so the work is done in small bites by dataNibbler() and then
## reassembled.
mapUniprot <- function(from, to, query){
    message(paste0("Getting mapping data for ", query[1], " ... and ", to))
    ## `from` and `to` travel through dataNibbler's `...` to .mapUni
    dataNibbler(query, .mapUni, 400, from=from, to=to)
}
## Read a tab-delimited table from a URL, retrying on failure.
##
## url      : address to read; it is passed through URLencode() first.
## attempts : how many times to try before giving up (default 5).
## wait     : seconds to pause between attempts (default 5).
##
## Returns the data.frame from read.delim() on the first successful
## attempt. Signals an error once every attempt has failed.
.tryReadResult <- function(url, attempts = 5L, wait = 5){
    ## encode once; the value is the same on every attempt
    encoded <- URLencode(url)
    for (i in seq_len(attempts)) {
        result <- tryCatch({
            read.delim(encoded, stringsAsFactors=FALSE)
        }, error=function(err) {
            message(
                "reading url",
                "\n ", encoded,
                "\nfailed on attempt ", i, " of ", attempts, ":",
                "\n ", conditionMessage(err)
            )
            NULL
        })
        if (!is.null(result)) return(result)
        ## Only pause when another attempt follows; sleeping after the
        ## last failure would just delay the error.
        if (i < attempts) Sys.sleep(wait)
    }
    stop("no results after ", attempts, " attempts; please try again later")
}
## Helper that puts back columns missing from a result table.
##
## tab  : data.frame of results; its first column holds the ids.
## cols : requested column codes, looked up in column 2 of extraColsDat
##        and translated to the headers in column 3.
##
## Returns a data.frame with one column per entry of `cols`, in that
## order: the first is always tab's first column, the others are taken
## from `tab` when the translated header is present there and are all-NA
## otherwise. Column names are the translated headers.
backFillCols <- function(tab, cols){
    ## translate cols into the headers expected in tab
    headers <- extraColsDat[, 3][match(cols, extraColsDat[, 2])]
    ## NA marks a column that has to be replaced by a blank one
    positions <- match(headers, colnames(tab))
    filler <- data.frame(val=rep(NA, times=nrow(tab)))
    ## pick the one-column piece for every requested position ...
    pieces <- lapply(seq_along(positions), function(k) {
        if (k == 1) {
            tab[, 1, drop=FALSE] ## 1st one is always the ids
        } else if (is.na(positions[k])) {
            filler
        } else {
            tab[, positions[k], drop=FALSE]
        }
    })
    ## ... and bind them side by side in a single step
    out <- if (length(pieces) > 0) do.call(cbind, pieces) else data.frame()
    colnames(out) <- headers
    out
}
## Take UniProt IDs and fetch supplementary columns for them.
##
## query : character vector of UniProt IDs.
## cols  : character vector of column codes to retrieve.
##
## Returns a data.frame restricted to rows whose first column is one of
## the requested ids; when fewer columns than requested came back, the
## missing ones are filled in by backFillCols().
.getSomeUniprotGoodies <- function(query, cols){
    preview <- paste(head(query, 3), collapse=", ")
    ## NULL (and therefore nothing printed) for three ids or fewer
    total <- if (length(query) > 3)
        paste0("... (", length(query), " total)")
    message("Getting extra data for ", preview, total)
    ## query and cols start as character vectors
    idClause <- paste(query, collapse="+or+")
    colClause <- paste(cols, collapse=",")
    fullUrl <- paste0(
        'https://www.uniprot.org/uniprot/?query=', idClause,
        '&format=tab&columns=id,', colClause
    )
    ## This step may need to repeat (in the event that it fails).
    dat <- .tryReadResult(fullUrl)
    ## read.delim mangles names when colnames repeat or contain [CC]:
    headers <- sub("\\.\\d", "", colnames(dat))
    colnames(dat) <- sub("\\.\\.CC\\.", "", headers)
    ## drop rows that were not part of the original query
    keep <- dat[, 1] %in% query
    dat <- dat[keep, , drop=FALSE]
    ## fewer columns than requested means some came back empty
    if (ncol(dat) < length(cols) + 1) {
        dat <- backFillCols(dat, cols=c("id", cols))
    }
    dat
}
## Fetch supplementary columns for UniProt IDs, 400 ids at a time.
## `cols` is forwarded through dataNibbler's `...` to
## .getSomeUniprotGoodies.
getUniprotGoodies <- function(query, cols){
    dataNibbler(query, .getSomeUniprotGoodies, 400, cols=cols)
}
## Two-column table (taxId, taxname) taken from digestspecfile(), with
## the columns renamed to V1/V2 and the row names reset.
.availableSpecies <- function() {
    species <- digestspecfile()[, c("taxId", "taxname")]
    colnames(species) <- sprintf("V%d", 1:2)
    rownames(species) <- NULL
    species
}
## Return a data.frame of the available species.
##
## pattern : regular expression matched against the species names; the
##           default "" keeps every species.
## n       : passed to head() to limit the number of rows returned.
availableUniprotSpecies <- function(pattern="", n=Inf){
    species <- .availableSpecies()
    matched <- species[grepl(pattern, species[, 2]), ]
    rownames(matched) <- NULL
    colnames(matched) <- c("taxon ID", "Species name")
    head(matched, n)
}
## Look up the species name that belongs to a tax ID.
## Signals an error when no species matches, or when more than one does.
lookupUniprotSpeciesFromTaxId <- function(taxId){
    species <- .availableSpecies()
    hits <- species[species[, 1] %in% taxId, 2]
    nHits <- length(hits)
    if (nHits < 1) stop("No species match the requested Tax Id.")
    if (nHits > 1) stop("There may be a problem with the Tax Id data file.")
    ## exactly one match remains
    hits
}
## important resources:
## query syntax
## http://www.uniprot.org/help/text-search
## how to use the REST site (generally):
## http://www.uniprot.org/faq/28
## query fields (what you can put for the "query=" part)
## http://www.uniprot.org/help/query-fields
## I need a function that can take a UniProt and retrieve associated data.
## something like below...
## these work:
## http://www.uniprot.org/uniprot/?query=P12345&format=tab&columns=id,sequence
## http://www.uniprot.org/uniprot/?query=accession:P30443&format=tab&columns=id,sequence
## This also works (gets ALL the IDs):
## http://www.uniprot.org/uniprot/?query=organism:9606&format=tab&columns=id,sequence
## And this is if you want multiple of the same kind
## http://www.uniprot.org/uniprot/?query="P04217"+or+"P30443"&format=tab&columns=id,sequence
## Notice though that I am getting extra entries. Are they equivalent or cruft?
## They DEFINITELY are not perfectly equivalent...
## I would like to understand why some requests get me multiple hits, while others only get me one hit????
## http://www.uniprot.org/uniprot/?query="P04217"&format=tab&columns=id,sequence
## So lets look at this example here: "P30443" returns data for it and also some data for another species with the ID "G7ZFG0" </puzzled>
## http://www.uniprot.org/uniprot/?query="P30443"&format=tab&columns=id,sequence
## But if I just ask for "G7ZFG0" I don't also get the human version "P30443"...
## http://www.uniprot.org/uniprot/?query="G7ZFG0"&format=tab&columns=id,sequence
## What's more: using single quotes seems to change things AGAIN (makes it even LESS specific somehow?)
## So here is how it seems to work: Double quotes indicate one level of non-specificity (exactly what I am unsure), and single quotes make it even LESS specific! So for my purposes, I need to avoid quotes. One more BUMMER: not using quotes is the SAME as using double quotes (a small amount of non-specificity). So I get to use a post filter regardless... :(
## SO I want to format the query like this:
## http://www.uniprot.org/uniprot/?query=P04217+or+P30443&format=tab&columns=id,sequence
## I can also do things like this (thus extracting the stuff from other DBs)
## http://www.uniprot.org/uniprot/?query=P04217+or+P30443&format=tab&columns=id,database%28interpro%29
## potential values for columns (for use by getUniprotGoodies) = c("citation","clusters","comments","domains","domain","ec","id","entry name","existence","families","features","genes","go","go-id","interpro","interactor","keywords","keyword-id","last-modified","length","organism","organism-id","pathway","protein names","reviewed","score","sequence","3d","subcellular locations","taxon","tools","version","virus hosts","database(pfam)","database(pdb)")
## NOTE: parameterized column values have to be "expanded" so database became
## database(pfam) and database(pdb)...
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.