R/methods-select.R

Defines functions .select .getUPMappdata .mergeList .keys .cols .keytypes

## Here we will define the select methods.

## no way to 'discover' the keytypes either (and they are subset of cols).
.keytypes <- function(){
  keytypeKeysDat[,1]
}

setMethod("keytypes", "UniProt.ws", function(x){.keytypes()})


## no way to 'discover' the cols, so I hard code them here.
.cols <- function(){
  c(keytypeKeysDat[,1], extraColsDat[,1])
}

setMethod("columns", "UniProt.ws", function(x){.cols()})

## http://www.UniProt.org/UniProt/?query=organism:9606&format=tab&columns=id,sequence

## To make keys work I just want to return what was asked for...
.keys <- function(x, keytype){
  if(!any(keytypes(x) %in% keytype)){
    stop("keytype argument MUST match a value returned by keytypes method")
  }
  dat <- taxIdUniprots(x) ## pre-cached
  if(keytype == "UNIPROTKB"){
    return(dat)
  }else{
    ## then convert this to be the keytype requested...
    tkt = keytypeKeysDat[keytypeKeysDat[,1] %in% keytype,2]
    dat2 <- mapUniprot(from="ACC+ID", to=tkt, query=dat)
    return(unique(as.character(dat2[,2])))
  }
}

setMethod("keys", "UniProt.ws",
    function(x, keytype){
      if(missing(keytype)){stop("Please supply a keytype argument.")}
      .keys(x, keytype)
    }
)

.mergeList <- function(list, joinType="left"){
  for(i in seq_len(length(list))){
    if(i==1){
      fin <- list[[1]]
    }else{
      if(joinType=="left"){
        fin <- merge(fin, list[[i]], by="ACC+ID", all.x=TRUE) ## left outer join
      }
      else if(joinType=="all"){
        fin <- merge(fin, list[[i]], by="ACC+ID", all=TRUE) ## full outer join
      }
    }
  }
  fin
}

.getUPMappdata <- function(colMappers, keys){
  ## get a list of mapping results (as data.frames)
  res <- lapply(colMappers, FUN=mapUniprot, from="ACC+ID", query=keys)
  ## Them merge all these mappings together based on UniProt.
  .mergeList(res, joinType="all")
}

## Here is the business end of my select method.
## The big plan is to call mapUniprot() and getUniprotGoodies()
## (merging when necessary)
.select <- function(x, keys, cols, keytype){
  if(!any(keytypes(x) %in% keytype)){
    stop("keytype argument MUST match a value returned by keytypes method")
  }
  if(!any(columns(x) %in% cols)){
    stop("columns argument MUST match a value returned by columns method")
  }

  max_key_length <- 100
  if(length(keys) > max_key_length)
  {
    message("Uniprot limits queries with a large amount of keys. ",
            "It's recommended that the select method be invoked ",
            "with fewer than ", max_key_length," keys or the query ",
            "may fail.")
  }

  if(all(c("GENEID", "ENTREZ_GENE") %in% cols)){
      message("GENEID and ENTREZ_GENE are the same.\n",
              "  returning only GENEID in results.")
      cols = cols[-which(cols == "ENTREZ_GENE")]
  }

  ## process columns
  oriTabCols <- unique(c(keytype,cols))
  cols <- cols[!(cols %in% keytype)]  ## remove keytype from cols 
  if (!length(cols))
      stop("'columns' should be different from 'keytype'")
  trueKeys <- keys ## may change depending on keytype.
  ## split into 2 groups: cols in keytypeKeys and cols in extraCols 
  colMappers <- cols[cols %in% keytypeKeysDat[,1]]
  colUPGoodies <- cols[cols %in% extraColsDat[,1]]
  ## then convert those into the internally used IDs
  colMappers <- keytypeKeysDat[keytypeKeysDat[,1] %in% colMappers, 2]
  ## Don't want ACC+ID in colMappers:
  colMappers <- colMappers[colMappers != "ACC+ID"]
  colUPGoodies <- extraColsDat[extraColsDat[,1] %in% colUPGoodies, 2]
  res <- list()
  if(keytype!="UNIPROTKB" ){
    kt <- keytypeKeysDat[keytypeKeysDat[,1] %in% keytype,2]
    dat <- mapUniprot(from=kt, to="ACC", query=keys)
    colnames(dat)[2] <-  "ACC+ID" ## always the 2nd one...
    ## capture UniProts as keys from this point on
    keys <- unique(dat[["ACC+ID"]])
    res <- c(res, list(dat))
  }

  ## All the (UNIPROTKB) possible keys for this organism
  orgSpecificKeys <- keys(x, keytype="UNIPROTKB")
  ## Now filter keys with orgSpecificKeys (uniprots intersected with uniprots)
  keys <- intersect(keys, orgSpecificKeys)
  if(length(keys)==0) stop("No data is available for the keys provided.")

  ## now get the other data (depending what was asked for)
  if (length(colMappers))
      res <- c(res, list(.getUPMappdata(colMappers, keys)))
  if (length(colUPGoodies) > 0) {
    dat <- getUniprotGoodies(keys, colUPGoodies)
    colnames(dat)[1] <- "ACC+ID" ## always the 1st
    res <- c(res, list(dat))
  }
  ## At this point I have some results, Now I just need to merge them base on
  ## UniProt IDs (and upon whether or not they are real)
  tab <- .mergeList(res, joinType="all")
  ## rename cols:
  rosetta <- rbind(keytypeKeysDat, extraColsDat)
  ## We need the third col of rosetta to tell us what the cols will come back
  ## from the service as
  ## match does not suffice - duplicate entries in rosetta[,3] only picks up
  ## first match

  idx <- match(colnames(tab), rosetta[,3])
  colnames(tab) <- rosetta[idx,1]
  if(("ENTREZ_GENE" %in% colnames(tab)) && !("ENTREZ_GENE" %in% oriTabCols))
      colnames(tab)[which(colnames(tab) == "ENTREZ_GENE")] = "GENEID"

  ## unique to this web service is the fact that I sometimes will have an
  ## extra UNIPROTKB col.  Regardless, we ONLY want the cols we asked for..
  ## BUT: we also can't try this if the above code has failed to rename anything
  if (all(!is.na(colnames(tab))))
    tab <- tab[,colnames(tab) %in% oriTabCols]
  ## resort
  tab <- resort_base(tab, trueKeys, keytype, oriTabCols)
  ## Now one last cast to make NAs (and all cols) and make things "uniform"
  cnames <- colnames(tab)
  .blankToNA <- function(row){
      gsub(pattern="^$",replacement=as.character(NA),row)}
  tab <- data.frame( t(apply(tab,MARGIN=1,.blankToNA)), stringsAsFactors=FALSE)
  colnames(tab) <- cnames
  ## then return
  tab
}

setMethod("select", "UniProt.ws",
    function(x, keys, columns, keytype, ...){
          if (missing(keytype)) 
              keytype <- "UNIPROTKB"
          .select(x, keys, columns, keytype)
        }
)

Try the UniProt.ws package in your browser

Any scripts or data that you put into this service are public.

UniProt.ws documentation built on Nov. 8, 2020, 5:58 p.m.