inst/scripts/recoverIDs.R

# recoverIDs.R

recoverIDs <- function(ensg, mart = myMart) {
  # Purpose:
  #     Try to recover IDs for ensg to entrez mapping from biomart
  # Parameters:
  #     ensg: (character) a vector of ensemble gene IDs
  #     mart: (Mart)      an ensemble mart object
  #     "HGNC" must exist in the global namespace
  # Value:
  #     result: a dataframe with columns "ensg" containing the ensemble
  #             gene IDs of the input that could be mapped, and "entrez",
  #             which contains the corresponding entrez symbols, and rownames
  #             ensg.
  
  # Note: to figure out the correct filters and attributes to use in
  #       querying a biomart, first fetch the filters and attributes with
  #       code like:
  #         filt <- biomaRt::listFilters(myMart)
  #         attr <- biomaRt::listAttributes(myMart)
  #       ... and then query:
  #         attr$name[grep("RefSeq", attr$description, ignore.case = TRUE)]
  
  # Define which attributes we want to fetch from biomart, and which columns
  # those match to in "HGNC":
  myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
  
  myAtt <- data.frame(biomart = c("hgnc_symbol",
                                  "entrezgene",
                                  "ucsc"),
                      HGNC =    c("sym",
                                  "GeneID",
                                  "UCSCID"),
                      stringsAsFactors = FALSE)
  
  # Send off biomart query
  bm <- biomaRt::getBM(filters    =   "ensembl_gene_id",
                       attributes = c("ensembl_gene_id",
                                      myAtt$biomart),
                       values     = ensg,
                       mart       = myMart)
  
  if (nrow(bm) > 0) {                   # at least one match was returned
    bm$entrez <- rep(NA, nrow(bm))         # add a column to hold map results
    for (iCol in seq_len(ncol(bm))) {   # replace all "" with NA
      # Careful: combining logical vectors that can include NA is tricky.
      # Select elements that are neither already NA nor not-empty
      sel <- ( ! is.na(bm[ , iCol])) & (bm[ , iCol] == "")
      bm[sel, iCol] <- NA               # replace
    }
  }
  
  for (iAtt in seq_len(nrow(myAtt))) { # iterate over all requested attributes
    thisBmAtt <- myAtt$biomart[iAtt]
    thisHuAtt <- myAtt$HGNC[iAtt]
    if ( ! all(is.na(bm[ , thisBmAtt]))) {
      # some IDs were returned
      IDs <- bm[ , thisBmAtt]
      # get the symbol for a match, NA otherwise
      entrez <- HGNC$GeneID[match(IDs, HGNC[ , thisHuAtt], incomparables = NA)]
      sel <- ( ! is.na(entrez))
      bm$entrez[sel] <- entrez[sel] # Overwrite those that are not NA.
      # If there are multiple IDs returned for one row
      # in effect we return the last one that was
      # matched.
    }
  }
  # Post-process. 
  bm <- bm[! is.na(bm$entrez), c("ensembl_gene_id", "entrez")]
  bm <- bm[! duplicated(bm$ensembl_gene_id), ]
  matchedIDs <- match(ensg, bm$ensembl_gene_id)
  
  esMap <- data.frame(ensg = ensg,
                      entrez = bm$entrez[matchedIDs],
                      stringsAsFactors = FALSE)
  rownames(esMap) <- esMap$ensg
  
  # drop NAs
  esMap <- esMap[ ! is.na(esMap$entrez), ]
  
  return(esMap)
}


# ====  TESTS  =================================================================
if (FALSE) {
  test <- recoverIDs(c("ENSG00000121410", "ENSG00000268895"))
  testthat::expect_identical(test[1,2], 1) #Tests passed
}


# [END]
raywoo32/HPA documentation built on May 8, 2019, 1:22 p.m.