R/getIDs.R

Defines functions getIDs

Documented in getIDs

#' getIDs
#'
#' Mapping gene identifiers.
#' An easy-to-use and robust wrapper around AnnotationDbi's mapIds function.
#'
#' @param identifiers - input gene identifiers
#'
#' @param from - input identifier type, one of (case insensitive):
#' `c("ACCNUM", "ALIAS", "ENSEMBL", "ENSEMBLPROT",
#' "ENSEMBLTRANS", "ENTREZID", "ENZYME", "EVIDENCE", "EVIDENCEALL", "GENENAME",
#' "GO", "GOALL", "IPI", "MGI", "ONTOLOGY", "ONTOLOGYALL", "PATH",
#' "PFAM", "PMID", "PROSITE", "REFSEQ", "SYMBOL", "UNIGENE", "UNIPROT")`.
#' The value is upper-cased and matched as a pattern against the key types of
#' the organism database, so an unambiguous fragment (e.g. "entrez") also
#' works. An exact match is preferred; otherwise the first match is used and
#' a warning is raised if several types match.
#'
#' @param to - output identifier type, see `from`. Matched against the
#' columns of the organism database.
#'
#' @param species - organism identifier for input genes (compared in lower
#' case). Only consulted when `taxid` is `NULL`.
#'
#' @param taxid - taxonomy identifier of the organism. When supplied it is
#' used instead of `species` to select the annotation database.
#'
#' @param quiet - logical; when `FALSE`, a message reports how many
#' identifiers could not be mapped.
#'
#' @param multiVals - forwarded to `AnnotationDbi::mapIds()`; controls what
#' happens when one input maps to several outputs. Defaults to "first".
#'
#' @param ... - currently not used by this function.
#'
#' @return output gene identifiers, named by the keys that were looked up;
#' identifiers that cannot be mapped are returned as `NA`.
#'
#' @author Tyler W Bradshaw, \email{twesleyb10@gmail.com}
#'
#' @references none
#'
#' @keywords none
#'
#' @importFrom AnnotationDbi mapIds
#'
#' @export getIDs
#'
#' @examples
#' \dontrun{
#' mygenes <- c("Syngap1", "Dlg4")
#' getIDs(mygenes, from = "symbol", to = "entrez", species = "mouse")
#' }
getIDs <- function(identifiers, from, to, species = NULL, taxid = NULL,
                   quiet = TRUE, multiVals = "first", ...) {

  # Resolve a user supplied identifier type against the types offered by the
  # database. An exact (case-insensitive) name wins silently; otherwise the
  # first pattern match is used, with a warning when the choice is ambiguous.
  resolveType <- function(query, choices, arg) {
    query <- toupper(query)
    hits <- grep(query, choices, value = TRUE)
    if (length(hits) == 0) {
      stop(
        "Input argument '", arg, "' (", query, ") does not match any of: ",
        paste(choices, collapse = ", "),
        call. = FALSE
      )
    }
    if (query %in% hits) {
      return(query)
    }
    if (length(hits) > 1) {
      warning(
        "Input argument '", arg, "' matches multiple keys: ",
        paste(hits, collapse = ", "), "\n",
        paste("Using: ", hits[1]),
        call. = FALSE
      )
    }
    hits[1]
  }

  # Check input identifiers.
  if (anyNA(identifiers)) {
    message("Warning: missing values (NA) detected in input identifiers.")
  }

  # Load annotation database info.
  annotationDBs <- mappingDBs()

  # Pick the field to search: position 1 holds the taxid, position 3 the
  # species label. taxid takes precedence when both are given.
  if (!is.null(taxid)) {
    field <- 1
    wanted <- taxid
    label <- paste0("taxid '", taxid, "'")
  } else if (!is.null(species)) {
    field <- 3
    wanted <- tolower(species)
    label <- paste0("species '", species, "'")
  } else {
    stop("Please provide a species or taxid for gene identifiers.")
  }

  # Get organism specific mapping database.
  # NOTE: sapply() is kept here because the element type returned by
  # mappingDBs() is not visible from this file.
  is_match <- sapply(annotationDBs, "[", field) == wanted
  orgDB <- unlist(annotationDBs[which(is_match)])
  if (length(orgDB) == 0) {
    stop("No annotation database found for ", label, ".", call. = FALSE)
  }
  # unlist() yields names of the form "<entry>.<field>"; keep the field part.
  names(orgDB) <- vapply(
    strsplit(names(orgDB), "\\."), "[", character(1), 2
  )

  # Load the mapping database without attaching it to the search path. The
  # database object is looked up under the same name as its package.
  dbName <- orgDB[["database"]]
  isAvailable <- suppressPackageStartupMessages(
    requireNamespace(dbName, quietly = TRUE)
  )
  if (!isAvailable) {
    stop(
      "Annotation package '", dbName, "' is required but not installed.",
      call. = FALSE
    )
  }
  osDB <- getExportedValue(dbName, dbName)

  # Get output type (a column) and input type (a key type).
  toCol <- resolveType(to, AnnotationDbi::columns(osDB), "to")
  fromKey <- resolveType(from, AnnotationDbi::keytypes(osDB), "from")

  # Check MGI format if input is MGI; keys are rewritten as "MGI:MGI:<id>".
  if (fromKey == "MGI") {
    if (!any(grepl("MGI:", identifiers))) {
      stop("Please provide MGI identifiers as MGI:ID")
    }
    lastPiece <- function(parts) {
      if (length(parts) > 0) utils::tail(parts, 1) else ""
    }
    identifiers <- paste0(
      "MGI:MGI:",
      vapply(strsplit(as.character(identifiers), "MGI:"), lastPiece,
             character(1))
    )
  }

  # Map gene identifiers.
  lookupKeys <- as.character(identifiers)
  suppressMessages({
    output <- AnnotationDbi::mapIds(osDB,
      keys = lookupKeys,
      column = toCol,
      keytype = fromKey,
      multiVals = multiVals
    )
  })

  if (is.list(output)) {
    # Replace NULL entries by NA, then flatten. Every value is named by the
    # key it came from, so names stay aligned even when one key yields
    # several values.
    output[vapply(output, is.null, logical(1))] <- NA
    keyNames <- if (length(output) == length(lookupKeys)) {
      lookupKeys
    } else {
      names(output)
    }
    nValues <- lengths(output)
    output <- unlist(output, use.names = FALSE)
    if (!is.null(keyNames)) {
      names(output) <- rep(keyNames, nValues)
    }
  } else if (length(output) == length(identifiers)) {
    names(output) <- identifiers
  }
  # Otherwise some keys were dropped by mapIds (e.g. multiVals = "filter");
  # the names it returned are kept as they are.

  # Report identifiers that could not be mapped.
  if (!quiet) {
    nUnmapped <- sum(is.na(output))
    if (nUnmapped != 0) {
      message(paste0(
        "Warning: Unable to map ", nUnmapped, " ", species, " ",
        from, "(s)", " to ", to, " identifiers!"
      ))
    }
  }

  output
}
soderling-lab/geneLists documentation built on Sept. 6, 2021, 8:22 p.m.