#' Make Gene Names Legal Variable Names
#'
#' It is common that gene names are saved as names of rows or columns of an
#' expression matrix. In R, this could create some problems because some gene
#' names are not legal variable names and thus will be converted when they are
#' saved as column names (which R assumes that they should be legal variable
#' names). This automatic confusion might interfere with name matching.
#' \code{normalize_genename()} converts gene names to make them legal variable
#' names, thereby make sure the gene names compared are consistently after
#' conversin. \code{normalize_genename()} will add a prefix (gn_) to gene
#' names to make gene names starting with a number legal, and then calls
#' \code{\link[base]{make.names}()} to convert illegal punctuations.
#' @param gene a character vector containing gene names
#'
#' @return a character vector containing prefixed and converted gene names
#' @export
#'
#' @seealso \code{\link{denormalize_genename}()}
#'
#' @examples
#' normalize_genename("128up")
normalize_genename <- function(gene) {
# Make gene names legal colnames which can be safely reversed by removing the "gn_" prefix
genelist <- paste0("gn_", gene)
genelist <- make.names(genelist, unique = FALSE)
return(genelist)
}
#' Remove Prefix Added When Making Gene Names Legal Variable Names
#'
#' \code{denormalize_genename()} removes the prefix
#' \code{\link{normalize_genename}()} adds, and make the gene names more
#' readable for humans.
#' @inheritParams normalize_genename
#'
#' @return a character vector containing converted gene names with prefix
#' removed
#' @export
#' @examples
#' gene <- normalize_genename("128up")
#' # [1] "gn_128up"
#' denormalize_genename(gene)
#' # [1] "128up"
denormalize_genename <- function(gene) {
name <- substr(start = 4, stop = nchar(gene), gene)
return(name)
}
#' Make an Reference for Gene Name Conversion
#'
#' \code{make_database()} loads gene name / ID conversion tables from online
#' databases (e.g., FlyBase) and a user-provided GTF file that is used to
#' annotate the features of the NGS dataset of interest (e.g., the GTF file
#' STAR or \code{cellranger count} uses), and makes conversion tables from the
#' loaded data.
#'
#' @inheritParams prepare_database
#' @return a list containing multiple coversion tables and meta data
#' @export
#'
#' @examples
#' dummypath <- system.file("extdata", "dummy.gtf",
#' package = "genofeatutil")
#' dmeldb <- make_database(species = "test",
#' gtf.path = dummypath)
make_database <- function(species = "dmel", gtf.path, version = NULL) {
db <- prepare_database(species = species,
gtf.path = gtf.path,
version = version)
# Generate conversion vectors for FBgn update
db <- generate_fbid_version_table(db)
# Generate conversion vectors from the synonym table
db <- generate_flybase_sym(db)
db <- generate_gene_mapping(db)
return(db)
}
#' Prepare an Unprocessed Database for Gene Name Conversion
#'
#' \code{prepare_database()} loads gene name / ID conversion tables from online
#' databases (e.g., FlyBase) and a user-provided GTF file that is used to
#' annotate the features of the NGS dataset of interest (e.g., the GTF file
#' STAR or \code{cellranger count} uses) into memory for later processing by
#' \code{\link{make_database}()}.
#'
#' @param species a character string describing the species of the genes.
#' (Options: "\code{dmel}")
#' @param gtf.path a character string containing the path to the GTF file
#' containing the input gene names (e.g., the GTF files used by
#' \code{cellranger count})
#' @param version a character specifying the version desired
#' (e.g., "FB2019_01".)
#'
#' @return a list containing data frames and character strings of meta data
prepare_database <- function(species = "dmel", gtf.path, version = NULL) {
if (!species %in% c("dmel", "test")) {
stop("prepare_database does not support ", species, " now.\n")
}
# Load GTF file (contains gene id and names)
id_mapping <- rtracklayer::import(gtf.path)
id_mapping <- as.data.frame(id_mapping)
id_mapping <- id_mapping[id_mapping$type == "gene", ]
# Load data from species-specific gene databases
if (species == "dmel") {
result <- fetch_flybase(version = version)
version <- result[["version"]]
}
# Load data from local if in test environment
if (species == "test") {
fbgnpath <- system.file("extdata", "dummy_fbgn.tsv",
package = "genofeatutil")
synopath <- system.file("extdata", "dummy_syno.tsv",
package = "genofeatutil")
result <- fetch_flybase(paths = c(fbgnpath,
synopath))
version <- "test"
}
# Keep gene id and gene name from the GTF file
gtf <- id_mapping[ , c("gene_id", "type", "gene_name")]
result[["gtf"]] <- gtf
result[["metadata"]] <- c("species" = species,
"date" = date(),
"GTF_path" = gtf.path,
"FlyBase_ver" = version)
return(result)
}
#' Generate Conversion Tables for Symbols and Aliases to FlyBase ID
#'
#' @param db a list generated by \code{\link{prepare_database}()}
#'
#' @return a list of 2 (a named vector that translate symbols to FB ids and
#' another that translates aliases to FB ids)
generate_flybase_sym <- function(db) {
# Generate a a named vector for conversion based on Flybase Synonym
# with which you can use alias/name to query FBid
# Usage: named_vector <- generate_flybase_sym([the tsv file])
# named_vector[*alias/name*] gives its corresponding flybase id
if (!"syno" %in% names(db)) {
stop(paste("The database list seems to be wrong. Please make sure that",
"you generated it by prepare_database() before using gene",
"name conversion functions.\n"))
}
fbsym <- db[["syno"]]
# Keep only Drosophila genes
fbsym <- fbsym[fbsym$organism_abbreviation == "Dmel", ]
fbsym <- fbsym[grepl("^FBgn", fbsym$`##primary_FBid`), ]
# Generate a named vector with *aliases as names* and
# *fbid as values*
alias_dict_list <- list()
for (row in seq(nrow(fbsym))) {
alias <- strsplit(fbsym[row, "symbol_synonym(s)"], ",")[[1]]
if (length(alias) > 0) {
fbid <- fbsym[row, "##primary_FBid"]
dict <- rep(fbid, length(alias))
names(dict) <- normalize_genename(alias)
alias_dict_list[[fbid]] <- dict
}
}
alias_dict <- unlist(alias_dict_list)
# Since unlist appends names of the list items to names of vector items
# For example, if ChAT is #10000 in the file, before unlist() the
# name:value would be ChAT:FBid
# After unlist it would be 10000.ChAT:FBid
# I'll remove the list item names (the "10000." part) for they disrupt
# mapping
names(alias_dict) <- gsub("FBgn[0-9]*\\.", "", names(alias_dict))
# Generate a named vector with *symbols as names* and
# *fbid as values*
symbol_dict <- fbsym$`##primary_FBid`
names(symbol_dict) <- normalize_genename(fbsym$current_symbol)
# Add conversion vectors into db and remove raw data
db[["symbol_dict"]] <- symbol_dict
db[["alias_dict"]] <- alias_dict
db <- db[names(db) != c("syno")]
return(db)
}
#' Generate a Conversion Vector for FlyBase IDs from Older Versions
#'
#' \code{generate_fbid_version_table()} takes the FBgn ID annotation table
#' from FlyBase, and prepare a conversion vector.
#' \code{generate_fbid_version_table()} takes a list generated by
#' \code{\link{prepare_database}()} and updates it by adding the conversion
#' vector and removing the raw FBgn ID annotation table.
#' @param db a list generated by \code{prepare_database()}
#'
#' @return a list without FBgn ID annotation table but with a derived
#' conversion vector
generate_fbid_version_table <- function(db) {
if (!"fbid" %in% names(db)) {
stop(paste("The database list seems to be wrong. Please make sure that",
"you generated it by prepare_database() before using gene",
"name conversion functions.\n"))
}
# Read the FBgn annotation table from FlyBase
id_table <- db[["fbid"]]
# Report the version used
version <- db[["metadata"]]["FlyBase_ver"]
message("Using FlyBase version: ", version, " to update FBgn.\n")
# Generate a conversion vector
query <- id_table$`secondary_FBgn#(s)`
names(query) <- id_table$`primary_FBgn#`
lookup <- lapply(names(query), function(x) {
old_id <- strsplit(query[[x]], split = ",")[[1]]
if (length(old_id) > 0) {
result <- rep(x, length(old_id))
names(result) <- old_id
return(result)
}
})
lookup <- lookup[sapply(lookup, function(x) !is.null(x))]
lookup <- unlist(lookup)
# Remove raw data and return db with the conversion vector
db[["id_dict"]] <- lookup
db <- db[names(db) != "fbid"]
return(db)
}
#' Generate a Mapping Vector to Translate Gene IDs
#'
#' \code{generate_gene_mapping()} takes gene names from a character vector, an
#' expression matrix / data frame, or a Seurat object, and generate a vector
#' that can be used to translate gene IDs to the gene names provided to
#' \code{generate_gene_mapping()}. The purpose of this translation is to make
#' sure different datasets are consistent in the gene names used during
#' analysis, and to thereby ensure comparsion between datasets or querying are
#' accurate.
#' @param db a list generated by \code{\link{prepare_database}()}
#' @param ... other arguments that are passed to \code{\link{get_gene_names}()}
#' (see below)
#' @inheritParams get_gene_names
#'
#' @return a named character vector, in which the values are gene names and
#' the names are gene ids
generate_gene_mapping <- function(db) {
# Check if db is legit
if (!"gtf" %in% names(db)) {
stop(paste("The database list seems to be wrong. Please make sure that",
"you generated it by prepare_database() before using gene",
"name conversion functions.\n"))
}
id_mapping <- db[["gtf"]]
# Update FBgn
if (db[["metadata"]]["species"] %in% c("dmel", "test")) {
if (!"id_dict" %in% names(db)) {
stop(paste("An id conversion table is required to update FlyBase gene",
"numbers. If you see this error message, it could be an error",
"in an internal function, generate_fbid_version_table().\n"))
}
id_mapping$gene_id <- update_fbgn(id_mapping$gene_id, db = db)
}
result <- id_mapping$gene_name
names(result) <- id_mapping$gene_id
db[["to_name_dict"]] <- result
db <- db[names(db) != "gtf"]
return(db)
}
#' Update FlyBase Gene Numbers from an Older Version
#'
#' \code{update_fbgn()} takes a character vector of FlyBase gene numbers and a
#' list from \code{\link{make_database}()}, and convert the vector according
#' to the list.
#' @param id a character vector containing FlyBase gene numbers
#' @param db a list from \code{make_database()}
#'
#' @return a character vector containing updated FlyBase gene numbers
#' @export
#'
#' @examples
#' dummypath <- system.file("extdata", "dummy.gtf",
#' package = "genofeatutil")
#' dmeldb <- make_database(species = "test",
#' gtf.path = dummypath)
#' update_fbgn("FBgn0000020", db = dmeldb)
update_fbgn <- function (id, db) {
# Convert out-dated FBid to current version
## Load lookup table
if ("id_dict" %in% names(db)) {
version_dict <- db[["id_dict"]]
} else {
stop(paste("The database list seems to be wrong. Please make sure that",
"you generated it by make_database() before using gene",
"name conversion functions.\n"))
}
if (!"symbol_dict" %in% names(db)) {
stop(paste("The database list seems to be wrong. Please make sure that",
"you generated it by make_database() before using gene",
"name conversion functions.\n"))
}
## Find FBid that need conversion
index_update <- id %in% names(version_dict)
## Check illegal FBids
if (!Reduce(`&`, index_update)) {
noconvert <- id[!index_update]
legal <- noconvert %in% db[["symbol_dict"]]
if (!Reduce(`&`, legal)) {
stop(paste("The following FlyBase gene number(s) are not found in",
"the database (", db[["metadata"]]["version"], "):",
paste(noconvert[!legal], sep = ", "), ".\n"))
}
}
## Convert with lookup table while keeping the order
result <- sapply(seq(length(id)), function (x) {
if (index_update[x]) {
return(version_dict[id[x]])
}
return(id[x])
})
result <- unname(result)
return(result)
}
#' Convert Gene Names to FlyBase Gene Numbers
#'
#' \code{convert_gene_to_fbgn()} takes a vector of gene names and query the
#' synonym table from FlyBase to convert them to a vector of FlyBase gene
#' numbers. Due to potential multiple mapping (one alias corresponding to many
#' FlyBase gene numbers, like Cha can be ChAT or Charaff), it's important to
#' note that when there's warning message, the vector
#' /code{convert_gene_to_fbgn()} returns will not retain the same order as the
#' input vector of gene names.
#'
#' @param genes a character vector containing gene names
#' @param db a list from \code{make_database()}
#'
#' @return a character vector containing FlyBase gene numbers
#' @export
#'
#' @examples
#' # Prepare the database (using dummy local data)
#' dummypath <- system.file("extdata", "dummy.gtf",
#' package = "genofeatutil")
#' testdb <- make_database(species = "test",
#' gtf.path = dummypath)
#'
#' # Convert gene names
#' query <- c("gene_1", "alias_2.1", "mt:dummy", "alias_3.3", "gene_21")
#' convert_gene_to_fbgn(query[1:4], db = testdb)
convert_gene_to_fbgn <- function(genes, db) {
# Separate gene names to 2 categories and convert to FBgn
# 1. Matching current symbol
# 2. Matching aliases
unordered <- FALSE
## Reading from the db
if (!"symbol_dict" %in% names(db) | !"alias_dict" %in% names(db)) {
stop(paste("The database list seems to be wrong. Please make sure that",
"you generated it by make_database() before using gene",
"name conversion functions.\n"))
}
symbol_dict <- db[["symbol_dict"]]
alias_dict <- db[["alias_dict"]]
## Remove mitochondrial genes
if (length(genes[grepl("^mt:", genes)]) > 0) {
unordered <- TRUE
}
genes <- genes[!grepl("^mt:", genes)]
genes <- normalize_genename(genes)
## Here I suppose that genes with names matching official symbols do
## not need conversion
index_symbol <- genes %in% names(symbol_dict)
genes[index_symbol] <- symbol_dict[genes[index_symbol]]
# Find the names that are not official symbols but are documented aliases
index_alias <-
!(genes %in% names(symbol_dict)) & (genes %in% names(alias_dict))
## Create warnings if there's multiple mapping for aliases
multialias <- vector()
for (item in genes[index_alias]) {
matching_num <- sum(names(alias_dict) == item)
if (matching_num > 1) {
unordered <- TRUE
warning(paste0("'", denormalize_genename(item),
"' is matched with multiple aliases. All the FBgn ",
"that correspond to it will be append to the end ",
"of the result.\n")
)
# Create a vector of the rest of the alias:FBgn pair a non-unique
# alias mapped to, and append that to the alias vector.
multimap_id <- alias_dict[names(alias_dict) == item][-1]
multialias <- append(x = multialias,
values = multimap_id)
}
}
genes[index_alias] <- alias_dict[genes[index_alias]]
genes <- c(genes, unname(multialias))
# Keep only the FBgn of official symbols and the aliases and drop uncoverted
# ones
unmapped <- genes[!(index_symbol | index_alias)]
genes <- genes[index_symbol | index_alias]
## Remind the user about non-convertible gene names
if (length(unmapped) > 0) {
unordered <- TRUE
unmapped <- paste(unmapped, collapse = ", ")
warning(paste("Please note the following genes are not found in",
"the database and won't be processed:",
denormalize_genename(unmapped), ".\n"))
}
if (unordered) {
message(paste("Please note that the conversion result of",
"convert_gene_to_fbgn might not retain the same",
"order of the vector of gene names that are converted.\n"))
}
return(genes)
}
#' Convert Gene Names or FlyBase Gene Numbers to Gene Names Specified in the
#' Database
#'
#' \code{convert_to_genename()} takes a character vector of gene names and
#' FlyBase gene numbers and uses the database list
#' \code{\link{make_database}()} generated to convert them to gene names that
#' are consistent with the version of the provided GTF file.
#'
#' @inheritParams distinguish_fbgn
#' @param db a list from \code{make_database()}
#' @param normalize a logical expression. If set as TRUE, it will call
#' \code{\link{normalize_genename}()} to make consistent the converted gene
#' names.
#' @param remove.dup a logical value to decide whether to remove duplication in
#' the input vector. The default is TRUE.
#'
#' @export
#' @examples
#' # Make databases from dummy data
#' dummypath <- system.file("extdata", "dummy.gtf",
#' package = "genofeatutil")
#' testdb <- make_database(species = "test",
#' gtf.path = dummypath)
#'
#' convert_to_genename("FBgn0000001", testdb)
convert_to_genename <- function(x, db, normalize = TRUE, remove.dup = TRUE) {
if (!"to_name_dict" %in% names(db)) {
stop(paste("The database list seems to be wrong. Please make sure that",
"you generated it by make_database() before using gene",
"name conversion functions.\n"))
}
if (length(x) == 0) {
stop(paste("There is no gene names or FlyBase gene",
"numbers provided to convert.\n"))
}
genename_index <- !distinguish_fbgn(x)
# Update FBgn
if (db[["metadata"]]["species"] %in% c("dmel", "test")) {
if (sum(!genename_index) > 0) {
x[!genename_index] <- update_fbgn(x[!genename_index], db = db)
}
}
if (length(which(genename_index)) > 0) {
# Convert gene names to FBgn
converted_names <- convert_gene_to_fbgn(x[genename_index], db)
if (length(converted_names) == length(x[genename_index])) {
# If there's no multiple mapping, it can be safely saved in the
# original vector without error
x[genename_index] <- converted_names
} else {
x <- c(x[!genename_index], converted_names)
warning(paste("Because there were multiple mappings of the aliases,",
"please note that the length and order of the output is",
"different from the input.\n"))
}
}
# Examine duplication
if (remove.dup) {
dup_all <- Reduce(`|`, duplicated(x))
if (dup_all) {
x <- unique(x)
warning(paste("There are duplications of genes in your input, and",
"duplicated items are removed. As a result, the order and",
"length of output won't be the same as the input.\n"))
}
}
# Convert to gene name
dict <- db[["to_name_dict"]]
mapped_index <- x %in% names(dict)
map_all <- Reduce(`&`, mapped_index)
if(map_all) {
result <- dict[x]
} else {
mapped <- x[mapped_index]
result <- dict[mapped]
warning(paste("The following FlyBase gene numbers are not found in the",
"GTF file you loaded:",
paste(x[!mapped_index], collapse = ", "), ".\n"))
}
if (normalize) {
result <- normalize_genename(result)
}
return(result)
}
#' Distinguish FlyBase Gene Numbers from Gene Names
#'
#' @param x a character vector containing genes and FlyBase gene numbers
#'
#' @return a logical vector
distinguish_fbgn <- function(x) {
if (!is.character(x)) {
stop(paste("distinguish_fbgn() takes only a character vector of FlyBase",
"gene numbers or gene names\n"))
}
FBgn_index <- grepl("^FBgn", x)
return(FBgn_index)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.