#' Convert Ensembl IDs to gene symbols.
#'
#' Convert Ensembl IDs to gene symbols.
#'
#' Gene symbols with more than one Ensembl ID match are determined only from
#' genes present in the BbcSE object, so be sure it is unfiltered.
#'
#' \itemize{
#' \item Symbols that match multiple Ensembl IDs will be resolved by concatenating
#' the Ensembl ID and the gene symbol. Uses scater::uniquifyFeatureNames.
#' \item Genes with multiple possible symbols will be labelled as Ensembl ID.
#' \item Genes absent from OrgDb/Biomart will be labelled as Ensembl ID.
#' }
#'
#' @param x A BbcSE object.
#' @param orgdb A OrgDb object. Download the annotations for your species from
#' 'http://bioconductor.org/packages/release/BiocViews.html#___OrgDb'. Only
#' used if BMdataset="".
#' @param BMdataset character. The name of the BM dataset you want to use. Try
#' to use 'bbcRNA::searchBM("species_name")' to find the appropriate dataset
#' if needed.
#' @return A BbcSE object.
#' @export
#' @importFrom AnnotationDbi keys mapIds keytypes
#' @importFrom biomaRt getBM useMart useDataset listDatasets
ens2sym <- function(x, orgdb, BMdataset="") {
if(!is(x, "BbcSE")) stop("x is not a BbcSE object")
if(BMdataset==""){
message("Using OrgDb.")
message(paste0(nrow(x), " total genes in BbcSE object."))
if(!"ENSEMBL" %in% keytypes(orgdb)){
stop("This org.db does not have Ensembl IDs. Use biomart (set BMdataset option).")
}
# get Ensembl IDs present in the OrgDb
ens_genes <- AnnotationDbi::keys(orgdb, keytype="ENSEMBL")
# keep only genes present in BbcSE object
ens_genes <- ens_genes[ens_genes %in% rownames(x)]
# this returns a list and all matches will be returned for each ensembl gene
gene_syms <- AnnotationDbi::mapIds(orgdb,
keys = ens_genes,
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "list")
stopifnot(identical(length(ens_genes), length(gene_syms)))
# Remove Ensembl IDs with multiple symbols
## Count them
num_w_multi_sym <- sum(sapply(gene_syms, function(x){length(x) > 1}))
message(paste0(num_w_multi_sym,
" genes removed due to more than 1 matching symbol."))
## Remove them
gene_syms <- unlist(gene_syms[sapply(gene_syms, function(x){length(x) == 1})])
} else{
# select mart to use
ensembl <- biomaRt::useMart("ensembl")
# make sure dataset exists and print out version
ensembl_datasets <- biomaRt::listDatasets(mart=ensembl)
ensembl_dataset_to_use <- ensembl_datasets[ensembl_datasets$dataset==BMdataset, ]
if(identical(nrow(ensembl_dataset_to_use), 0)){
stop("BM dataset does not exist.")
} else{
stopifnot(nrow(ensembl_dataset_to_use) == 1) # make sure only one matching dataset
}
message(paste0("Using Biomart. Dataset version: ", ensembl_dataset_to_use$version))
message(paste0(nrow(x), " total genes in BbcSE object."))
# get the dataset from BM
ensembl <- biomaRt::useDataset(BMdataset, mart=ensembl)
ens_2_symbol <- biomaRt::getBM(attributes = c("ensembl_gene_id", "external_gene_name"), mart = ensembl)
# keep only genes present in BbcSE object
ens_2_symbol <- ens_2_symbol[ens_2_symbol$ensembl_gene_id %in% rownames(x), ]
# Make NA if there are multiple possible symbols
## Find Ensembl IDs with multiple symbols
ens_w_multi_sym <- unique(ens_2_symbol[duplicated(ens_2_symbol$ensembl_gene_id),
"ensembl_gene_id", drop=TRUE])
num_genes_b4_filt <- nrow(ens_2_symbol)
## Remove Ensembl IDs with multiple symbols
ens_2_symbol <- ens_2_symbol[!ens_2_symbol$ensembl_gene_id %in% ens_w_multi_sym, ]
message(paste0(num_genes_b4_filt-nrow(ens_2_symbol),
" genes removed due to more than 1 matching symbol."))
gene_syms <- ens_2_symbol$external_gene_name
names(gene_syms) <- ens_2_symbol$ensembl_gene_id
}
# function to uniquify gene names from Scater
uniquifyFeatureNames <- function (ID, names)
{
if (length(ID) != length(names)) {
stop("lengths of 'ID' and 'names' must be equal")
}
missing.name <- is.na(names)
names[missing.name] <- ID[missing.name]
dup.name <- names %in% names[duplicated(names)]
names[dup.name] <- paste0(names[dup.name], "_", ID[dup.name])
return(names)
}
message(paste0(length(gene_syms), " genes with symbols found."))
# uniquify names by concatenating Ensembl ID and gene symbol with a '_' if
# gene symbol is multi-matching
uniq_syms <- uniquifyFeatureNames(names(gene_syms), gene_syms)
message(paste0(sum(gene_syms!=uniq_syms), " genes with uniquified symbols."))
names(uniq_syms) <- names(gene_syms)
# genes in the BbcSE object absent from OrgDb/Biomart
missing_syms <- rownames(x)[!rownames(x) %in% names(uniq_syms)]
names(missing_syms) <- missing_syms
message(paste0(length(missing_syms), " genes either missing from database or filtered due to criteria described in documentation."))
#combine the genes present in OrgDb/Biomart with those absent
uniq_syms <- c(uniq_syms, missing_syms)
# keep only the genes present in the BbcSE object and order based on
# the genes in the BbcSE object
stopifnot(all(rownames(x) %in% names(uniq_syms)))
rowData(x)$uniq_syms <- uniq_syms[rownames(x)]
#rowData(x) <- cbind(rowData(x), uniq_syms = uniq_syms[rownames(x)])
validObject(x)
return(x)
}
###-----------------------------------------------------------------------------
#' Convert Ensembl IDs to Entrez IDs for gene enrichment analyses.
#'
#' Convert Ensembl IDs to Entrez IDs for gene enrichment analyses.
#'
#' Entrez IDs with more than one Ensembl ID match are determined only from genes
#' present in the BbcSE object, so be sure it is unfiltered.
#'
#' \itemize{
#' \item Entrez IDs that match multiple Ensembl IDs are NA
#' \item The first Entrez ID will be returned for genes with multiple possible
#' Entrez IDs.
#' \item Genes with no Entrez ID or absent from the database will be
#' NA. }
#'
#' @param x A BbcSE object.
#' @param orgdb A OrgDb object. Download the annotations for your species from
#' 'http://bioconductor.org/packages/release/BiocViews.html#___OrgDb'. Only
#' used if BMdataset="".
#' @param BMdataset character. The name of the BM dataset you want to use. Try
#' to use 'bbcRNA::searchBM("species_name")' to find the appropriate dataset
#' if needed.
#' @return A BbcSE object.
#' @export
#' @importFrom AnnotationDbi keys mapIds keytypes
#' @importFrom biomaRt getBM useMart useDataset listDatasets
ens2entrez <- function(x, orgdb, BMdataset="") {
if(!is(x, "BbcSE")) stop("x is not a BbcSE object")
if(BMdataset==""){
message("Using OrgDb.")
if(!"ENSEMBL" %in% keytypes(orgdb)){
stop("This org.db does not have Ensembl IDs. Use biomart (set BMdataset option).")
}
# get Ensembl IDs present in the OrgDb
ens_genes <- AnnotationDbi::keys(orgdb, keytype="ENSEMBL")
# keep only genes present in BbcSE object
ens_genes <- ens_genes[ens_genes %in% rownames(x)]
# get entrez ids
entrez_ids <- AnnotationDbi::mapIds(orgdb,
keys = ens_genes,
column = "ENTREZID",
keytype = "ENSEMBL",
multiVals = "first")
stopifnot(identical(length(ens_genes), length(entrez_ids)))
#names(entrez_ids) <- ens_genes
} else {
# select mart to use
ensembl <- biomaRt::useMart("ensembl")
# make sure dataset exists and print out version
ensembl_datasets <- biomaRt::listDatasets(mart=ensembl)
ensembl_dataset_to_use <- ensembl_datasets[ensembl_datasets$dataset==BMdataset, ]
if(identical(nrow(ensembl_dataset_to_use), 0)){
stop("BM dataset does not exist.")
} else{
stopifnot(nrow(ensembl_dataset_to_use) == 1) # make sure only one matching dataset
}
message(paste0("Using Biomart. Dataset version: ", ensembl_dataset_to_use$version))
# get the dataset from BM
ensembl <- biomaRt::useDataset(BMdataset, mart=ensembl)
#ens_2_symbol <- biomaRt::getBM(attributes = c("ensembl_gene_id", "external_gene_name"), mart = ensembl)
ens_2_entrez <- biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene_id"), mart = ensembl)
# keep only genes present in BbcSE object
ens_2_entrez <- ens_2_entrez[ens_2_entrez$ensembl_gene_id %in% rownames(x), ]
# take the first match if there are multiple Entrez matches for a particular Ensembl ID
ens_2_entrez <- ens_2_entrez[!duplicated(ens_2_entrez$ensembl_gene_id), ]
entrez_ids <- ens_2_entrez$entrezgene_id
names(entrez_ids) <- ens_2_entrez$ensembl_gene_id
}
message(paste0(sum(!rownames(x) %in% names(entrez_ids)),
" Ensembl IDs not in the database."))
# remove Entrez genes that match more than one Ensembl gene
dup_genes <- unique(entrez_ids[duplicated(entrez_ids)])
message(paste0(sum(entrez_ids %in% dup_genes),
" Ensembl IDs matching the same Entrez ID as another Ensembl ID."))
entrez_ids <- entrez_ids[!entrez_ids %in% dup_genes]
# make sure entrez ids unique
stopifnot(length(entrez_ids) == length(unique(entrez_ids)))
message(paste0(length(entrez_ids),
" genes with valid Entrez ID out of ",
nrow(x),
" total genes"))
# genes in the BbcSE object absent from OrgDb/Biomart or removed due to more
# than one Ensembl match
missing_genes <- rownames(x)[!rownames(x) %in% names(entrez_ids)]
names(missing_genes) <- missing_genes # store the ensembl ID as names
missing_genes[1:length(missing_genes)] <- NA # convert the ensembl IDs to NAs
#combine the genes present in OrgDb with those absent
entrez_ids <- c(entrez_ids, missing_genes)
# order based on the genes in the BbcSE object
rowData(x)$entrez_ids <- entrez_ids[rownames(x)]
#rowData(x) <- cbind(rowData(x), entrez_ids = entrez_ids[rownames(x)])
validObject(x)
return(x)
}
###-----------------------------------------------------------------------------
#' Search biomart datasets with keyword
#'
#' Search biomart datasets with keyword
#'
#' @param keyword regex to search for. Perl style.
#' @export
#' @importFrom biomaRt listDatasets
searchBM <- function(keyword){
ensembl <- useMart("ensembl")
bm_datasets <- biomaRt::listDatasets(ensembl)
cols_w_keyword <- grep(keyword, bm_datasets, perl = TRUE, ignore.case = TRUE)
rows_w_keyword_list <- lapply(cols_w_keyword, function(x){
grep(keyword, bm_datasets[[x]], perl = TRUE, ignore.case = TRUE)
})
rows_w_keyword <- unique(do.call(c, rows_w_keyword_list))
if(length(rows_w_keyword) == 0){
stop("Keyword not found.")
}
return(bm_datasets[rows_w_keyword, ])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.