#' @title get annotation from GTF
#'
#' @description extract tx2gene and gene annotation file from gtf file
#'
#' @param gtf_file a gtf file, typically downloaded from ensembl(http://www.ensembl.org/info/data/ftp/index.html)
#'
#' @param species it must be one of hs, mm
#'
#' @param datasource dataSrouce, like "ensembl GRCh38.95"
#'
#' @param server define a server which biomaRt will use
#'
#' @param fp if NULL, adding fp info will be omitted, if suppied with a vector(Default), then it will be added.
#'
#' @import biomaRt
#'
#' @return a list with two items, tx2gene was used for conversion between transcript and gene, annotation was gene annotation info
#'
#' @export
#'
#' @examples
#' gtf <- "~/Desktop/Mus_musculus.GRCm38.95.gtf"
#' grcm38 <- gtf_to_annotation(gtf, "mm", "ensembl_GRCm38.95") %>% .[[2]] %>% head()
gtf_to_annotation <- function(gtf_file,
species="hs",
datasource="ensembl",
server="asia.ensembl.org",
fp=c("EGFP", "mCherry", "tdTomato", "ZsGreen",
"turboGFP", "mNeonGreen", "mScarleti")) {
if(!(species %in% c("hs", "mm"))) {
stop("Incorrect species parameter, please check it!")
}
if(species=="hs") {
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "hsapiens_gene_ensembl",
host=server, ssl.verifypeer=F)
attrs <- c("ensembl_gene_id", "external_gene_name", "hgnc_symbol", "entrezgene_id", "gene_biotype",
"description", "chromosome_name", "strand")
organism <- "Homo sapiens"
} else {
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "mmusculus_gene_ensembl",
host=server, ssl.verifypeer=F)
attrs <- c("ensembl_gene_id", "external_gene_name", "mgi_symbol", "entrezgene_id", "gene_biotype",
"description", "chromosome_name", "strand")
organism <- "Mus musculus"
}
cat("Prepare txdb object\n")
txdb <- GenomicFeatures::makeTxDbFromGFF(file=gtf_file, format="auto",
dataSource=datasource, organism=organism)
# AnnotationDbi::keytypes(txdb)
# AnnotationDbi::columns(txdb)
cat("Extract transcripts and gene ids\n")
txname <- AnnotationDbi::keys(txdb, keytype="TXNAME")
tx2gene <- AnnotationDbi::select(txdb, keys=txname,
columns=c("TXNAME", "GENEID"), keytype="TXNAME") %>%
dplyr::distinct(TXNAME, .keep_all=T) %>%
tibble::as_tibble()
# all.attrs <- listAttributes(mart) %>% as_tibble()
# all.filters <- listFilters(mart) %>% as_tibble()
# get attrs
# attrs to return
cat("Annotate genes from biomaRt \n")
annotation <- biomaRt::getBM(attributes=attrs, filters="ensembl_gene_id",
values=unique(tx2gene$GENEID), bmHeader=F, uniqueRows=T, mart=mart) %>%
dplyr::distinct(ensembl_gene_id, .keep_all=T) %>%
tibble::as_tibble()
# annotation$description <- sapply(annotation$description, function(x) strsplit(x, " [[]")[[1]][1],
# simplify=T, USE.NAMES=F)
# colnames(annotation)[7] <- "chromosome"
if(!is.null(fp)) {
## add custom fluorescence protein info
# fp <- c("EGFP","mCherry", "tdTomato")
cat("Add fluorescence protein info\n")
t2g.modify <- dplyr::tibble(TXNAME=fp, GENEID=fp)
tx2gene <- dplyr::bind_rows(tx2gene, t2g.modify)
if(species=="hs") {
annotation.modify <- tibble::tibble(ensembl_gene_id=fp,
external_gene_name=fp,
hgnc_symbol=fp,
entrezgene_id=rep(NA, length(fp)),
gene_biotype=rep("protein-coding", length(fp)),
description=rep("use defined protein", length(fp)),
chromosome_name=NA,
strand=NA
)
} else {
annotation.modify <- tibble::tibble(ensembl_gene_id=fp,
external_gene_name=fp,
mgi_symbol=fp,
entrezgene_id=rep(NA, length(fp)),
gene_biotype=rep("protein-coding", length(fp)),
description=rep("use defined protein", length(fp)),
chromosome_name=NA,
strand=NA
)
}
annotation <- dplyr::bind_rows(annotation, annotation.modify)
}
cat("Successfully created tx2gene and annotation files\n")
return(list(tx2gene=tx2gene, annotation=annotation))
}
# # # for first time use
#
# gtf_file <- "~/Desktop/Homo_sapiens.GRCh38.95.gtf"
# GRCh38 <- gtf_to_annotation(gtf_file, species="hs")
# GRCh38_tx2gene <- GRCh38$tx2gene
# GRCh38_annotation <- GRCh38$annotation
#
# gtf_file <- "~/Desktop/Mus_musculus.GRCm38.95.gtf"
# GRCm38 <- gtf_to_annotation(gtf_file, species="mm")
# GRCm38_tx2gene <- GRCm38$tx2gene
# GRCm38_annotation <- GRCm38$annotation
#
#
# usethis::use_data(GRCh38_tx2gene, GRCh38_annotation, internal=F, overwrite=T)
# usethis::use_data(GRCm38_tx2gene, GRCm38_annotation, internal=F, overwrite=T)
#
#
# writexl::write_xlsx(list(GRCh38_tx2gene=GRCh38_tx2gene, GRCh38_annotation=GRCh38_annotation,
# GRCm38_tx2gene=GRCm38_tx2gene, GRCm38_annotation=GRCm38_annotation),
# path="~/Desktop/annotation_GRC38_ensembl.95.xlsx")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.